-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMetadataAnalyser.py
More file actions
134 lines (93 loc) · 3.34 KB
/
MetadataAnalyser.py
File metadata and controls
134 lines (93 loc) · 3.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import gensim
import logging
# Show gensim's progress output while the (slow) model load below runs.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
#Takes a long time to load
# Pretrained GoogleNews word2vec embeddings in binary word2vec format.
# NOTE(review): hard-coded local Windows path -- this script only runs on the
# author's machine as written.
model = gensim.models.KeyedVectors.load_word2vec_format(r'D:\MxM-Project\GoogleNews-vectors-negative300.bin', binary = True)
# View of all words the model knows; used for membership tests further down.
# NOTE(review): `model.vocab` is the gensim 3.x API; gensim 4+ renamed it to
# `model.key_to_index` -- confirm the installed gensim version.
vocab = model.vocab.keys()
#the word 'of' is not in the vocabulary
sentence = ["(London)", ".", "'", "/" , " \" " , "!" , "is", "the", "capital", "of" ,"Great", "Britain"]
# To maximize the number of words found in the vector model, strip extraneous
# punctuation (parentheses, quotes, etc.) from both ends of every token.
# All characters that should never begin or end a token.
_PUNCTUATION = "()[],?-.'/\"!"
# One str.strip per token handles any mix of these characters at either end.
# (The previous per-character loop missed characters that only became terminal
# after a later character was stripped: "x.!" came out as "x." instead of "x".)
sentence = [word.strip(_PUNCTUATION) for word in sentence]
print(sentence)
# Look up the embedding for every token. Tokens missing from the model's
# vocabulary get a zero vector so list positions still line up with `sentence`.
vectors = []
for w in sentence:
    if w in vocab:
        vectors.append(model[w])
    else:
        print("Word '{}' not in vocab".format(w))
        # Zero vector with the model's own dimensionality. The original
        # appended the 1-element list [0], which made `vectors` ragged and
        # unusable for any later stacking/matrix operations.
        vectors.append([0.0] * model.vector_size)
print(len(vectors))
#First we put all of the songs' titles, composers, etc into a single respective array
# The catalogue file holds one tab-separated record per line:
# "title<TAB>singer<TAB>album".
file = 'D:\\MxM-Project\\songFinal.txt'
songsWithAllInfo = []  # full [title, singer, album] record per song
songTitle = []
singerName = []
albumName = []
# Context manager guarantees the handle is closed even on error
# (the original open() was never closed -- a resource leak).
with open(file, 'r') as f:
    for song in f:
        record = song.rstrip("\n").split("\t")
        songTitle.append(record[0])
        singerName.append(record[1])
        albumName.append(record[2])
        songsWithAllInfo.append(record)
#songsWithAllInfo[song][category]
'''
Categories:
1. Song Title
2. Singer
3. Album (May be same as song title if it is a single)
'''
# --- Disabled exploratory snippets (kept for reference) ---
#print(model.most_similar(positive=['woman', 'king'], negative=['man']))
#print(model.doesnt_match("breakfast cereal dinner lunch".split()))
#print(model.similarity('woman', 'man'))
#vocab = model.vocab.keys()
#model.score(["The fox jumped over a lazy dog".split()])
# The chunk below is disabled by wrapping it in a bare string literal.
# NOTE(review): if re-enabled as-is it would fail on Python 3: `vocab` is a
# dict-keys view and does not support `vocab[i]` indexing, and
# .encode('UTF-8') + '\n' concatenates bytes with str (TypeError).
'''
fileNum = 1
wordsInVocab = len(vocab)
wordsPerFile = int(100E3)
# Write out the words in 100k chunks.
for wordIndex in range(0, wordsInVocab, wordsPerFile):
# Write out the chunk to a numbered text file.
with open("vocabulary/vocabulary_%.2d.txt" % fileNum, 'w') as f:
# For each word in the current chunk...
for i in range(wordIndex, wordIndex + wordsPerFile):
# Write it out and escape any unicode characters.
f.write(vocab[i].encode('UTF-8') + '\n')
fileNum += 1
'''