-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMetadataAnalyser.py
More file actions
134 lines (93 loc) · 3.34 KB
/
MetadataAnalyser.py
File metadata and controls
134 lines (93 loc) · 3.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import gensim
import logging
# Show gensim's progress output while the (slow) model load below runs.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
#Takes a long time to load
# Pretrained GoogleNews word2vec embeddings in binary word2vec format.
# NOTE(review): hard-coded local Windows path -- this script only runs on the
# author's machine as written.
model = gensim.models.KeyedVectors.load_word2vec_format(r'D:\MxM-Project\GoogleNews-vectors-negative300.bin', binary = True)
# View of all words the model knows; used for membership tests further down.
# NOTE(review): `model.vocab` is the gensim 3.x API; gensim 4+ renamed it to
# `model.key_to_index` -- confirm the installed gensim version.
vocab = model.vocab.keys()
#the word 'of' is not in the vocabulary
sentence = ["(London)", ".", "'", "/" , " \" " , "!" , "is", "the", "capital", "of" ,"Great", "Britain"]
# To maximize the number of words found in the vector model, strip extraneous
# punctuation (parentheses, quotes, etc.) from both ends of every token.
# All characters that should never begin or end a token.
_PUNCTUATION = "()[],?-.'/\"!"
# One str.strip per token handles any mix of these characters at either end.
# (The previous per-character loop missed characters that only became terminal
# after a later character was stripped: "x.!" came out as "x." instead of "x".)
sentence = [word.strip(_PUNCTUATION) for word in sentence]
print(sentence)
# Look up the embedding for every token. Tokens missing from the model's
# vocabulary get a zero vector so list positions still line up with `sentence`.
vectors = []
for w in sentence:
    if w in vocab:
        vectors.append(model[w])
    else:
        print("Word '{}' not in vocab".format(w))
        # Zero vector with the model's own dimensionality. The original
        # appended the 1-element list [0], which made `vectors` ragged and
        # unusable for any later stacking/matrix operations.
        vectors.append([0.0] * model.vector_size)
print(len(vectors))
#First we put all of the songs' titles, composers, etc into a single respective array
# The catalogue file holds one tab-separated record per line:
# "title<TAB>singer<TAB>album".
file = 'D:\\MxM-Project\\songFinal.txt'
songsWithAllInfo = []  # full [title, singer, album] record per song
songTitle = []
singerName = []
albumName = []
# Context manager guarantees the handle is closed even on error
# (the original open() was never closed -- a resource leak).
with open(file, 'r') as f:
    for song in f:
        record = song.rstrip("\n").split("\t")
        songTitle.append(record[0])
        singerName.append(record[1])
        albumName.append(record[2])
        songsWithAllInfo.append(record)
#songsWithAllInfo[song][category]
'''
Categories:
1. Song Title
2. Singer
3. Album (May be same as song title if it is a single)
'''
# --- Disabled exploratory snippets (kept for reference) ---
#print(model.most_similar(positive=['woman', 'king'], negative=['man']))
#print(model.doesnt_match("breakfast cereal dinner lunch".split()))
#print(model.similarity('woman', 'man'))
#vocab = model.vocab.keys()
#model.score(["The fox jumped over a lazy dog".split()])
# The chunk below is disabled by wrapping it in a bare string literal.
# NOTE(review): if re-enabled as-is it would fail on Python 3: `vocab` is a
# dict-keys view and does not support `vocab[i]` indexing, and
# .encode('UTF-8') + '\n' concatenates bytes with str (TypeError).
'''
fileNum = 1
wordsInVocab = len(vocab)
wordsPerFile = int(100E3)
# Write out the words in 100k chunks.
for wordIndex in range(0, wordsInVocab, wordsPerFile):
# Write out the chunk to a numbered text file.
with open("vocabulary/vocabulary_%.2d.txt" % fileNum, 'w') as f:
# For each word in the current chunk...
for i in range(wordIndex, wordIndex + wordsPerFile):
# Write it out and escape any unicode characters.
f.write(vocab[i].encode('UTF-8') + '\n')
fileNum += 1
'''