-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprogenerate.py
More file actions
144 lines (122 loc) · 4.78 KB
/
progenerate.py
File metadata and controls
144 lines (122 loc) · 4.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# Generate Embeddings #
# Import Packages
import requests
import openai
from mongo import collection
import env as e
# Get Env Variables
# Bearer token sent in the Authorization header of the Hugging Face request
hf_token = e.hf_token
# URL the Hugging Face embedding request is POSTed to
hf_embedding_url = e.hf_embedding_url
# API key assigned to the openai module below
openai_api_key = e.openai_api_key
# Model name passed to openai.embeddings.create
openai_model = e.openai_model
# Backend selector: "hf" uses the Hugging Face endpoint, any other value uses OpenAI
model = e.model
# Name of the document field the generated embedding is written to
field = e.field
# Set OpenAI API Key
openai.api_key = openai_api_key
# Extract Value Helper Function
def extract_value(doc, path, default="NA"):
    """Return the value found at a dot-separated ``path`` inside ``doc``.

    Each path segment is applied in turn: it is looked up as a key when the
    current value is a dict, or as a zero-based index when the current value
    is a list and the segment is a decimal number.

    Args:
        doc: The dict (or list) to walk.
        path: Dot-separated segments, e.g. ``"tomatoes.viewer.rating"``.
        default: Value returned when any segment cannot be resolved.

    Returns:
        The resolved value, or ``default`` when a key is missing, an index
        is out of range, or a non-container value is reached before the
        path is exhausted. A value that is present but ``None`` is returned
        as ``None``, not as ``default``.
    """
    current = doc
    for key in path.split("."):
        if isinstance(current, dict):
            if key not in current:
                return default
            current = current[key]
        # isdecimal() only accepts characters int() can parse; isdigit()
        # also accepts e.g. "²", which would make int() raise ValueError.
        elif isinstance(current, list) and key.isdecimal():
            index = int(key)
            if not 0 <= index < len(current):
                return default
            current = current[index]
        else:
            # Scalar reached, or a non-numeric segment applied to a list
            return default
    return current
# Object Counter
# Module-level count of serialized documents; serialize_document() increments and prints it on every call
counter = 0
# Serialize Document Function
def serialize_document(doc) -> str:
    """Render selected fields of ``doc`` as newline-separated ``Label: value`` text.

    Fields are read with ``extract_value``; a missing field is rendered as
    ``NA``. List-valued fields (genres, cast, languages, directors, writers,
    countries) are rendered as a comma-separated string.

    Side effects:
        Increments the module-level ``counter`` and prints its new value.

    Args:
        doc: The document (dict) to serialize.

    Returns:
        The text representation with leading and trailing whitespace removed.
    """
    global counter

    def get(path):
        # Scalar field, or 'NA' when the path cannot be resolved
        return extract_value(doc, path, 'NA')

    def joined(path):
        # List field as "a, b, c". The original ', '.join(...) raised
        # TypeError on a null/scalar field or on non-string items, and split
        # a plain string into characters; handle those shapes explicitly.
        value = extract_value(doc, path, ['NA'])
        if isinstance(value, (list, tuple)):
            return ', '.join(str(item) for item in value)
        return 'NA' if value is None else str(value)

    # Serialize all fields of the document
    lines = [
        f"_id: {get('_id')}",
        f"Plot: {get('plot')}",
        f"Genres: {joined('genres')}",
        f"Runtime: {get('runtime')} minutes",
        f"Rated: {get('rated')}",
        f"Cast: {joined('cast')}",
        f"Num Mflix Comments: {get('num_mflix_comments')}",
        f"Poster: {get('poster')}",
        f"Title: {get('title')}",
        f"Full Plot: {get('fullplot')}",
        f"Languages: {joined('languages')}",
        f"Released: {get('released')}",
        f"Directors: {joined('directors')}",
        f"Writers: {joined('writers')}",
        f"Awards Wins: {get('awards.wins')}",
        f"Awards Nominations: {get('awards.nominations')}",
        f"Awards Text: {get('awards.text')}",
        f"Last Updated: {get('lastupdated')}",
        f"Year: {get('year')}",
        f"IMDB Rating: {get('imdb.rating')}",
        f"IMDB Votes: {get('imdb.votes')}",
        f"IMDB ID: {get('imdb.id')}",
        f"Countries: {joined('countries')}",
        f"Type: {get('type')}",
        f"Tomatoes Viewer Rating: {get('tomatoes.viewer.rating')}",
        f"Tomatoes Viewer Reviews: {get('tomatoes.viewer.numReviews')}",
        f"Tomatoes Critic Rating: {get('tomatoes.critic.rating')}",
        f"Tomatoes Critic Reviews: {get('tomatoes.critic.numReviews')}",
        f"Tomatoes Consensus: {get('tomatoes.consensus')}",
        f"Tomatoes Rotten: {get('tomatoes.rotten')}",
        f"Tomatoes Fresh: {get('tomatoes.fresh')}",
        f"Tomatoes Last Updated: {get('tomatoes.lastUpdated')}",
        f"Production: {get('tomatoes.production')}",
    ]
    # Remove leading and trailing whitespaces
    text = "\n".join(lines).strip()
    # Print Object Counter
    counter += 1
    print(counter)
    # Return Text Representation
    return text
# Generate Embedding Function
def generate_embedding(text: str) -> list[float]:
    """Generate an embedding for ``text`` with the configured backend.

    When the module-level ``model`` equals ``"hf"``, ``text`` is POSTed to
    ``hf_embedding_url`` with ``hf_token`` as a bearer token and the decoded
    JSON body is returned as-is. For any other ``model`` value the OpenAI
    embeddings API is called with ``openai_model`` and the embedding of the
    first result is returned.

    Args:
        text: The text to embed.

    Returns:
        The embedding produced by the selected backend.

    Raises:
        ValueError: If the Hugging Face endpoint answers with a status other
            than 200.
        requests.exceptions.Timeout: If the Hugging Face endpoint does not
            respond within the request timeout.
    """
    # Using Hugging Face Model
    if model == "hf":
        # requests has no default timeout; without one a stalled connection
        # would block the whole embedding run indefinitely.
        request_timeout_seconds = 60
        # Generate Embedding Response
        response = requests.post(
            hf_embedding_url,
            headers={"Authorization": f"Bearer {hf_token}"},
            json={"inputs": text},
            timeout=request_timeout_seconds,
        )
        # Check Response Status Code
        if response.status_code != 200:
            raise ValueError(
                f"Request failed with status code {response.status_code}: {response.text}"
            )
        # Return Embedding Response
        return response.json()
    # Using OpenAI Model
    response = openai.embeddings.create(model=openai_model, input=text)
    # Return Embedding Response
    return response.data[0].embedding
# Add Embedding To Collection Function
def add_embedding(limit: int = 50):
    """Generate and store an embedding for documents in ``collection``.

    Each fetched document is serialized with ``serialize_document``, embedded
    with ``generate_embedding``, and updated in place so that the
    module-level ``field`` holds the embedding. A success message is printed
    once all fetched documents have been processed.

    Args:
        limit: Maximum number of documents to process; passed straight to
            the cursor's ``limit()``. Defaults to 50, the previously
            hard-coded value. (Per the PyMongo docs a limit of 0 means
            "no limit".)
    """
    # Get Documents
    documents = collection.find({"_id": {"$exists": True}}).limit(limit)
    # Loop through each document
    for document in documents:
        # Serialize the entire document
        serialized_document = serialize_document(document)
        # Generate Embedding for the serialized document
        embedding = generate_embedding(serialized_document)
        # Update the document with the generated embedding
        collection.update_one(
            {"_id": document["_id"]},
            {"$set": {field: embedding}},
        )
    # Print Success Message
    print("Embeddings added successfully!")