VectorSearch-1/progenerate.py at master · SdDevX/VectorSearch-1 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# Generate Embeddings #

# Import Packages
import requests
import openai
from mongo import collection
import env as e

# Get Env Variables
# All settings are read once, at import time, from the project-local `env`
# module (imported above as `e`).
hf_token = e.hf_token  # sent as the Bearer token on Hugging Face requests
hf_embedding_url = e.hf_embedding_url  # URL the Hugging Face request is POSTed to
openai_api_key = e.openai_api_key
openai_model = e.openai_model  # model name passed to openai.embeddings.create
model = e.model  # backend selector: "hf" -> Hugging Face, anything else -> OpenAI
field = e.field  # name of the document field that receives the embedding

# Set OpenAI API Key
# Configures the openai module globally so later openai.* calls are authenticated.
openai.api_key = openai_api_key

# Extract Value Helper Function
def extract_value(doc, path, default="NA"):
    """Return the value found at a dotted *path* inside a nested structure.

    Each segment of *path* (split on ``.``) is applied in turn: as a key when
    the current value is a dict, or as a zero-based index when the current
    value is a list and the segment is a decimal number.

    Args:
        doc: The dict (or list) to walk. It is not modified.
        path: Dotted path such as ``"imdb.rating"`` or ``"cast.0"``.
        default: Value returned when the path cannot be fully resolved.

    Returns:
        The value at *path*, or *default* when a key is missing, an index is
        out of range, or a segment does not fit the current value's type.
    """
    current = doc
    for key in path.split('.'):
        if isinstance(current, dict):
            if key not in current:
                return default
            current = current[key]
        # isdecimal() rather than isdigit(): isdigit() also accepts characters
        # such as superscript digits, which int() rejects with ValueError.
        elif isinstance(current, list) and key.isdecimal():
            index = int(key)
            # A decimal string has no sign, so only the upper bound can fail.
            if index >= len(current):
                return default
            current = current[index]
        else:
            return default
    return current


# Object Counter
# Module-level tally of documents serialized so far; serialize_document
# increments and prints it as a progress indicator.
counter = 0


# Join List Field Helper Function
def _join_values(doc, path) -> str:
    """Return the field at *path* as a comma-separated string.

    A missing or ``None`` field yields ``"NA"``. List items are converted
    with ``str()`` so non-string items cannot make ``str.join`` raise, and a
    scalar value is returned as its own string instead of being split into
    characters.
    """
    values = extract_value(doc, path, ['NA'])
    if values is None:
        return 'NA'
    if isinstance(values, (list, tuple)):
        return ', '.join(str(value) for value in values)
    return str(values)


# Serialize Document Function
def serialize_document(doc) -> str:
    """Render a document as a labelled, multi-line text block.

    Every known field is looked up with ``extract_value``; fields that are
    absent are rendered as ``NA``. The result is the text that is later
    passed to the embedding model.

    Side effects:
        Increments the module-level ``counter`` and prints its new value.

    Args:
        doc: The document (a nested dict) to serialize.

    Returns:
        The text representation with surrounding whitespace stripped. Lines
        after the first keep the four-space indent of the template.
    """
    # Serialize all fields of the document
    text_representation = f"""
    _id: {extract_value(doc, '_id', 'NA')}
    Plot: {extract_value(doc, 'plot', 'NA')}
    Genres: {_join_values(doc, 'genres')}
    Runtime: {extract_value(doc, 'runtime', 'NA')} minutes
    Rated: {extract_value(doc, 'rated', 'NA')}
    Cast: {_join_values(doc, 'cast')}
    Num Mflix Comments: {extract_value(doc, 'num_mflix_comments', 'NA')}
    Poster: {extract_value(doc, 'poster', 'NA')}
    Title: {extract_value(doc, 'title', 'NA')}
    Full Plot: {extract_value(doc, 'fullplot', 'NA')}
    Languages: {_join_values(doc, 'languages')}
    Released: {extract_value(doc, 'released', 'NA')}
    Directors: {_join_values(doc, 'directors')}
    Writers: {_join_values(doc, 'writers')}
    Awards Wins: {extract_value(doc, 'awards.wins', 'NA')}
    Awards Nominations: {extract_value(doc, 'awards.nominations', 'NA')}
    Awards Text: {extract_value(doc, 'awards.text', 'NA')}
    Last Updated: {extract_value(doc, 'lastupdated', 'NA')}
    Year: {extract_value(doc, 'year', 'NA')}
    IMDB Rating: {extract_value(doc, 'imdb.rating', 'NA')}
    IMDB Votes: {extract_value(doc, 'imdb.votes', 'NA')}
    IMDB ID: {extract_value(doc, 'imdb.id', 'NA')}
    Countries: {_join_values(doc, 'countries')}
    Type: {extract_value(doc, 'type', 'NA')}
    Tomatoes Viewer Rating: {extract_value(doc, 'tomatoes.viewer.rating', 'NA')}
    Tomatoes Viewer Reviews: {extract_value(doc, 'tomatoes.viewer.numReviews', 'NA')}
    Tomatoes Critic Rating: {extract_value(doc, 'tomatoes.critic.rating', 'NA')}
    Tomatoes Critic Reviews: {extract_value(doc, 'tomatoes.critic.numReviews', 'NA')}
    Tomatoes Consensus: {extract_value(doc, 'tomatoes.consensus', 'NA')}
    Tomatoes Rotten: {extract_value(doc, 'tomatoes.rotten', 'NA')}
    Tomatoes Fresh: {extract_value(doc, 'tomatoes.fresh', 'NA')}
    Tomatoes Last Updated: {extract_value(doc, 'tomatoes.lastUpdated', 'NA')}
    Production: {extract_value(doc, 'tomatoes.production', 'NA')}
    """

    # Remove leading and trailing whitespaces
    text = text_representation.strip()

    # Print Object Counter
    global counter
    counter += 1
    print(counter)

    # Return Text Representation
    return text


# Generate Embedding Function
def generate_embedding(text: str, timeout: float = 60.0) -> list[float]:
    """Return an embedding for *text* from the configured backend.

    The module-level ``model`` setting selects the backend: ``"hf"`` posts
    the text to ``hf_embedding_url``; any other value uses the OpenAI
    embeddings API with ``openai_model``.

    Args:
        text: The text to embed.
        timeout: Seconds to wait for the Hugging Face HTTP request before
            giving up. Without a timeout ``requests`` can block indefinitely
            on an unresponsive endpoint. Not applied to the OpenAI branch.

    Returns:
        For Hugging Face, the decoded JSON body of the response, returned
        as-is. For OpenAI, the ``embedding`` of the first item in the response.

    Raises:
        ValueError: If the Hugging Face endpoint answers with a status code
            other than 200.
        requests.exceptions.Timeout: If the Hugging Face request exceeds
            *timeout*.
    """
    # Using OpenAI Model
    if model != "hf":
        # Generate Embedding Response
        response = openai.embeddings.create(model=openai_model, input=text)

        # Return Embedding Response
        return response.data[0].embedding

    # Using Hugging Face Model
    # Generate Embedding Response
    response = requests.post(
        hf_embedding_url,
        headers={"Authorization": f"Bearer {hf_token}"},
        json={"inputs": text},
        timeout=timeout,
    )

    # Check Response Status Code
    if response.status_code != 200:
        raise ValueError(
            f"Request failed with status code {response.status_code}: {response.text}"
        )

    # Return Embedding Response
    return response.json()


# Add Embedding To Collection Function
def add_embedding(limit: int = 50):
    """Generate an embedding for each document and store it on the document.

    Each fetched document is serialized with ``serialize_document``, embedded
    with ``generate_embedding``, and updated in place so that the configured
    ``field`` holds the embedding.

    Args:
        limit: Maximum number of documents to process. Defaults to 50, the
            previously hard-coded value. The value is passed straight to the
            cursor's ``limit()``.
    """
    # Get Documents
    documents = collection.find({"_id": {"$exists": True}}).limit(limit)

    # Loop through each document
    for document in documents:
        # Generate Embedding for the serialized document
        embedding = generate_embedding(serialize_document(document))

        # Update the document with the generated embedding
        collection.update_one(
            {"_id": document["_id"]},
            {"$set": {field: embedding}},
        )

    # Print Success Message
    print("Embeddings added successfully!")