-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathqdrant_cache.py
More file actions
126 lines (104 loc) · 4.63 KB
/
qdrant_cache.py
File metadata and controls
126 lines (104 loc) · 4.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore
import json
from llama_index.core.schema import Document
from typing import Optional, Dict, Any
class QdrantQueryCache:
    """Semantic cache mapping natural-language queries to previously generated
    SQL, results, and chart data, backed by a Qdrant vector store via LlamaIndex."""

    def __init__(self, embed_model, vector_store):
        """Initialize the cache on top of an existing Qdrant vector store.

        Args:
            embed_model: Embedding model used to embed queries for similarity search.
            vector_store: An initialized QdrantVectorStore backing the index.
        """
        self.embedding = embed_model
        # Wrap the vector store in a storage context so inserted nodes are
        # persisted to Qdrant rather than an in-memory store.
        self.storage_context = StorageContext.from_defaults(
            vector_store=vector_store
        )
        # Start from an empty index; entries are added via store_query_mapping().
        self.index = VectorStoreIndex(
            [],
            storage_context=self.storage_context,
            embed_model=self.embedding
        )

    def store_query_mapping(
        self,
        query: str,
        sql_query: str,
        result: str,
        chart_data: Dict[str, Any]
    ) -> None:
        """Store a query, its SQL, its result, and chart data in the vector store.

        Args:
            query: Original natural language query.
            sql_query: Generated SQL query.
            result: Query execution result.
            chart_data: Visualization data dictionary.
        """
        # The document text is what gets embedded for retrieval; the metadata
        # carries the full cached payload that is returned on a cache hit.
        node = Document(
            text=f"""
            Query: {query}
            SQL: {sql_query}
            Result: {result}
            """,
            metadata={
                "original_query": query,
                "sql_query": sql_query,
                "result": result,
                "chart_data": chart_data
            }
        )
        self.index.insert_nodes([node])

    def retrieve_similar_query(self, query: str, top_k: int = 1, llm=None) -> Optional[Dict]:
        """Retrieve a cached query semantically equivalent to the current one.

        Vector similarity produces candidate cached queries; the given LLM then
        judges whether a candidate is close enough to safely reuse its SQL/result.

        Args:
            query: Current user query.
            top_k: Number of top similar queries to retrieve as candidates.
            llm: LLM used for the similarity judgement. Required for a cache
                hit; when None this method always reports a miss.

        Returns:
            Metadata dict of the matching cached query when both the LLM's
            similarity score and confidence are >= 0.9, otherwise None.
        """
        # No query to match, or no LLM to judge with: treat as a cache miss
        # instead of letting llm.complete() raise inside the loop below.
        if query is None or llm is None:
            return None

        retriever = self.index.as_retriever(similarity_top_k=top_k)
        retrieved_nodes = retriever.retrieve(query)
        if not retrieved_nodes:
            return None

        # Ask the LLM to score each candidate; accept the first strong match.
        for node in retrieved_nodes:
            similarity_prompt = f"""
            Evaluate the similarity between these two queries:
            Current Query: "{query}"
            Cached Query: "{node.metadata['original_query']}"
            Assess their semantic similarity and relevance. Consider:
            1. Intent and purpose of the queries
            2. If the underlying data requirements change, the scores should be low since the output will change
            3. Potential SQL generation differences resulting in low scores
            Respond with a JSON with the following fields:
            - SimilarityScore (0-1): How semantically similar are the queries?
            - Confidence (0-1): How confident are you in your similarity assessment?
            - Reason: Reason for the corresponding Similarity score and confidence score in a list of strings.
            """
            try:
                response = llm.complete(similarity_prompt)
                response_text = response.text if hasattr(response, 'text') else str(response)
                print("Similarity Index: \n", response_text)
                # LLMs frequently wrap JSON answers in markdown code fences
                # (```json ... ```); strip them so a well-formed answer is
                # not rejected by json.loads.
                cleaned = response_text.strip()
                if cleaned.startswith("```"):
                    cleaned = cleaned.strip("`").strip()
                    if cleaned.lower().startswith("json"):
                        cleaned = cleaned[4:].strip()
                similarity_data = json.loads(cleaned)
                similarity_score = similarity_data["SimilarityScore"]
                confidence = similarity_data["Confidence"]
                # Reuse the cached result only when both signals are strong.
                if similarity_score >= 0.9 and confidence >= 0.9:
                    return node.metadata
            except Exception as e:
                # Best-effort cache: a malformed LLM response or transport
                # failure just means a miss for this candidate, not an error.
                print(f"Error evaluating query similarity: {e}")
        return None