Update model and add whitelist (#7)

noahjax · web-flow · commit fb44bcca287f · 2025-05-06T09:37:55.000-07:00
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
 .filter
+.venv
 __pycache__
 build/
 dist/
diff --git a/demo.ipynb b/demo.ipynb
@@ -31,6 +31,7 @@
     "    \"election results\",\n",
     "    \"presidential election polls 2024\",\n",
     "    \"what books do you recommend\",\n",
+    "    \"sf weather tomorrow\"\n",
     "]\n",
     "\n",
     "preds = query_filter.predict(queries)\n",
@@ -41,7 +42,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": ".filter",
+   "display_name": ".venv",
    "language": "python",
    "name": "python3"
   },
diff --git a/pyproject.toml b/pyproject.toml
@@ -3,7 +3,7 @@ authors = [
   {name = "Noah Jackson", email = "noah@trytako.com"},
 ]
 dependencies = [
-  "en-tako-query-filter @ https://huggingface.co/TakoData/en_tako_query_filter/resolve/0.0.1/en_tako_query_filter-any-py3-none-any.whl",
+  "en-tako-query-analyzer @ https://huggingface.co/TakoData/en_tako_query_analyzer/resolve/0.0.4/en_tako_query_analyzer-any-py3-none-any.whl",
   "ipykernel~=6.29.5",
   "jupyter~=1.1.1",
   "nbstripout~=0.7.1",
@@ -13,7 +13,7 @@ description = "Combines models to predict which queries Tako's API should handle
 name = "tako-query-filter"
 readme = "README.md"
 requires-python = ">=3.10.14,<3.13"
-version = "0.2.1"
+version = "0.3.0"
 
 [tool.setuptools.packages.find]
 include = ["tako_query_filter*"]
diff --git a/src/tako_query_filter/filter.py b/src/tako_query_filter/filter.py
@@ -1,19 +1,25 @@
+import hashlib
 import json
 import re
 from typing import Iterable, List
+
 import spacy
-import hashlib
+
 from tako_query_filter.keywords import keywords
+from tako_query_filter.whitelist import whitelist
 
 
 class TakoQueryFilter:
     def __init__(
         self,
         keyword_hashes: Iterable[str] = keywords,
+        whitelist_hashes: Iterable[str] = whitelist,
     ):
-        self.nlp = spacy.load("en_tako_query_filter")
+        self.nlp = spacy.load("en_tako_query_analyzer")
         self.keywords_hashes = set(keyword_hashes)
+        self.whitelist_hashes = set(whitelist_hashes)
         self.keyword_match_score = 0.9
+        self.whitelist_match_score = 0.8
 
     @classmethod
     def load_with_keywords(
@@ -38,30 +44,42 @@ def predict(
         queries: List[str],
     ) -> List[int]:
         probs = self.predict_proba(queries)
-        predictions = [1 if p > 0.5 else 0 for p in probs]
+        predictions = [1 if p > 0.3 else 0 for p in probs]
         return predictions
 
     def predict_proba(
         self,
         queries: List[str],
     ) -> List[float]:
-        preds = self.nlp.pipe(queries)
-
-        probs = []
-        for pred in preds:
-            accept = pred.cats["ACCEPT"]
-            reject = pred.cats["REJECT"]
-            # Just to be safe, normalize the probabilities
-            probs.append(accept / (accept + reject))
-
-        # Check keywords
-        for i, query in enumerate(queries):
-            split_query = self._split_query(query)
-            split_hashes = {self._hash_string(split) for split in split_query}
-            if any(split_hash in self.keywords_hashes for split_hash in split_hashes):
-                probs[i] = self.keyword_match_score
-
-        return probs
+        with self.nlp.select_pipes(enable=["tok2vec", "ner", "textcat_classify"]):
+            preds = self.nlp.pipe(queries)
+
+            probs = []
+            for pred in preds:
+                accept = pred.cats["ACCEPT"]
+                reject = pred.cats["REJECT"]
+                # Just to be safe, normalize the probabilities
+                probs.append(accept / (accept + reject))
+
+            # Check whitelist
+            for i, query in enumerate(queries):
+                split_query = query.lower().split()
+                if any(
+                    self._hash_string(split) in self.whitelist_hashes
+                    for split in split_query
+                ):
+                    probs[i] = self.whitelist_match_score
+
+            # Check keywords
+            for i, query in enumerate(queries):
+                split_query = self._split_query(query)
+                split_hashes = {self._hash_string(split) for split in split_query}
+                if any(
+                    split_hash in self.keywords_hashes for split_hash in split_hashes
+                ):
+                    probs[i] = self.keyword_match_score
+
+            return probs
 
     def _split_query(self, query: str) -> List[str]:
         split_keywords = ["vs", "vs.", "versus", "or", "and"]
diff --git a/src/tako_query_filter/keywords.py b/src/tako_query_filter/keywords.py
@@ -457286,5 +457286,5 @@
   "67dccfe9332640119fc7644463f5a40b",
   "44ef39be655ad5f53c05885d8959dbf3",
   "eaafc99a7ad0fa65d19d5cc7f76403bf",
-  "2092310ac8727c5cfedb4c2fcecac7f4"
+  "2092310ac8727c5cfedb4c2fcecac7f4",
 ]
diff --git a/src/tako_query_filter/whitelist.py b/src/tako_query_filter/whitelist.py
@@ -0,0 +1,10 @@
+whitelist = [
+    "aab92e69374e4c7b8c6741fe02e574b9",
+    "23678db5efde9ab46bce8c23a6d91b50",
+    "533c5ba8368075db8f6ef201546bd71a",
+    "f3639baeb4530db03ef930eb16073f61",
+    "2b93fbdf27d43547bec8794054c28e00",
+    "7e25b972e192b01004b62346ee9975a5",
+    "3811727de5b0ddf6ae30defe2ca4d2c2",
+    "559608508b42a01c1068fae4fcdc2aef",
+]

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`.filter`
	`2`	`+.venv`
`2`	`3`	`__pycache__`
`3`	`4`	`build/`
`4`	`5`	`dist/`
Original file line number	Diff line number	Diff line change
`@@ -457286,5 +457286,5 @@`
`457286`	`457286`	`"67dccfe9332640119fc7644463f5a40b",`
`457287`	`457287`	`"44ef39be655ad5f53c05885d8959dbf3",`
`457288`	`457288`	`"eaafc99a7ad0fa65d19d5cc7f76403bf",`
`457289`		`- "2092310ac8727c5cfedb4c2fcecac7f4"`
	`457289`	`+ "2092310ac8727c5cfedb4c2fcecac7f4",`
`457290`	`457290`	`]`