"""Module for checking text for badwords."""

from __future__ import annotations

import os
import re
from difflib import SequenceMatcher
from pathlib import Path
from typing import Dict, List, Self, Set

from .exceptions import NotSupportedLanguage

612class ProfanityFilter :
7- """
8- A class for filtering profanity from text.
9- """
13+ """A class for filtering profanity from text."""
1014
11- def __init__ (self , languages : List [str ] = None , all_languages : bool = False ):
12- """
13- Initialize the profanity filter.
15+ async def init (self : Self ,
16+ languages : list [str ] | None = None ,
17+ ) -> None :
18+ """Initialize the profanity filter.
1419
1520 :param languages: List of languages to load profanity words for.
1621 :param all_languages: Flag to load profanity words for all available languages.
1722 """
18- self .script_dir = os .path .dirname (os .path .realpath (__file__ ))
19- self .language_files : Dict [str , str ] = self .initialize_language_files ()
20- self .languages = languages or list (self .language_files .keys ()) if all_languages else languages
21- self .bad_words : Dict [str , Set [str ]] = self .initialize_bad_words ()
22- self .patterns : Dict [str , re .Pattern ] = self .compile_patterns ()
23- self .custom_bad_words : Set [str ] = set ()
24-
25- def initialize_language_files (self ) -> Dict [str , str ]:
26- """
27- Initialize language files.
23+ self .resource_dir = Path (__file__ ).parent / "resource"
24+
25+ self .language_files = await self .initialize_language_files ()
26+
27+ if languages :
28+ if all (i in self .language_files for i in languages ):
29+ self .language_files = languages
30+ else :
31+ raise NotSupportedLanguage
32+
33+ self .bad_words = await self .initialize_bad_words ()
34+
35+ async def initialize_language_files (self : Self ) -> list [str ]:
36+ """Initialize language files.
2837
2938 :return: Dictionary mapping language names to file paths.
3039 """
31- resource_dir = os .path .join (self .script_dir , 'resource' )
32- return {os .path .splitext (filename )[0 ]: os .path .join (resource_dir , filename ) for filename in os .listdir (resource_dir )}
40+ return [str (path )[- 6 :- 4 ] for path in (self .resource_dir ).iterdir ()]
3341
34- def initialize_bad_words (self ) -> Dict [str , Set [str ]]:
35- """
36- Initialize profanity words for each language.
42+ async def initialize_bad_words (self : Self ) -> set [str ]:
43+ """Initialize profanity words for each language.
3744
3845 :return: Dictionary mapping language names to sets of profanity words.
3946 """
40- bad_words = {}
41- for language in self .languages :
42- file_path = self .language_files .get (language )
43- if file_path :
44- with open (file_path , 'r' , encoding = 'utf-8' ) as file :
45- bad_words [language ] = {line .strip () for line in file }
46- return bad_words
47+ bad_words = set ()
4748
48- def compile_patterns ( self ) -> Dict [ str , re . Pattern ] :
49- """
50- Compile regular expression patterns for profanity words.
49+ for lang in self . language_files :
50+ with ( self . resource_dir / f" { lang } .bdw" ). open ( encoding = "utf-8" ) as f :
51+ bad_words . update ( f . read (). split ())
5152
52- :return: Dictionary mapping language names to compiled regex patterns.
53- """
54- return {language : re .compile (r'\b(?:' + '|' .join (map (re .escape , words )) + r')\b' , re .IGNORECASE ) for language , words in self .bad_words .items ()}
53+ return bad_words
5554
56- def add_words (self , words : List [str ]):
57- """
58- Add custom profanity words to the filter.
55+ async def add_words (self : Self , words : list [str ]) -> None :
56+ """Add custom profanity words to the filter.
5957
6058 :param words: List of custom profanity words.
6159 """
62- self .custom_bad_words .update (words )
60+ self .bad_words .update (words )
6361
64- def similar (self , a : str , b : str ) -> float :
65- """
66- Compute similarity ratio between two strings.
62+ async def similar (self : Self , a : str , b : str ) -> float :
63+ """Compute similarity ratio between two strings.
6764
6865 :param a: First string.
6966 :param b: Second string.
7067 :return: Similarity ratio.
7168 """
7269 return SequenceMatcher (None , a , b ).ratio ()
7370
74- def filter_text (self , text : str , match_threshold : float = 0.8 , replace_character = None ):
75- """
76- Check if the given text contains profanity.
71+ async def filter_text (
72+ self : Self , text : str ,
73+ match_threshold : float | None = None ,
74+ replace_character : str | None = None ,
75+ ) -> bool | str :
76+ """Check if the given text contains profanity.
7777
7878 :param text: Input text to check.
7979 :param match_threshold: Threshold for similarity match.
80- :param replace_character: Character to replace profane words with. If None, return True/False.
81- :return: True if profanity found, False otherwise. If replace_character is specified, return filtered text.
80+ :param replace_character: Character to replace profane words with. If None,
81+ return True/False.
82+ :return: True if profanity found, False otherwise. If replace_character is
83+ specified, return filtered text.
8284 """
83- all_bad_words = set .union (self .custom_bad_words , * self .bad_words .values ())
84-
85- words_in_text = text .lower ().split (' ' )
86- filtered_text = text .lower ()
87- for word in words_in_text :
88- for bad_word in all_bad_words :
89- if self .similar (word , bad_word ) > match_threshold :
90- if replace_character is not None :
91- filtered_text = filtered_text .replace (word , replace_character * len (word ))
92- else :
93- return True if replace_character is None else filtered_text
94- return False if replace_character is None else filtered_text
95-
96- def get_all_languages (self ) -> List [str ]:
97- """
98- Get a list of all available languages.
85+ if not match_threshold :
86+ match_threshold = 1
87+
88+ text = text .lower ()
89+
90+ for word in text .split ():
91+ if word in self .bad_words :
92+ return True
93+
94+ if 0 < match_threshold < 1 :
95+ for bad_word in self .bad_words :
96+ if await self .similar (word , bad_word ) > match_threshold :
97+ if replace_character :
98+ return text .replace (word )
99+ return True
100+
101+ return False
102+
103+ async def get_all_languages (self : Self ) -> list [str ]:
104+ """Get a list of all available languages.
99105
100106 :return: List of all language names.
101107 """
102- return list ( self .language_files . keys ())
108+ return self .language_files
0 commit comments