Skip to content

Commit 14cb4a2

Browse files
authored
Merge pull request #1 from iamlostshe/master
Many updates
2 parents a2865eb + 74f4d8a commit 14cb4a2

8 files changed

Lines changed: 3688 additions & 1919 deletions

File tree

.gitignore

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
11
.libtest
22
.vscode
3-
__pycache__/
3+
.python-version
4+
__pycache__/
5+
.ruff_cache
6+
.mypy_cache
7+
*test*

README.md

Lines changed: 44 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -41,24 +41,34 @@ pip3 install git+https://github.com/FlacSy/badwords.git
4141
### Инициализация
4242

4343
```python
44-
ProfanityFilter(languages: List[str] = None, all_languages: bool = False)
44+
p = ProfanityFilter()
45+
46+
p.init(languages: List[str] | None = None)
4547
```
4648

4749
#### Параметры
4850

49-
- `languages` (список строк, необязательно): Список языков, для которых будут загружены слова нецензурной лексики. Если не указано, будут использованы все доступные языки при установке флага `all_languages` в `True`.
50-
- `all_languages` (логическое значение, необязательно): Флаг для загрузки слов нецензурной лексики для всех доступных языков. По умолчанию `False`.
51+
- `languages` (список строк, необязательно): Список языков, для которых будут загружены слова нецензурной лексики. Если не указано, будут использованы все доступные языки.
5152

5253
### Примеры использования
5354

5455
```python
56+
import asyncio
57+
5558
from badwords import ProfanityFilter
5659

57-
# Инициализация с использованием английского и испанского языков
58-
my_filter = ProfanityFilter(languages=['en', 'sp'])
5960

60-
# Инициализация с использованием всех доступных языков
61-
my_filter_all = ProfanityFilter(all_languages=True)
61+
async def main() -> None:
62+
# Инициализация с использованием английского и испанского языков
63+
_filter = ProfanityFilter()
64+
await _filter.init(["en", "sp"])
65+
66+
# Инициализация с использованием всех доступных языков
67+
await _filter.init()
68+
69+
70+
if __name__ == "__main__":
71+
asyncio.run(main())
6272
```
6373

6474
### Методы
@@ -74,7 +84,7 @@ my_filter_all = ProfanityFilter(all_languages=True)
7484
##### Пример
7585

7686
```python
77-
language_files = my_filter.initialize_language_files()
87+
language_files = await _filter.initialize_language_files()
7888
print(language_files)
7989
```
8090

@@ -89,25 +99,10 @@ print(language_files)
8999
##### Пример
90100

91101
```python
92-
bad_words = my_filter.initialize_bad_words()
102+
bad_words = await _filter.initialize_bad_words()
93103
print(bad_words)
94104
```
95105

96-
#### `compile_patterns()`
97-
98-
Компиляция регулярных выражений для слов нецензурной лексики.
99-
100-
##### Возвращаемое значение
101-
102-
- Словарь, который сопоставляет имена языков с компилированными регулярными выражениями.
103-
104-
##### Пример
105-
106-
```python
107-
patterns = my_filter.compile_patterns()
108-
print(patterns)
109-
```
110-
111106
#### `add_words(words: List[str])`
112107

113108
Добавление пользовательских слов нецензурной лексики в фильтр.
@@ -119,7 +114,7 @@ print(patterns)
119114
##### Пример
120115

121116
```python
122-
my_filter.add_words(['customword1', 'customword2'])
117+
await _filter.add_words(["customword1", "customword2"])
123118
```
124119

125120
#### `similar(a: str, b: str)`
@@ -153,11 +148,11 @@ my_filter.add_words(['customword1', 'customword2'])
153148

154149
```python
155150
# Проверка на наличие нецензурной лексики
156-
contains_profanity = my_filter.filter_text('This is some bad text', match_threshold=0.9)
151+
contains_profanity = await _filter.filter_text("This is some bad text", match_threshold=0.9)
157152
print(contains_profanity) # True или False
158153

159154
# Проверка на наличие нецензурной лексики с заменой
160-
filtered_text = my_filter.filter_text('This is some bad text', replace_character='*')
155+
filtered_text = await _filter.filter_text("This is some bad text", replace_character="*")
161156
print(filtered_text) # Текст с заменёнными непристойными словами
162157
```
163158

@@ -172,8 +167,8 @@ print(filtered_text) # Текст с заменёнными непристой
172167
##### Пример
173168

174169
```python
175-
all_languages = my_filter.get_all_languages()
176-
print(all_languages) # ['en', 'sp', 'fr', 'de', ...]
170+
all_languages = await _filter.get_all_languages()
171+
print(all_languages) # ["en", "sp", "fr", "de", ...]
177172
```
178173

179174
## Поддерживаемые языки
@@ -209,16 +204,26 @@ print(all_languages) # ['en', 'sp', 'fr', 'de', ...]
209204
## Полный пример использования
210205

211206
```python
207+
import asyncio
208+
212209
from badwords import ProfanityFilter
213210

214-
# Создаем экземпляр фильтра, указывая нужные языки
215-
my_filter = ProfanityFilter(languages=['en', 'sp'])
211+
212+
async def main() -> None:
213+
# Создаем экземпляр фильтра, указывая нужные языки
214+
_filter = ProfanityFilter()
215+
await _filter.init(["en", "sp"])
216+
217+
text = "Text with inappropriate words"
218+
219+
await check_profanity(_filter, text)
220+
await check_profanity_with_replace(_filter, text)
216221

217222
# Функция для проверки текста на наличие нецензурной лексики
218-
def check_profanity(text: str):
219-
result = my_filter.filter_text(
223+
async def check_profanity(_filter: ProfanityFilter, text: str) -> None:
224+
result = await _filter.filter_text(
220225
text=text,
221-
match_threshold=0.9
226+
match_threshold=0.9,
222227
)
223228

224229
if result:
@@ -227,16 +232,15 @@ def check_profanity(text: str):
227232
print("Этот текст не содержит нецензурной лексики.")
228233

229234
# Функция для проверки текста на наличие нецензурной лексики с заменой
230-
def check_profanity_with_replace(text: str):
231-
result = my_filter.filter_text(
235+
async def check_profanity_with_replace(_filter: ProfanityFilter, text: str) -> None:
236+
result = await _filter.filter_text(
232237
text=text,
233238
match_threshold=0.8,
234-
replace_character="*"
239+
replace_character="*",
235240
)
236241

237242
print(result)
238243

239244
if __name__ == "__main__":
240-
check_profanity("Text with inappropriate words")
241-
check_profanity_with_replace("Text with inappropriate words")
242-
```
245+
asyncio.run(main())
246+
```

badwords/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
"""A library for effective moderation of content."""
2+
13
from .check import ProfanityFilter
24

3-
__all__ = ['ProfanityFilter']
5+
__all__ = ["ProfanityFilter"]

badwords/check.py

Lines changed: 72 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -1,102 +1,108 @@
1-
import os
2-
import re
3-
from typing import List, Dict, Set
1+
"""Module for checking text for badwords."""
2+
3+
from __future__ import annotations
4+
45
from difflib import SequenceMatcher
6+
from pathlib import Path
7+
from typing import Self
8+
9+
from .exceptions import NotSupportedLanguage
10+
511

612
class ProfanityFilter:
7-
"""
8-
A class for filtering profanity from text.
9-
"""
13+
"""A class for filtering profanity from text."""
1014

11-
def __init__(self, languages: List[str] = None, all_languages: bool = False):
12-
"""
13-
Initialize the profanity filter.
15+
async def init(self: Self,
16+
languages: list[str] | None = None,
17+
) -> None:
18+
"""Initialize the profanity filter.
1419
1520
:param languages: List of languages to load profanity words for.
1621
:param all_languages: Flag to load profanity words for all available languages.
1722
"""
18-
self.script_dir = os.path.dirname(os.path.realpath(__file__))
19-
self.language_files: Dict[str, str] = self.initialize_language_files()
20-
self.languages = languages or list(self.language_files.keys()) if all_languages else languages
21-
self.bad_words: Dict[str, Set[str]] = self.initialize_bad_words()
22-
self.patterns: Dict[str, re.Pattern] = self.compile_patterns()
23-
self.custom_bad_words: Set[str] = set()
24-
25-
def initialize_language_files(self) -> Dict[str, str]:
26-
"""
27-
Initialize language files.
23+
self.resource_dir = Path(__file__).parent / "resource"
24+
25+
self.language_files = await self.initialize_language_files()
26+
27+
if languages:
28+
if all(i in self.language_files for i in languages):
29+
self.language_files = languages
30+
else:
31+
raise NotSupportedLanguage
32+
33+
self.bad_words = await self.initialize_bad_words()
34+
35+
async def initialize_language_files(self: Self) -> list[str]:
36+
"""Initialize language files.
2837
2938
:return: List of language codes derived from the file names in the resource directory.
3039
"""
31-
resource_dir = os.path.join(self.script_dir, 'resource')
32-
return {os.path.splitext(filename)[0]: os.path.join(resource_dir, filename) for filename in os.listdir(resource_dir)}
40+
return [str(path)[-6:-4] for path in (self.resource_dir).iterdir()]
3341

34-
def initialize_bad_words(self) -> Dict[str, Set[str]]:
35-
"""
36-
Initialize profanity words for each language.
42+
async def initialize_bad_words(self: Self) -> set[str]:
43+
"""Initialize profanity words for each language.
3744
3845
:return: Set of profanity words merged from the files of all selected languages.
3946
"""
40-
bad_words = {}
41-
for language in self.languages:
42-
file_path = self.language_files.get(language)
43-
if file_path:
44-
with open(file_path, 'r', encoding='utf-8') as file:
45-
bad_words[language] = {line.strip() for line in file}
46-
return bad_words
47+
bad_words = set()
4748

48-
def compile_patterns(self) -> Dict[str, re.Pattern]:
49-
"""
50-
Compile regular expression patterns for profanity words.
49+
for lang in self.language_files:
50+
with (self.resource_dir / f"{lang}.bdw").open(encoding="utf-8") as f:
51+
bad_words.update(f.read().split())
5152

52-
:return: Dictionary mapping language names to compiled regex patterns.
53-
"""
54-
return {language: re.compile(r'\b(?:' + '|'.join(map(re.escape, words)) + r')\b', re.IGNORECASE) for language, words in self.bad_words.items()}
53+
return bad_words
5554

56-
def add_words(self, words: List[str]):
57-
"""
58-
Add custom profanity words to the filter.
55+
async def add_words(self: Self, words: list[str]) -> None:
56+
"""Add custom profanity words to the filter.
5957
6058
:param words: List of custom profanity words.
6159
"""
62-
self.custom_bad_words.update(words)
60+
self.bad_words.update(words)
6361

64-
def similar(self, a: str, b: str) -> float:
65-
"""
66-
Compute similarity ratio between two strings.
62+
async def similar(self: Self, a: str, b: str) -> float:
63+
"""Compute similarity ratio between two strings.
6764
6865
:param a: First string.
6966
:param b: Second string.
7067
:return: Similarity ratio.
7168
"""
7269
return SequenceMatcher(None, a, b).ratio()
7370

74-
def filter_text(self, text: str, match_threshold: float = 0.8, replace_character=None):
75-
"""
76-
Check if the given text contains profanity.
71+
async def filter_text(
72+
self: Self, text: str,
73+
match_threshold: float | None = None,
74+
replace_character: str | None = None,
75+
) -> bool | str:
76+
"""Check if the given text contains profanity.
7777
7878
:param text: Input text to check.
7979
:param match_threshold: Threshold for similarity match.
80-
:param replace_character: Character to replace profane words with. If None, return True/False.
81-
:return: True if profanity found, False otherwise. If replace_character is specified, return filtered text.
80+
:param replace_character: Character to replace profane words with. If None,
81+
return True/False.
82+
:return: True if profanity found, False otherwise. If replace_character is
83+
specified, return filtered text.
8284
"""
83-
all_bad_words = set.union(self.custom_bad_words, *self.bad_words.values())
84-
85-
words_in_text = text.lower().split(' ')
86-
filtered_text = text.lower()
87-
for word in words_in_text:
88-
for bad_word in all_bad_words:
89-
if self.similar(word, bad_word) > match_threshold:
90-
if replace_character is not None:
91-
filtered_text = filtered_text.replace(word, replace_character * len(word))
92-
else:
93-
return True if replace_character is None else filtered_text
94-
return False if replace_character is None else filtered_text
95-
96-
def get_all_languages(self) -> List[str]:
97-
"""
98-
Get a list of all available languages.
85+
if not match_threshold:
86+
match_threshold = 1
87+
88+
text = text.lower()
89+
90+
for word in text.split():
91+
if word in self.bad_words:
92+
return True
93+
94+
if 0 < match_threshold < 1:
95+
for bad_word in self.bad_words:
96+
if await self.similar(word, bad_word) > match_threshold:
97+
if replace_character:
98+
return text.replace(word)
99+
return True
100+
101+
return False
102+
103+
async def get_all_languages(self: Self) -> list[str]:
104+
"""Get a list of all available languages.
99105
100106
:return: List of all language names.
101107
"""
102-
return list(self.language_files.keys())
108+
return self.language_files

badwords/exceptions.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
"""Exceptions module."""
2+
3+
from typing import Self
4+
5+
6+
class NotSupportedLanguage(BaseException):
7+
"""Unsupported language check."""
8+
9+
def __str__(self: Self) -> str:
10+
"""String-like representation of exception."""
11+
return "This language is not supported"

0 commit comments

Comments
 (0)