-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathanalysis.py
More file actions
65 lines (52 loc) · 1.92 KB
/
analysis.py
File metadata and controls
65 lines (52 loc) · 1.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
"""
A set of analysis functions, in order to address the required questions
"""
from matplotlib.artist import Artist
import pandas as pd
from collections.abc import Sequence
import logging
import configparser
import tokenization
# Load runtime settings from the INI file. The path is relative to the
# current working directory, not to this module's location.
config = configparser.ConfigParser()
config.read("./config/config.ini")
# Options of the [FILTERS] section, copied into a plain dict.
FILTERS = dict(config["FILTERS"])
# Results folder name, built from the filters by the tokenization module.
RESULTS_DIR = tokenization.form_results_folder_name(FILTERS)
# Directory settings read from the [DIR_PATH] section.
TGT_DIR = config["DIR_PATH"]["TGT_DIR"]
TOKENIZED_DIR = config["DIR_PATH"]["TOKENIZED_DIR"]
# NOTE(review): the logging setup below is disabled. Unless another imported
# module configures logging, the root logger stays at its default WARNING
# level and the logging.info() calls in main() emit nothing -- confirm
# whether this is intended.
# logging.basicConfig(
# filename=f"{config['DIR_PATH']['LOG_DIR']}/results.log",
# format="%(levelname)s:%(funcName)s:%(asctime)s:%(message)s",
# level=logging.INFO,
# )
def extract_words_to_percentage(
    words_df: pd.DataFrame, required_percentages: Sequence
) -> Sequence:
    """
    For each requested threshold, count the words that fall below it.

    A running total of the ``frequency`` column is written to a
    ``freq_cumsum`` column of ``words_df`` (the frame is modified in place).
    For every threshold, the rows whose running total is strictly less than
    the threshold are counted, and the True/False breakdown is printed.

    NOTE(review): thresholds are compared directly against the cumulative
    sum, so ``frequency`` presumably already holds percentage shares --
    confirm against the code that produces the input table.

    Args:
        words_df: Frame with a numeric ``frequency`` column.
        required_percentages: Thresholds to evaluate, in the same unit as
            the cumulative ``frequency`` values.

    Returns:
        A list with one integer per threshold, in input order: the number
        of rows whose cumulative frequency is strictly below that threshold.
        An empty ``required_percentages`` yields an empty list.
    """
    words_df["freq_cumsum"] = words_df["frequency"].cumsum()
    word_counts = []
    for percentage in required_percentages:
        below_threshold = words_df["freq_cumsum"] < percentage
        print(f"Percentage: {percentage}")
        # Keep printing the same True/False breakdown as before.
        print(below_threshold.value_counts())
        # Summing a boolean mask counts its True entries; return the result
        # so callers no longer have to parse printed output.
        word_counts.append(int(below_threshold.sum()))
    return word_counts
def n_gram_analysis(data: Sequence, n_grams: int = 2) -> list:
    """
    Build every n-gram (as a list of words) from the given articles.

    Each article's text is read from ``article["article_content"]["content"]``
    and split on single spaces. Consecutive spaces therefore yield empty
    strings, which are kept as words. A window of ``n_grams`` words is slid
    over each article separately, so no n-gram spans two articles.

    Args:
        data: Iterable of article mappings, each holding its text under
            ``["article_content"]["content"]``.
        n_grams: Number of words per n-gram; must be at least 1.

    Returns:
        A flat list of n-grams, each a list of ``n_grams`` words, in article
        order and then word order. Articles with fewer than ``n_grams``
        words contribute nothing.

    Raises:
        ValueError: If ``n_grams`` is smaller than 1.
    """
    if n_grams < 1:
        raise ValueError(f"n_grams must be a positive integer, got {n_grams}")
    n_grams_lists = []
    for article in data:
        single_words = article["article_content"]["content"].split(" ")
        # Number of full windows. The earlier slice ``[: -(n_grams - 1)]``
        # collapsed to ``[:0]`` for n_grams == 1 and produced no unigrams.
        window_count = len(single_words) - n_grams + 1
        n_grams_lists.extend(
            single_words[start : start + n_grams]
            for start in range(window_count)
        )
    return n_grams_lists
def main():
    """
    Log a start-up banner, load the word table and report threshold counts.

    Reads ``allWords.csv`` from the results folder under the target
    directory and passes it to ``extract_words_to_percentage`` with the
    thresholds 10, 20 and 50.
    """
    banner = "/**/" * 20
    for message in (banner, "Fresh new start"):
        logging.info(message)
    words_csv_path = f"{TGT_DIR}/{RESULTS_DIR}/allWords.csv"
    word_table = pd.read_csv(words_csv_path)
    extract_words_to_percentage(word_table, [10, 20, 50])


# Run the analysis only when this file is executed as a script.
if __name__ == "__main__":
    main()