-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy patheda.py
More file actions
50 lines (33 loc) · 1.43 KB
/
eda.py
File metadata and controls
50 lines (33 loc) · 1.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import re
from pathlib import Path

import pandas as pd
# Locate the CSV relative to this file's directory rather than the current
# working directory: it lives in a "data" folder next to this file's folder.
BASE_DIR = Path(__file__).resolve().parents[0]
DATA_PATH = BASE_DIR.parent.joinpath("data", "fake_job_postings.csv")

# Load the postings into a DataFrame and show the first five rows.
df = pd.read_csv(DATA_PATH)
print(df.head(n=5))
# Each result is printed explicitly: a bare expression is only echoed in a
# notebook/REPL and would be silently discarded when run as a script.

# Dataset size as (rows, columns).
print(df.shape)

# Number of rows for each value of the 'fraudulent' column.
print(df['fraudulent'].value_counts())

# For each value of 'has_company_logo', the proportion of rows per
# 'fraudulent' value (each row of the table sums to 1).
print(pd.crosstab(df['has_company_logo'], df['fraudulent'], normalize='index'))

# Share of rows with a missing 'salary_range', per 'fraudulent' value.
print(df['salary_range'].isnull().groupby(df['fraudulent']).mean())

# Description length in characters. Missing descriptions are filled with ''
# first so they count as 0 instead of 3 (the length of the string "nan").
df['desc_length'] = df['description'].fillna('').astype(str).str.len()
print(df.groupby('fraudulent')['desc_length'].mean())
# Phrases treated as urgency cues when scoring a piece of text.
urgency_words = [
    'urgent', 'immediate', 'limited', 'apply fast',
    'hurry', 'few slots', 'act now',
]


def urgency_score(text):
    """Count how many distinct urgency phrases occur in *text*.

    The value is converted with ``str()`` and lower-cased, so matching is
    case-insensitive and non-string values (e.g. NaN) are accepted.  A
    phrase only counts when it starts at a word boundary, so "contact now"
    is not mistaken for "act now" and "unlimited" is not mistaken for
    "limited"; suffixed forms such as "urgently" still match.

    Args:
        text: Value to scan for urgency phrases.

    Returns:
        int: Number of entries of ``urgency_words`` found in the text; each
        phrase is counted at most once.
    """
    lowered = str(text).lower()
    # The re module caches compiled patterns, so building them per call is
    # cheap and always reflects the current contents of urgency_words.
    return sum(
        1 for phrase in urgency_words
        if re.search(r'\b' + re.escape(phrase), lowered)
    )
# Score every description, then print the mean score per 'fraudulent' value
# (printed explicitly so the result is visible when run as a script).
df['urgency_score'] = df['description'].apply(urgency_score)
print(df.groupby('fraudulent')['urgency_score'].mean())
# E-mail provider domains that free_email_flag() looks for.
free_domains = ['gmail.com', 'yahoo.com', 'outlook.com', 'hotmail.com']


def free_email_flag(text):
    """Report whether *text* mentions any domain listed in ``free_domains``.

    The value is converted with ``str()`` and lower-cased before a plain
    substring check, so matching is case-insensitive and non-string values
    (e.g. None or NaN) are accepted and yield False.

    Args:
        text: Value to scan for a free e-mail domain.

    Returns:
        bool: True if at least one domain occurs in the text, else False.
    """
    lowered = str(text).lower()
    return any(domain in lowered for domain in free_domains)
# Flag each company profile with free_email_flag(), then print the share of
# flagged rows per 'fraudulent' value. The verbatim second copy of the
# free_domains / free_email_flag definitions and of this computation was
# dropped: it only recomputed the same column.
df['free_email'] = df['company_profile'].apply(free_email_flag)
print(df.groupby('fraudulent')['free_email'].mean())