-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCode
More file actions
77 lines (47 loc) · 1.86 KB
/
Code
File metadata and controls
77 lines (47 loc) · 1.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
# Load the labelled messages. The script expects "spam.csv" in the working
# directory with (at least) the columns "Category" and "Message".
spam_df = pd.read_csv("spam.csv")

# Inspect the raw data. print() is used because a bare expression displays
# nothing when the file is run as a script.
print(spam_df)

# Summary statistics per label in "Category".
print(spam_df.groupby("Category").describe())

# Encode the label numerically in a new "spam" column: 1 where Category is
# exactly 'spam', 0 for everything else. The vectorised comparison yields the
# same values as a row-by-row apply(lambda ...) without the Python-level loop.
spam_df["spam"] = (spam_df["Category"] == "spam").astype("int64")

# The numeric "spam" column is what the classifier uses as its target.
print(spam_df)
# ---------------------------------------------------------------------------
# Create a train/test split.
# Features (x) are the raw message texts; labels (y) are the 0/1 "spam" column.
# test_size=0.25 is scikit-learn's default, spelled out so it is easy to
# adjust (e.g. 0.50 or 0.75). random_state pins the shuffle so repeated runs
# produce the same split, and therefore the same vocabulary size and accuracy.
x_train, x_test, y_train, y_test = train_test_split(
    spam_df.Message,
    spam_df.spam,
    test_size=0.25,
    random_state=42,
)

# Training messages; the word counts derived from them become the features.
print(x_train)

# count / unique / top / freq of the training messages. The original author
# noted 4179 training messages for their copy of the dataset.
print(x_train.describe())
# Turn each training message into a row of word counts.
cv = CountVectorizer()
# fit_transform learns the vocabulary from the training messages and returns
# the message-by-word count matrix in a single step.
x_train_count = cv.fit_transform(x_train.values)

# Show the matrix summary (shape, dtype, stored elements). repr() is used
# because print() on a sparse matrix lists individual entries instead.
# The original author observed 4179 rows and 7529 unique words for their split.
print(repr(x_train_count))

# Peek at a few rows in dense form. Only a small slice is densified: calling
# toarray() on the whole matrix allocates rows x vocabulary cells just to
# discard the result.
print(x_train_count[:5].toarray())
# Fit a multinomial Naive Bayes classifier on the training word counts and
# their 0/1 labels. fit() returns the estimator itself, so construction and
# training are chained into a single assignment.
model = MultinomialNB().fit(x_train_count, y_train)
# Sanity checks on hand-written messages before scoring the test set.
# New text goes through cv.transform (not fit_transform) so it is encoded
# with the vocabulary learned from the training data. The predictions are
# printed so they are visible when the file is run as a script.

# Intended as a ham example (label 0).
email_ham = ["could you help me ?"]
email_ham_count = cv.transform(email_ham)
print("ham sample prediction:", model.predict(email_ham_count))

# Intended as a spam example (label 1).
email_spam = ["free"]
email_spam_count = cv.transform(email_spam)
print("spam sample prediction:", model.predict(email_spam_count))
# Evaluate on the held-out messages: encode them with the training vocabulary,
# then report the classifier's score (mean accuracy on the test set).
x_test_count = cv.transform(x_test)
accuracy = model.score(x_test_count, y_test)
print(f"test accuracy: {accuracy:.4f}")