# Tags: isa enc tran for .text form pst replace cat
import csv

# Tab-separated input file; each row is read as <field 0>\t<field 1>.
file_path = r'EmailData.txt'

Email_data = []    # field 1 of every row (the message text)
Email_target = []  # field 0 of every row (the label), aligned with Email_data

# The ``with`` block closes the handle even if parsing raises, and
# newline='' lets the csv module do its own line-ending handling.
with open(file_path, 'r', encoding='utf-8', newline='') as EmailData:
    csv_reader = csv.reader(EmailData, delimiter='\t')
    for line in csv_reader:
        # Blank or truncated rows have no text column; indexing them would
        # raise IndexError, so they are skipped.
        if len(line) < 2:
            continue
        Email_data.append(line[1])
        Email_target.append(line[0])
def _clean_text(text):
    """Return *text* reduced to its alphabetic words longer than 3 characters.

    Every non-letter character (digits, punctuation, whitespace) is turned
    into a space first, so a word such as ``"hello,"`` is kept as ``"hello"``
    instead of being discarded for containing punctuation. The surviving
    words are joined back together with single spaces.
    """
    # Build the cleaned string in one pass. Replacing characters one at a
    # time on the *original* string would keep only the last replacement.
    letters_only = ''.join(ch if ch.isalpha() else ' ' for ch in text)
    # split() with no argument drops the empty strings produced by runs of
    # spaces, so only the length filter remains to be applied.
    return ' '.join(word for word in letters_only.split() if len(word) > 3)


# One cleaned, space-joined string per input message.
Email_data_clear2 = [_clean_text(line) for line in Email_data]
# Both names refer to the same cleaned list, as in the original script.
Email_data_clear = Email_data_clear2
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing. The split is stratified on the
# labels and seeded with random_state=0 so it is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    Email_data_clear2,
    Email_target,
    test_size=0.3,
    random_state=0,
    stratify=Email_target,
)
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that same fitted vectorizer.
X_train = tfidf.fit_transform(x_train)
X_test = tfidf.transform(x_test)
import numpy as np

# Convert the vectorizer output to dense arrays for the steps that follow.
X_train = X_train.toarray()
X_test = X_test.toarray()

# Print "row column value" for every non-zero training weight.
# np.nonzero yields the indices in row-major order, i.e. the same order a
# nested row/column loop would visit them, without touching every cell from
# Python.
for i, j in zip(*np.nonzero(X_train)):
    print(i, j, X_train[i, j])
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
# fit() hands back the fitted estimator, which is kept under the name
# `module` and used for prediction.
module = gnb.fit(X_train, y_train)
y_predict = module.predict(X_test)
from sklearn.metrics import classification_report

# The signature is classification_report(y_true, y_pred): ground truth goes
# first. Passing the predictions first swaps precision with recall and
# reports support counts of the predictions instead of the true labels.
cr = classification_report(y_test, y_predict)
print(cr)
# Tags: isa enc tran for .text form pst replace cat
# Original article: https://www.cnblogs.com/z233/p/10079664.html