首页 > 其他 > 详细

作业12

时间:2018-12-05 23:50:47      阅读:270      评论:0      收藏:0      [点我收藏+]
# Sample English text (song lyrics) used to try out preprocessing().
# The trailing spaces on the last line are part of the original sample.
text = """The world is changing
and time is spinning fast
it's so amazing how you came into my life
I know it seems all hope is gone
I know you feel you can't be strong
and once again the story ends with you and I
And anytime you feel like you just can't go on
just hold on to my love
and you'll never be alone
Hold on
we can make it through the fire
and my love   """

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Preprocessing
def preprocessing(text):
    """Normalize raw English text into a space-separated string of lemmas.

    The text is split into sentences and then words with NLTK, lowercased,
    stripped of English stop words and of tokens shorter than three
    characters, and finally lemmatized with WordNet.

    Args:
        text: The raw text to normalize.

    Returns:
        The surviving lemmas joined by single spaces (an empty string when
        nothing survives the filters).

    Note:
        Relies on NLTK's tokenizer, stop-word and WordNet resources being
        available locally.
    """
    tokens = [
        word
        for sent in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sent)
    ]
    # Build the stop-word set once so each membership test is O(1).
    stops = set(stopwords.words('english'))

    # Lowercase BEFORE stop-word filtering: the stop-word list is lowercase,
    # so capitalised words such as "The" or "And" would otherwise slip through.
    tokens = [token.lower() for token in tokens]
    tokens = [
        token for token in tokens
        if token not in stops and len(token) >= 3
    ]

    lmtzr = WordNetLemmatizer()
    return ' '.join(lmtzr.lemmatize(token) for token in tokens)


# Try the function on the sample text; the value is only echoed when this is
# run in an interactive session / notebook.
preprocessing(text)

# Read the dataset
import csv
file_path = r'D:\SMSSpamCollectionjs.txt'
sms_data = []   # preprocessed message bodies
sms_label = []  # label found in the first column of each row

# The file is tab-separated: column 0 is the label, column 1 the message.
# `with` guarantees the handle is closed even if preprocessing raises, and
# newline='' is what the csv module expects from the file object it reads.
with open(file_path, 'r', encoding='utf-8', newline='') as sms:
    csv_reader = csv.reader(sms, delimiter='\t')
    for line in csv_reader:
        # Skip blank or malformed rows that lack a message column instead of
        # crashing with IndexError.
        if len(line) < 2:
            continue
        sms_label.append(line[0])
        sms_data.append(preprocessing(line[1]))

# Split into training and test sets at a 0.7 / 0.3 ratio
import numpy as np
sms_data=np.array(sms_data)
sms_label=np.array(sms_label)

# The scikit-learn helper is named train_test_split.
# stratify keeps the label proportions the same in both partitions, and the
# fixed random_state makes the split reproducible.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    sms_data,
    sms_label,
    test_size=0.3,
    random_state=0,
    stratify=sms_label,
)

# Vectorize the messages
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over unigrams and bigrams; terms must occur in at least two
# documents. The option values are string literals ('english', 'unicode',
# 'l2'), not bare names.
vectorizer = TfidfVectorizer(
    min_df=2,
    ngram_range=(1, 2),
    stop_words='english',
    strip_accents='unicode',
    norm='l2',
)
# Fit the vocabulary on the training data only, then reuse it for the test
# data so both matrices share the same feature columns.
X_train=vectorizer.fit_transform(x_train)
X_test=vectorizer.transform(x_test)

# Inspect the training matrix (the bare expression below is only echoed in an
# interactive session / notebook).
X_train
a=X_train.toarray()
print(a)

# Print (row, column, weight) for every non-zero entry in the first 1000 rows.
# Bounds come from the array itself rather than hard-coded sizes, so a smaller
# matrix no longer raises IndexError and a wider one is not truncated.
# ndarray.nonzero() yields indices in row-major order, matching the order of
# the original nested loops.
head = a[:1000]
for i, j in zip(*head.nonzero()):
    print(i, j, a[i, j])

# Naive Bayes classifier
# The module is sklearn.naive_bayes and the class is MultinomialNB.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train, y_train)
y_nb_pred = clf.predict(X_test)

# Display the classification results
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

print(y_nb_pred.shape, y_nb_pred)  # predictions for x_test

# classification_report takes (y_true, y_pred). Passing them the other way
# round swaps precision and recall in the printed report.
cr = classification_report(y_test, y_nb_pred)
print(cr)

 

作业12

原文:https://www.cnblogs.com/tangpaiq-/p/10074122.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!