"""Spam-SMS classification demo.

Pipeline: NLTK tokenisation / stop-word removal / lemmatisation, TF-IDF
features, and a multinomial naive Bayes classifier evaluated on a held-out
30% test split.
"""

import csv
import functools

import numpy as np

# NOTE: nltk and scikit-learn are imported inside the functions that use them
# so that the pure helpers below stay importable without the full NLP stack.

# Sample text used to demonstrate `preprocessing`.
text = (
    "The world is changing and time is spinning fast it's so amazing how "
    "you came into my life I know it seems all hope is gone I know you feel "
    "you can't be strong and once again the story ends with you and I And "
    "anytime you feel like you just can't go on just hold on to my love and "
    "you'll never be alone Hold on we can make it through the fire and my "
    "love "
)

# Tab-separated dataset: column 0 is the label, column 1 the message body.
file_path = r'D:\SMSSpamCollectionjs.txt'


def filter_tokens(tokens, stops, min_length=3):
    """Lowercase *tokens* and drop stop words and very short tokens.

    Lowercasing happens BEFORE the stop-word test so that capitalised stop
    words ("The", "And") are removed as well.

    Args:
        tokens: iterable of token strings.
        stops: container of lowercase stop words (a set gives O(1) lookups).
        min_length: minimum token length to keep.

    Returns:
        List of surviving lowercase tokens, in their original order.
    """
    lowered = (token.lower() for token in tokens)
    return [
        token
        for token in lowered
        if len(token) >= min_length and token not in stops
    ]


@functools.lru_cache(maxsize=1)
def _nltk_resources():
    """Return ``(stop_word_set, lemmatizer)``, built once and then reused."""
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer

    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def preprocessing(text):
    """Normalise raw text for vectorisation.

    Splits *text* into sentences and then words, lowercases the tokens,
    removes English stop words and tokens shorter than three characters,
    and lemmatises what is left.

    Args:
        text: raw input string.

    Returns:
        The surviving lemmatised tokens joined by single spaces.
    """
    import nltk

    stops, lemmatizer = _nltk_resources()
    tokens = [
        word
        for sent in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sent)
    ]
    kept = filter_tokens(tokens, stops)
    return ' '.join(lemmatizer.lemmatize(token) for token in kept)


def load_sms(path, preprocess=preprocessing):
    """Read the tab-separated SMS dataset.

    Args:
        path: location of the dataset file (UTF-8, tab-delimited).
        preprocess: callable applied to every message body.

    Returns:
        ``(messages, labels)``: two parallel lists.
    """
    messages = []
    labels = []
    with open(path, 'r', encoding='utf-8', newline='') as handle:
        for row in csv.reader(handle, delimiter='\t'):
            if len(row) < 2:
                # Blank or malformed line: nothing to classify.
                continue
            labels.append(row[0])
            messages.append(preprocess(row[1]))
    return messages, labels


def nonzero_entries(dense, max_rows=1000):
    """List the non-zero cells in the first *max_rows* rows of a 2-D array.

    Bounds come from the array itself instead of hard-coded dimensions, so
    any matrix shape is accepted.

    Args:
        dense: 2-D array-like of numbers.
        max_rows: number of leading rows to inspect.

    Returns:
        List of ``(row, column, value)`` tuples in row-major order.
    """
    window = np.asarray(dense)[:max_rows]
    rows, cols = np.nonzero(window)
    return [(int(i), int(j), window[i, j]) for i, j in zip(rows, cols)]


def main():
    """Run the full demo: preprocess, split, vectorise, train and report."""
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics import classification_report
    from sklearn.metrics import confusion_matrix
    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import MultinomialNB

    # Preprocessing demo on the sample text.
    print(preprocessing(text))

    # Read the dataset.
    sms_data, sms_label = load_sms(file_path)

    # Split into training and test sets with a 0.7 / 0.3 ratio.
    sms_data = np.array(sms_data)
    sms_label = np.array(sms_label)
    x_train, x_test, y_train, y_test = train_test_split(
        sms_data,
        sms_label,
        test_size=0.3,
        random_state=0,
        stratify=sms_label,
    )

    # Vectorise the messages.
    vectorizer = TfidfVectorizer(
        min_df=2,
        ngram_range=(1, 2),
        stop_words='english',
        strip_accents='unicode',
        norm='l2',
    )
    X_train = vectorizer.fit_transform(x_train)
    X_test = vectorizer.transform(x_test)
    print(repr(X_train))

    a = X_train.toarray()
    print(a)
    for i, j, value in nonzero_entries(a):
        print(i, j, value)

    # Naive Bayes classifier.
    clf = MultinomialNB().fit(X_train, y_train)
    y_nb_pred = clf.predict(X_test)

    # Show the classification results.
    print(y_nb_pred.shape, y_nb_pred)  # predictions for x_test
    print(confusion_matrix(y_test, y_nb_pred))
    # scikit-learn expects (y_true, y_pred); swapping them exchanges
    # precision and recall in the report.
    cr = classification_report(y_test, y_nb_pred)
    print(cr)


if __name__ == '__main__':
    main()
# Original article: https://www.cnblogs.com/tangpaiq-/p/10074122.html