# English word-frequency statistics.
#
# Reads a UTF-8 text file, normalises it (lower-case, punctuation removed),
# counts the words and prints the most frequent ones with function words
# (articles, pronouns, conjunctions) left out of the final ranking.

# Characters deleted from the text before it is split into words.
# NOTE(review): the pasted separator literal reads as two adjacent string
# literals that concatenate to this value; if quote characters were meant to
# be stripped too, add them here.
ENGLISH_PUNCTUATION = '.,:;?!-_'

# Grammatical words that carry no meaning of their own. They are compared
# against lower-cased words, which is why the text is lower-cased first.
ENGLISH_STOP_WORDS = frozenset({'a', 'and', 'the', 'i', 'you'})


def split_english_words(text):
    """Return the words of *text*, lower-cased and with punctuation deleted.

    Every character in ``ENGLISH_PUNCTUATION`` is removed (not replaced by a
    space) and the result is split on runs of whitespace. An empty or
    whitespace-only *text* yields an empty list.
    """
    # One translate() pass replaces the chain of per-character replace() calls.
    strip_table = str.maketrans('', '', ENGLISH_PUNCTUATION)
    return text.lower().translate(strip_table).split()


def rank_english_words(words, exclude=ENGLISH_STOP_WORDS, top=20):
    """Return up to *top* ``(word, count)`` pairs, most frequent first.

    Args:
        words: iterable of already normalised words.
        exclude: words to leave out of the ranking.
        top: maximum number of pairs to return; ``None`` returns all of them.

    Words with equal counts keep the order in which they first appeared, so
    the result is the same on every run. Fewer than *top* distinct words is
    not an error: the list is simply shorter.
    """
    from collections import Counter

    counts = Counter(word for word in words if word not in exclude)
    return counts.most_common(top)


def report_english_word_frequency(path='yingwen.txt', top=20):
    """Print each stage of the word-frequency analysis of the file at *path*.

    Printed in order: the raw text, the word list, the distinct words, the
    per-word counts, the full ranking, the vocabulary without stop words, and
    finally the *top* most frequent non-stop-words.
    """
    from collections import Counter

    # The context manager closes the file even if reading fails.
    with open(path, 'r', encoding='utf-8') as fo:
        text = fo.read()
    print(text)

    # Split the text into a list of words.
    words = split_english_words(text)
    print(len(words), words)

    # Distinct words.
    vocabulary = set(words)
    print(len(vocabulary), vocabulary)

    # Count every word in a single pass over the list.
    counts = Counter(words)
    print(len(counts), dict(counts))

    # Full ranking by descending frequency.
    print(counts.most_common())

    # Vocabulary with the grammatical words removed.
    meaningful = vocabulary - ENGLISH_STOP_WORDS
    print(len(meaningful), meaningful)

    # Top entries, stop words excluded.
    for entry in rank_english_words(words, top=top):
        print(entry)


if __name__ == '__main__':
    report_english_word_frequency()
# Chinese frequency statistics.
#
# Reads a UTF-8 text file and counts how often each character occurs. The
# text is not segmented into words: every single character is one unit.

# Punctuation left out of the count. Both the ASCII forms and their
# full-width counterparts are listed, since Chinese text normally uses the
# full-width ones.
CHINESE_PUNCTUATION = ',。?!;:‘’“”' + ',?!;:'


def count_chinese_chars(text, punctuation=CHINESE_PUNCTUATION):
    """Return a ``Counter`` mapping each character of *text* to its count.

    Characters listed in *punctuation* and all whitespace (line breaks,
    spaces, the ideographic space U+3000) are skipped so that they cannot
    show up in the ranking. An empty *text* yields an empty counter.
    """
    from collections import Counter

    skipped = set(punctuation)
    # A single pass over the text; calling text.count() once per distinct
    # character would rescan the whole text every time.
    return Counter(
        ch for ch in text if ch not in skipped and not ch.isspace()
    )


def report_chinese_char_frequency(path='水浒传.txt', top=20):
    """Print each stage of the character-frequency analysis of *path*.

    Printed in order: the raw text, the distinct characters, the
    per-character counts, the full ranking, and the *top* most frequent
    characters. Fewer than *top* distinct characters is not an error.
    """
    # The context manager closes the file even if reading fails.
    with open(path, 'r', encoding='utf-8') as fo:
        text = fo.read()
    print(text)

    counts = count_chinese_chars(text)

    # Distinct characters.
    distinct = set(counts)
    print(len(distinct), distinct)

    # Per-character counts.
    print(len(counts), dict(counts))

    # Ranking by descending frequency; ties keep first-seen order.
    ranked = counts.most_common()
    print(ranked)

    # Slicing never raises, unlike indexing with range(top).
    for entry in ranked[:top]:
        print(entry)


if __name__ == '__main__':
    report_chinese_char_frequency()
# Original post: https://www.cnblogs.com/XLxielin/p/9720391.html