#!/usr/bin/python
#coding=utf-8
import urllib2
import sys, time, re
import sys
import jieba
jieba.load_userdict("userdict.txt")
import jieba.analyse
import jieba.posseg as pseg
import os
jieba.initialize()
import operator
# Python 2 only: site.py removes sys.setdefaultencoding at start-up, so the
# module has to be reloaded to get it back.  Switching the default codec to
# UTF-8 lets the rest of the script mix byte strings and unicode objects
# (e.g. ",".join(<unicode tokens>)) without raising UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding('utf8')
def count_tokens(tokens):
    """Count how often each token occurs and find the most frequent one.

    Args:
        tokens: iterable of hashable tokens (the segmentation result).

    Returns:
        A ``(counts, top_count, top_token)`` tuple: ``counts`` maps each
        token to its number of occurrences, ``top_count`` is the highest
        count seen and ``top_token`` the token holding it.  For empty input
        the result is ``({}, 0, '')``.
    """
    counts = {}
    top_count = 0
    top_token = ''
    for token in tokens:
        counts[token] = counts.get(token, 0) + 1
        # ">=" (not ">") on purpose: on a tie the token that reached the
        # running maximum most recently wins.
        if counts[token] >= top_count:
            top_count = counts[token]
            top_token = token
    return counts, top_count, top_token


def group_by_frequency(counts):
    """Group tokens by their occurrence count.

    Args:
        counts: mapping of token -> number of occurrences.

    Returns:
        A list of ``(frequency, [tokens])`` pairs sorted by ascending
        frequency.  Tokens inside a group keep the iteration order of
        ``counts``.
    """
    groups = {}
    for token, frequency in counts.items():
        # Keys of ``counts`` are already unique, so no membership test is
        # needed before appending.
        groups.setdefault(frequency, []).append(token)
    return sorted(groups.items(), key=operator.itemgetter(0))


def summarize_frequency_groups(groups):
    """Derive totals from the output of ``group_by_frequency``.

    Args:
        groups: iterable of ``(frequency, [tokens])`` pairs.

    Returns:
        A ``(distinct_total, token_total, level_sizes)`` tuple:
        ``distinct_total`` is the number of different tokens,
        ``token_total`` the number of tokens including repetitions, and
        ``level_sizes`` a list of ``(frequency, number_of_tokens)`` pairs in
        the order of ``groups``.
    """
    distinct_total = 0
    token_total = 0
    level_sizes = []
    for frequency, tokens in groups:
        # Each group contributes len(tokens) distinct tokens; adding the
        # frequency itself here would sum the frequency levels instead.
        distinct_total += len(tokens)
        token_total += len(tokens) * frequency
        level_sizes.append((frequency, len(tokens)))
    return distinct_total, token_total, level_sizes


def format_frequency_table(level_sizes, per_row=4):
    """Lay out ``(frequency, size)`` pairs as printable table lines.

    Args:
        level_sizes: sequence of ``(frequency, number_of_tokens)`` pairs.
        per_row: number of cells placed on one line (default 4).

    Returns:
        A list of lines without trailing newlines.  When the last row is
        completely filled (or there are no cells at all) an empty string is
        appended, so printing the lines always ends with the row terminated
        and, for a full last row, one blank separator line.
    """
    cells = ['频率:{0:<4} 词数:{1:>6}'.format(frequency, size)
             for frequency, size in level_sizes]
    lines = ['   '.join(cells[start:start + per_row]) + '  '
             for start in range(0, len(cells), per_row)]
    if len(cells) % per_row == 0:
        lines.append('')
    return lines


def main():
    """Segment the text in ``10.txt`` with jieba and print statistics.

    Reads ``10.txt`` from the current directory, strips all whitespace,
    prints TF-IDF keywords (without and with the stop-word list), TextRank
    keywords, a word-frequency table and the elapsed wall-clock time.
    Needs ``extra_dict/idf.txt.big`` and ``extra_dict/cn_stop_words.txt``
    relative to the current directory.
    """
    started = time.time()

    source_path = "10.txt"
    # The file is read in binary mode, so the lengths printed below are
    # byte counts of the raw data.
    with open(source_path, "rb") as source:
        content = source.read()
    print('文章长度: %d' % len(content))

    # Remove every whitespace character with a regular expression.
    stripped = re.sub(r'\s', '', content)
    print('用正则干掉所有的空白后,字符长度 %d' % len(stripped))

    # Debug aid that was disabled in the original script: dump the
    # whitespace-free text to foo.txt.
    #     with open("foo.txt", "wb") as dump:
    #         dump.write(stripped)

    # Segment the text (accurate mode, cut_all=False).
    words = list(jieba.cut(stripped, cut_all=False))
    print("分词的总数: %d" % len(words))
    print("不重复的单词数: %d" % len(set(words)))

    # TF-IDF keywords, first without and then with the stop-word list.
    # The IDF table only has to be configured once for both extractions.
    jieba.analyse.set_idf_path("extra_dict/idf.txt.big")
    tf_idf_tags = jieba.analyse.extract_tags(stripped, topK=10)
    print("TF-IDF 未去除停用词, 获取10个关键词")
    print(",".join(tf_idf_tags))

    jieba.analyse.set_stop_words("extra_dict/cn_stop_words.txt")
    tf_idf_stop_words_tags = jieba.analyse.extract_tags(stripped, topK=10)
    print("TF-IDF 去除停用词")
    print(",".join(tf_idf_stop_words_tags))

    # TextRank keywords; the number returned is jieba's default for
    # textrank() because no topK is passed.
    print("TextRank, 获取10个关键词")
    text_rank_words = jieba.analyse.textrank(stripped)
    print(",".join(text_rank_words))

    # Debug aid that was disabled in the original script: write every
    # token, UTF-8 encoded and followed by "--", to list.txt.
    #     with open('list.txt', 'wb') as dump:
    #         for word in words:
    #             dump.write(word.encode('utf-8') + '--')

    # Count the occurrences of every token.
    counts, top_count, top_token = count_tokens(words)
    print(u'分词最多重复的次数: %d 分词是: %s' % (top_count, top_token))

    # Group tokens by frequency and derive the totals.
    groups = group_by_frequency(counts)
    distinct_total, token_total, level_sizes = summarize_frequency_groups(groups)

    print('\t\t频率是单词出现的次数, 次数是出现对应次数的所有不同单词的总和')
    for line in format_frequency_table(level_sizes):
        print(line)

    print(u'一共有 %d 个不同的词或词组' % distinct_total)
    print(u'一共有 %d 个词或词组' % token_total)
    print('')
    print('')

    elapsed = time.time() - started
    print('运行时间 %s' % elapsed)


if __name__ == "__main__":
    main()
# Sample console output (jieba's start-up log) from a run of this script:
#   Building prefix dict from C:\Python27\lib\site-packages\jieba-0.36.2-py2.7.egg\jieba\dict.txt ...
#   Dumping model to file cache c:\users\og\appdata\local\temp\jieba.cache
#   Loading model cost 2.16899991035 seconds.
#   Prefix dict has been built succesfully.
# Original article: http://www.cnblogs.com/hgonlywj/p/4842689.html