简单描述程序功能:python+flask
1.停用词为csv文件
2.源文件为txt文件
3.文本处理:将源文件中出现的停用词去除
4.根据用户在 Web 表单中的输入,检索出源文件中包含该输入内容的句子
代码实现:
1.文件读取,分词,源文件词频统计
Python 读取西班牙语文本时使用的编码:encoding='ISO-8859-1'
# Read the stop-word CSV file; ISO-8859-1 is used because the text is Spanish.
def csvfile(file_path=None):
    """Load the stop-word CSV file as a list of row dictionaries.

    The first row of the file supplies the dictionary keys; every following
    non-blank row becomes one plain ``dict`` mapping those keys to the row's
    values.

    Args:
        file_path: Optional path of the CSV file. When omitted, the file
            ``SpanishStopWords.csv`` inside the module-level ``upload_path``
            directory is read.

    Returns:
        A list with one ``dict`` per data row. An empty file yields ``[]``.
    """
    if file_path is None:
        file_path = os.path.join(upload_path, "SpanishStopWords.csv")
    # newline="" lets the csv module do its own line-ending handling, as the
    # csv documentation recommends for files passed to a reader.
    with open(file_path, "r", encoding="ISO-8859-1", newline="") as f:
        # With no explicit fieldnames, DictReader takes its keys from the
        # first row, so no separate header-reading step is needed.
        return [dict(row) for row in csv.DictReader(f)]


# Read the source text file and count how often each word occurs.
def eachcount(file_path=None):
    """Count word occurrences in the source text file.

    Commas and full stops are treated as separators, then the text is split
    on whitespace. No case folding or other normalisation is applied.

    Args:
        file_path: Optional path of the text file. When omitted, the file
            ``Alamo.txt`` inside the module-level ``upload_path`` directory
            is read.

    Returns:
        A list of ``(word, count)`` tuples sorted by count in ascending
        order (least frequent first). Reverse the list to get the most
        frequent words first.
    """
    if file_path is None:
        file_path = os.path.join(upload_path, "Alamo.txt")
    # The context manager closes the handle as soon as the text is read.
    with open(file_path, "r", encoding="ISO-8859-1") as f:
        text = f.read()

    # Tokenise: punctuation becomes whitespace, then split on whitespace.
    tokens = text.replace(",", " ").replace(".", " ").split()

    counts = {}
    for token in tokens:
        # A word seen for the first time starts from zero.
        counts[token] = counts.get(token, 0) + 1

    # Sort the (word, count) pairs by the count, smallest first.
    return sorted(counts.items(), key=lambda pair: pair[1])
2.显示在源文件中出现过的所有停用词
# Show every stop word that also occurs in the source document.
@application.route('/listsearch/', methods=['GET', 'POST'])
def listsearch():
    """Render ``index.html`` with the stop words found in the source document.

    The stop-word file is read as plain text and split on whitespace. Each
    resulting token is compared against every item returned by ``docu2()``
    (defined elsewhere; presumably the words of the source document, to be
    confirmed against that function).

    A stop word is appended once for every item it equals, so a word that
    occurs several times in the document appears several times in the
    rendered list.

    Returns:
        The rendered ``index.html`` template, with the matches passed as
        ``result2``.
    """
    file_path = os.path.join(upload_path, "SpanishStopWords.csv")
    # The context manager closes the handle as soon as the text is read.
    with open(file_path, 'r', encoding='ISO-8859-1') as f:
        stop_words = f.read().split()

    document_words = docu2()

    result2 = [
        stop_word
        for stop_word in stop_words
        for document_word in document_words
        if stop_word == document_word
    ]
    return render_template('index.html', result2=result2)
前端代码展现:
{# Stop-word search: the button requests /listsearch/ and the matches are rendered from result2. #}
<form action="/listsearch/" method="get">
    <button type="submit" value="submit">search</button>
    <p>result</p>
    {% for line2 in result2 %}
    <p>{{ line2 }}</p>
    {% endfor %}
</form>
3.显示源文件中所有含有数字的句子
@application.route('/test1/', methods=['GET', 'POST'])
def test1():
    """Render ``index.html`` with every sentence of Alamo.txt containing a digit.

    The text is split on ``'.'`` to obtain sentences, so a full stop inside
    a number (for example a decimal) also ends a sentence.

    Returns:
        The rendered ``index.html`` template, with the matching sentences
        passed as ``result9``.
    """
    file_path = os.path.join(upload_path, "Alamo.txt")
    # The context manager closes the handle as soon as the text is read.
    with open(file_path, 'r', encoding='ISO-8859-1') as f:
        sentences = f.read().split('.')

    # re.search scans the whole sentence. An anchored '.*[0-9].*' match would
    # miss digits that follow a line break, because '.' does not match '\n'.
    result2 = [
        sentence for sentence in sentences
        if re.search(r'[0-9]', sentence)
    ]
    return render_template('index.html', result9=result2)
4.用户通过 Web 表单输入关键词,程序根据该输入显示源文件中包含该关键词的句子。
@application.route('/test2/', methods=['GET', 'POST'])
def test2():
    """Render ``index.html`` with the sentences of Alamo.txt containing a word.

    The search term is read from the ``word10`` query-string parameter and
    matched as a plain, case-sensitive substring of each sentence. Sentences
    are obtained by splitting the text on ``'.'``.

    Returns:
        The rendered ``index.html`` template, with the matching sentences
        passed as ``result10``. The list is empty when no search term was
        supplied.
    """
    word = request.args.get("word10")
    # A missing parameter is None, which would make the `in` test raise
    # TypeError. An empty string would match every sentence. Both are
    # treated as "no query".
    if not word:
        return render_template('index.html', result10=[])

    file_path = os.path.join(upload_path, "Alamo.txt")
    # The context manager closes the handle as soon as the text is read.
    with open(file_path, 'r', encoding='ISO-8859-1') as f:
        sentences = f.read().split('.')

    result2 = [sentence for sentence in sentences if word in sentence]
    return render_template('index.html', result10=result2)
前端代码展现:
{# Sentences containing a digit: the button requests /test1/ and the results are rendered from result9. #}
<form action="/test1/" method="get">
    <button type="submit" value="submit">submit</button>
    {% for li in result9 %}
    <p>{{ li }}</p>
    {% endfor %}
</form>

{# Keyword search: the text box is sent to /test2/ as the word10 query parameter; results are rendered from result10. #}
<form action="/test2/" method="get">
    <input type="text" placeholder="word" name="word10">
    <button type="submit" value="submit">submit</button>
    {% for li in result10 %}
    <p>{{ li }}</p>
    {% endfor %}
</form>
原文:https://www.cnblogs.com/bocaimao/p/13339049.html