记录训练word2vec流程,以做备忘。代码如下:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
'''
训练word2vec词向量
1. 后期需要单独处理[PAD]与[UNK]等特殊字符
2. 后期需要注意未登录词的处理
'''
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
# import os
# os.environ['CUDA_VISIBLE_DEVICES'] = "0"
def _write_char_separated(src, dst):
    """Copy the two text columns of every row in *src* to *dst*, one character per token.

    Each non-blank row must look like ``text1<TAB>text2`` or
    ``text1<TAB>text2<TAB>label``; the label, when present, is discarded.
    """
    for line_no, raw_line in enumerate(src, start=1):
        row = raw_line.rstrip('\r\n')
        if not row:
            # Blank lines (typically a trailing newline at EOF) carry no text.
            continue
        fields = row.split('\t')
        if len(fields) not in (2, 3):
            raise ValueError(
                'expected 2 or 3 tab-separated fields, got %d (%s, line %d)'
                % (len(fields), getattr(src, 'name', '<input>'), line_no)
            )
        # Joining a string inserts the separator between its characters,
        # which is exactly the "one character per token" corpus format.
        dst.write(' '.join(fields[0] + fields[1]) + '\n')


def get_total_text_file(train_file, dev_file, test_file, output_file):
    """Build the word2vec training corpus from the train/dev/test TSV files.

    Every input row holds two texts (and optionally a label) separated by
    tabs. For each row the two texts are concatenated and written to
    ``output_file`` as one line of space-separated characters. Rows are
    written in the order train, dev, test. To train on words instead of
    characters, the inputs would need to be word-segmented beforehand.

    Args:
        train_file: Path of the UTF-8 training TSV file.
        dev_file: Path of the UTF-8 development TSV file.
        test_file: Path of the UTF-8 test TSV file.
        output_file: Path of the corpus file to create (overwritten if it exists).

    Raises:
        ValueError: If a non-blank row does not have 2 or 3 tab-separated fields.
    """
    # All inputs are opened before the output so that a missing input file
    # fails before the output file is created or truncated.
    with open(train_file, 'r', encoding='utf-8') as train, \
            open(dev_file, 'r', encoding='utf-8') as dev, \
            open(test_file, 'r', encoding='utf-8') as test, \
            open(output_file, 'w', encoding='utf-8') as out:
        for src in (train, dev, test):
            _write_char_separated(src, out)
    return
def Vectorize_training(input_file_path, output_model_path, output_w2v_path):
    """Train a CBOW word2vec model on a tokenised corpus and save the results.

    The corpus file must contain one sentence per line with tokens separated
    by spaces, for example::

        大 家 觉 得 她 好 看 吗
        求 秋 色 之 空 漫 画 全 集

    Args:
        input_file_path: Path of the UTF-8 corpus file.
        output_model_path: Where the full gensim model is saved
            (reloadable with ``Word2Vec.load``).
        output_w2v_path: Where the vectors alone are saved in word2vec format.
    """
    import gensim  # local import: only needed for the version check below

    # gensim 4.0 renamed the ``size`` keyword to ``vector_size``; pick the
    # spelling the installed version understands so both keep working.
    gensim_major = int(gensim.__version__.split('.')[0])
    dim_kwarg = 'vector_size' if gensim_major >= 4 else 'size'

    # The constructor builds the vocabulary and runs all training epochs, so
    # the corpus file only has to stay open for the duration of this call.
    with open(input_file_path, 'r', encoding='utf-8') as corpus:
        model = Word2Vec(
            LineSentence(corpus),
            sg=0,          # 0 selects the CBOW architecture
            window=5,
            min_count=1,   # keep every token, even those seen only once
            workers=2,     # number of training threads
            **{dim_kwarg: 300}  # dimensionality of the word vectors
        )

    model.save(output_model_path)

    # No ``binary=True`` is passed, so this writes gensim's default text
    # format: a "<vocab_size> <dim>" header line followed by one
    # "<token> <v1> <v2> ..." line per token, e.g.
    #   2032 300
    #   [PAD] 0.02861733 -0.08838269 -0.053209875 -0.07564939 -0.083910674
    model.wv.save_word2vec_format(output_w2v_path)
if __name__ == '__main__':
    # Corpus preparation step, left disabled here; uncomment and supply the
    # four paths to regenerate the corpus file:
    # get_total_text_file(train_file, dev_file, test_file, output_file)
    input_file_path = './data/比赛数据集/new_total_train_w2v.tsv'
    output_model_path = './data/比赛数据集/sim_text_comp.w2v_model'
    output_w2v_path = './data/比赛数据集/sim_text_comp_word2vec.bin'
    Vectorize_training(input_file_path, output_model_path, output_w2v_path)

    # Reload the model that was just saved as a sanity check.
    model = Word2Vec.load(output_model_path)
    # Vectors are looked up through ``model.wv``; indexing the model object
    # itself is deprecated in gensim 3.x and removed in gensim 4.0.
    print(model.wv['湖'].shape)  # (300,)
    print(model.wv.vectors.shape)  # (vocab_size, 300); the author's run gave (1874, 300)
    # To list the vocabulary, iterate ``model.wv.index2word`` (a list attribute,
    # renamed ``index_to_key`` in gensim 4.0) and read each vector with
    # ``model.wv[word]``.
原文:https://www.cnblogs.com/lyiheng/p/14472867.html