
Generating a JD automatically from a job title

Posted: 2015-10-13 17:01:58

The code is largely self-explanatory, so I won't belabor it. In short: clean the crawled JD sentences, vectorize them with TF-IDF, cluster them with KMeans, and keep one representative sentence per cluster, ordered by a simple heuristic score.

Text clustering: cluster.py

#!/usr/bin/env python
# coding=utf-8

import jieba,re
from gensim import corpora,models
from sklearn.cluster import KMeans
import sys
reload(sys)
sys.setdefaultencoding('utf-8')


class MyCorpus(object):
    """Stream jieba-tokenized lines from a text file (defined here but not used below)."""
    def __init__(self,fname):
        self.fname = fname

    def __iter__(self):
        for line in open(self.fname):
            yield jieba.cut(line,cut_all=False)


class MyCluster(object):

    def __init__(self):
        # matches any char that is not CJK, a letter, or a digit (\u9fa5 is the standard CJK upper bound)
        self.CLEAN = re.compile(ur"[^\u4e00-\u9fa5A-Za-z0-9]")
        self.dictionary = {}
        self.corpus = []

    
    def gen_dataset(self,documents):
        """Turn documents into dense tf-idf vectors."""
        self.gen_corpus(documents)
        res = [self.doc2vec(doc) for doc in documents]
        return res


    def gen_corpus(self,documents):
        """Build the dictionary, bag-of-words corpus and tf-idf model from the documents."""
        texts = [ list(jieba.cut(doc)) for doc in documents ]
        self.dictionary = corpora.Dictionary(texts)
        self.corpus = [self.dictionary.doc2bow(text) for text in texts]
        self.tfidf = models.TfidfModel(self.corpus)


    def doc2vec(self,doc):
        """Map a document to a dense tf-idf vector over the dictionary."""
        vec = self.dictionary.doc2bow(list(jieba.cut(doc)))
        vec = self.tfidf[vec]
        wordlist = [0.0] * len(self.dictionary)
        for w in vec:
            wordlist[w[0]] = w[1]
        return wordlist
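    # e.g. with a 4-word dictionary, doc2vec(u"熟悉 python") might return
    # [0.0, 0.71, 0.0, 0.71] -- hypothetical tf-idf weights, zeros elsewhere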
            

    def kcluster(self,texts,k=3):
        """KMeans-cluster texts into k groups; return one random sentence per cluster."""
        from random import shuffle
        data = self.gen_dataset(texts)
        data = [ map(lambda x:round(x,5),line) for line in data ]
        km = KMeans(n_clusters=k,init='k-means++',max_iter=200,n_init=1,verbose=True)
        km.fit(data)
        labels = km.labels_
        flag = [0]*len(labels)
        randomtext = zip(labels,texts)
        shuffle(randomtext)
        res = []
        for d in randomtext:
            if flag[d[0]]==0:
                res.append(d[1])
                flag[d[0]] = 1

        return res


if __name__ == "__main__":
    texts = [ line for line in open('data/python.db') ]
    test = MyCluster()
    res = test.kcluster(texts,k=4)

    print "\n".join(res)
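
For a quick smoke test, here is a minimal usage sketch of MyCluster on a few in-memory sentences, so it runs without data/python.db (the sentences and k are made up):

#!/usr/bin/env python
# coding=utf-8

from cluster import MyCluster

sents = [u"熟悉python开发", u"精通python编程", u"熟悉mysql数据库", u"了解linux系统"]
km = MyCluster()
# one (randomly chosen) representative sentence per cluster
for sent in km.kcluster(sents, k=2):
    print sent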

Main generator: auto_gen_jd.py

#!/usr/bin/env python
# coding=utf-8

import sys,os
import simplejson as json
import codecs
# from snownlp import SnowNLP
from simhash import Simhash
# from bosonnlp import BosonNLP
from cluster import MyCluster
from jd_parser import JdParser
import re
reload(sys)
sys.setdefaultencoding('utf-8')

class AutoGenJD(object):
    '''Auto-generate a JD: given a job title and a sentence count, output a job description and requirements.'''

    def __init__(self):
        # leading list numbers like "1." / "(2)" and digit+bracket residue
        self.CLEAR_NUM = re.compile(u"^\d+[\.、::]|^[\(\(]\d+[\)\)\.]?|\d\s*[\))】]")
        # stray leading punctuation runs and trailing periods/colons/semicolons
        self.CLEAR_COLO = re.compile(u"^[。\.)(【】]\S+|[\.;:;。]$")
        self.jd_database = json.load(codecs.open('data/lagou_jd_clean.json'))
    #   self.jobname = [ jobname[:-3] for jobname in os.listdir("data") if jobname.endswith(".db") ]
        self.jobname = self.jd_database.keys()
    #   self.bosonnlp = BosonNLP('UYTG1Csb.3652.5pZ2otkIncEn')
        self.jdparser = JdParser()
        self.km = MyCluster()

    def load_json_data(self,fname="../preprocess/data/mini_jd.json",arg1=None,arg2=None):
        for line in codecs.open(fname):
            try:
                data = json.loads(line)
            except Exception,e:
                print e
                continue
            if data.get(arg1,False) != False and data[arg1].has_key("job_title") and data[arg1].has_key("job_description"):
                if len(data[arg1]["job_title"])<2 or len(data[arg1]["job_title"])>16:
                    continue
                else:
                    fw = codecs.open('./data/'+data[arg1][arg2]+'.txt','w','utf-8')
                    fw.write(data[arg1]["job_description"].strip()+"\n\n")
                    print "writing...",data[arg1][arg2]
    
    # clean the data: strip list numbers and similar noise
    def clean_jd(self,fname="./data/java.txt"):
        clean_sents = set()
        with codecs.open(fname,'r','utf-8') as fr:
            for line in fr:
                line = self.CLEAR_NUM.sub("",line.strip())
                line = self.CLEAR_COLO.sub("",line.strip())
                if len(line)>2:
                    clean_sents.add(line.strip())
        with codecs.open(fname[:-3]+"db",'w','utf-8') as fw:
            for line in clean_sents:
                fw.write(line+'\n')
        return clean_sents
   
    def is_most_english(self,line):
        # True when more than 70% of the characters are ASCII letters
        en_word = [ uchar for uchar in line if (uchar>=u'\u0041' and uchar<=u'\u005a') or (uchar>=u'\u0061' and uchar<=u'\u007a') ]
        return float(len(en_word))/len(line)>0.7

    def clean_jd2(self,jdstr):
        """
        Clean the data: strip leading/trailing punctuation and list numbers from each
        sentence; drop lines that are too short, too long, or mostly English.
        """
        res = set()
        for line in jdstr.split("\n"):
            line = line.strip()
            if len(line)<12:
                print "line",line    # debug: peek at suspiciously short lines
            if re.search(u"[;\.;。]\d+|\d?[,,、::\.]$|^\d\s{0,1}[\u4e00-\u9fa5]",line) or len(line)<8 or len(line)>32:continue
            if self.is_most_english(line):continue
            line = self.CLEAR_NUM.sub("",line)
            line = self.CLEAR_COLO.sub("",line)
            res.add(line)
        return res
        

    # find the known job title closest to the user's input (by simhash distance)
    def get_closet_job(self,jobname="java"):
        dis = [ (other,Simhash(jobname).distance(Simhash(other))) for other in self.jobname ]
        sorteddis = sorted(dis,key = lambda x:x[1])
        for k,v in sorteddis[:5]:
            print k,v
        return sorteddis[0][0]
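    # e.g. with self.jobname containing [u"java", u"python", u"php"],
    # get_closet_job(u"pythn") should return u"python" (smallest simhash distance)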
    
    # clamp the requested number of JD sentences to [1, 20]
    def norm_jd_num(self,num):
        if num<1:
            num=1
        elif num>20:
            num = 20
        return num


    # build a JD from a job title and a sentence count
    def get_jd_with_snownlp(self,jobname="java",num=5):
        # NOTE: needs the snownlp import that is commented out at the top
        jobname = self.get_closet_job(jobname)
      #  with open("./data/"+jobname+".db") as fr:
      #      s = SnowNLP(fr.read())
      #      return s.summary(num)
        jdstr = self.clean_jd2(self.jd_database[jobname])
        s = SnowNLP(u"\n".join(jdstr))    # clean_jd2 returns a set of sentences
        return s.summary(num)

    def get_jd_with_bosonnlp(self,jobname="java",num=5):
        # NOTE: needs the BosonNLP client that is commented out in __init__
        res = set()
        jobname = self.get_closet_job(jobname)
        jdstr = list(self.clean_jd2(self.jd_database[jobname]))[:80]    # sets don't support slicing
        all_cluster = self.bosonnlp.cluster(jdstr)
        sort_all_cluster = sorted(all_cluster,key = lambda x:x['num'],reverse=True)
        for idx,cluster in enumerate(sort_all_cluster):
            print idx+1,cluster['_id']
            res.add(jdstr[cluster['_id']])
        return res


    def _get_sent_score(self,line):
        """
        Score a sentence for the final ordering: the lower the score, the earlier it sorts.
        """
        s = len(line)+100
        if re.search(u"男|女|男女不限|性别|岁",line):
            s -= 60
        if re.search(u"学历|专业|\d+[kK元]",line):
            s -= 40
        if re.search(u"经验",line):
            s -= 20
        return s
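    # e.g. a 10-char line that mentions 学历 scores 10+100-40 = 70 and sorts
    # ahead of a plain 10-char line scoring 110 (hypothetical lines)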
            

    def get_jd_with_kmeans(self,jobname='python',num=6):
        """
        Cluster the sentences with KMeans; each cluster contributes only one sentence.
        """
        jobname = self.get_closet_job(jobname)
        jdstr = list(self.clean_jd2(self.jd_database[jobname]))    # clean_jd2 returns a set
        print "jdstr",len(jdstr)    # debug: how many sentences survived cleaning
        print self.jd_database[jobname]    # debug: raw JD text

        if len(jdstr)<int(num):
            num = len(jdstr)
        res = self.km.kcluster(jdstr,k=int(num))
        return sorted(res,cmp=lambda x,y:self._get_sent_score(x)-self._get_sent_score(y))


    def jd_parser(self,jdstr):
        result = self.jdparser.parser(jdstr) 
        return result

if __name__ == "__main__":

    test = AutoGenJD()
    jobname = sys.argv[1]
    jdnum = int(sys.argv[2])
    print "job name:",jobname
    print "demand:"
    demand = test.get_jd_with_kmeans(jobname,jdnum)
    for i,jdstr in enumerate(demand):
        print "%d. %s" %(i+1,jdstr)


Original post: http://www.cnblogs.com/jkmiao/p/4874803.html
