Yesterday I spent some time studying the Stanford Parser, with the idea of combining its intelligent word segmentation with a Lucene analyzer. Because the project schedule was tight, part of the work is unfinished and the code still has bugs; I hope readers interested in this direction can improve on it.
Lucene version: 4.10.3. Required jars: stanford-parser-3.3.0-models.jar and stanford-parser.jar.
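Before wiring the parser into Lucene, it can help to check what the Stanford model produces on its own. The sketch below is only a sanity check under the same setup: the model path and the lp.parse(String) call are the ones used by the tokenizer later in this post, and the class name StanfordSegmentCheck is made up for illustration. Each leaf of the returned parse tree is one segmented word.

package main.test;

import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.trees.Tree;

public class StanfordSegmentCheck {
    public static void main(String[] args) {
        // same Chinese factored-segmenting model that the tokenizer below loads
        LexicalizedParser lp = LexicalizedParser.loadModel(
                "edu/stanford/nlp/models/lexparser/xinhuaFactoredSegmenting.ser.gz");
        // the model segments the raw (unsegmented) Chinese sentence while parsing it
        Tree tree = lp.parse("清华大学生说正在研究生命起源");
        // each leaf of the parse tree is one segmented word
        for (Tree leaf : tree.getLeaves()) {
            System.out.print(leaf.value() + " / ");
        }
        System.out.println();
    }
}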
First, build a test class for the analyzer; the code is as follows:
package main.test;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class AnalyzerTest {

    public static void analyzer(Analyzer analyzer, String text) {
        try {
            System.out.println("Analyzer class: " + analyzer.getClass());
            // obtain the TokenStream from the analyzer
            TokenStream tokenStream = analyzer.tokenStream("", new StringReader(text));
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                CharTermAttribute cta1 = tokenStream.getAttribute(CharTermAttribute.class);
                OffsetAttribute ofa = tokenStream.getAttribute(OffsetAttribute.class);
                // the position-increment attribute stores the distance between terms
                // PositionIncrementAttribute pia = tokenStream.getAttribute(PositionIncrementAttribute.class);
                // System.out.print(pia.getPositionIncrement() + ":");
                System.out.print("[" + ofa.startOffset() + "-" + ofa.endOffset() + "]-->" + cta1.toString() + "\n");
            }
            tokenStream.end();
            tokenStream.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        String chText = "清华大学生说正在研究生命起源";
        Analyzer analyzer = new NlpHhcAnalyzer();
        analyzer(analyzer, chText);
    }
}
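One detail worth noting in the test class: since Lucene 4.x the TokenStream contract is strict about call order. reset() must be called before the first incrementToken(), and end()/close() afterwards; if reset() is skipped, the tokenizer refuses to read from its input and throws an exception complaining about a contract violation. The loop above then prints one line per term in the form [startOffset-endOffset]-->term.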
Next, define a new analyzer by extending the Analyzer class and overriding its createComponents method, which returns a TokenStreamComponents object. Note that in Lucene 4.x, TokenStreamComponents packages up, as a single component, what Lucene 3.x exposed separately as the tokenizer and its filters.
package main.test;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;

public class NlpHhcAnalyzer extends Analyzer {

    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        // wrap the custom Stanford-based tokenizer; no token filters are chained here
        return new TokenStreamComponents(new aaa(reader));
    }
}
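If token filters are needed later (for example to drop common Chinese stop words), the two-argument TokenStreamComponents constructor takes both the source tokenizer and the end of the filter chain, which is exactly the tokenizer-plus-filter composition described above. The following is only a rough sketch: the stop-word list is made up for illustration, and I have not verified the exact Version constant name, so Version.LUCENE_4_10_3 is assumed to match the 4.10.3 jar used here.

package main.test;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class NlpHhcStopAnalyzer extends Analyzer {

    // illustrative stop-word list; a real list would be much larger
    private static final CharArraySet STOP_WORDS =
            StopFilter.makeStopSet(Version.LUCENE_4_10_3, "的", "了", "和");

    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new aaa(reader);
        // the second argument is the end of the filter chain built on top of the tokenizer
        TokenStream filtered = new StopFilter(Version.LUCENE_4_10_3, source, STOP_WORDS);
        return new TokenStreamComponents(source, filtered);
    }
}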
Then implement a new Tokenizer class, aaa. This part still has rough edges (in particular, the offsets are only approximate because the dependency order does not follow the original text order), and I have not had time to debug and study it further; anyone with time is welcome to improve it.
package main.test;

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.AttributeFactory;

import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TypedDependency;
import edu.stanford.nlp.trees.international.pennchinese.ChineseGrammaticalStructure;

public class aaa extends Tokenizer {

    // term text attribute
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    // term offset attribute
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    private final LexicalizedParser lp;
    // terms produced by the parser for the current input
    private final List<String> terms = new ArrayList<String>();
    private int index = 0;
    private int currentOffset = 0;

    public aaa(Reader in) {
        super(in);
        // loading the model is expensive; ideally a single parser instance would be shared
        String modelpath = "edu/stanford/nlp/models/lexparser/xinhuaFactoredSegmenting.ser.gz";
        lp = LexicalizedParser.loadModel(modelpath);
    }

    protected aaa(AttributeFactory factory, Reader input) {
        super(factory, input);
        String modelpath = "edu/stanford/nlp/models/lexparser/xinhuaFactoredSegmenting.ser.gz";
        lp = LexicalizedParser.loadModel(modelpath);
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        // read the whole input (the earlier version read a fixed 100 chars, which breaks on shorter or longer text)
        StringBuilder sb = new StringBuilder();
        int c;
        while ((c = input.read()) != -1) {
            sb.append((char) c);
        }
        String str = sb.toString();

        terms.clear();
        index = 0;
        currentOffset = 0;
        if (str.trim().isEmpty()) {
            return;
        }
        // parse once per input, then emit one term per typed dependency's dependent word
        Tree t = lp.parse(str);
        ChineseGrammaticalStructure gs = new ChineseGrammaticalStructure(t);
        Collection<TypedDependency> tdl = gs.typedDependenciesCollapsed();
        for (TypedDependency td : tdl) {
            String term = td.dep().nodeString().trim();
            if (term.length() > 0) {
                terms.add(term);
            }
        }
    }

    @Override
    public boolean incrementToken() throws IOException {
        // all terms emitted: signal the end of the stream
        if (index >= terms.size()) {
            return false;
        }
        // clear all term attributes before setting the next term
        clearAttributes();
        String term = terms.get(index);
        // set the term text
        termAtt.append(term);
        // offsets are approximate: dependency order does not necessarily follow the original text order
        offsetAtt.setOffset(currentOffset, currentOffset + term.length());
        currentOffset += term.length();
        index++;
        // return true to signal that another term is available
        return true;
    }
}
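Finally, for completeness, this is roughly how the analyzer would be dropped into indexing. It is only a minimal sketch assuming lucene-core 4.10.3 on the classpath: the in-memory RAMDirectory, the field name "content", and the class name are all chosen just for illustration, and as before the exact Version constant name is assumed.

package main.test;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class IndexWithNlpHhcAnalyzer {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new NlpHhcAnalyzer();
        Directory dir = new RAMDirectory();
        // the analyzer configured here is applied to all tokenized fields at index time
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_10_3, analyzer);
        IndexWriter writer = new IndexWriter(dir, config);

        Document doc = new Document();
        // TextField is tokenized, so the Stanford-based tokenizer runs on this value
        doc.add(new TextField("content", "清华大学生说正在研究生命起源", Field.Store.YES));
        writer.addDocument(doc);
        writer.close();
        dir.close();
    }
}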
Original post: http://blog.csdn.net/hu948162999/article/details/44648093