首页 > 其他 > 详细

统计文本英文单词总个数,并列出每个单词的个数

时间:2014-02-27 12:44:40      阅读:538      评论:0      收藏:0      [点我收藏+]
package test;
/*
 * Task :统计文本英文单词总个数,并列出每个单词的个数
 *
 * Date:2014.02.26
 *
 *Author:璀若星辰
 * */
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Counts the English words in a text file and prints a frequency report.
 *
 * <p>A word is a maximal run of ASCII letters ({@code [a-zA-Z]+}). Text is
 * lower-cased before matching, so {@code "The"} and {@code "the"} are counted
 * as the same word.
 */
public class IO_Word {

    /** Matches one word: a maximal run of ASCII letters. Compiled once and reused. */
    private static final Pattern WORD = Pattern.compile("[a-zA-Z]+");

    /** Charset used to decode the input file. */
    private static final String FILE_CHARSET = "gb2312";

    /** File analysed by {@link #main} when no path is given on the command line. */
    private static final String DEFAULT_PATH = "D:/abc.txt";

    /**
     * Reads the file at {@code str}, prints the total word count and the
     * per-word counts to standard output, and returns the file's lines.
     *
     * <p>The per-word listing is ordered by descending count; words with the
     * same count appear in alphabetical order.
     *
     * @param str path of the text file to analyse
     * @return the lines of the file, in file order, without line terminators
     * @throws Exception if the file cannot be opened, decoded or read
     */
    public static List<String> Io_word(String str) throws Exception {
        List<String> all = readLines(new File(str));
        Map<String, Integer> counts = new TreeMap<String, Integer>();
        int n = countWords(all, counts);
        printReport(n, counts);
        return all;
    }

    /** Reads every line of {@code file}, decoding it with {@link #FILE_CHARSET}. */
    private static List<String> readLines(File file) throws Exception {
        FileInputStream fis = new FileInputStream(file);
        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(fis, FILE_CHARSET));
            try {
                List<String> lines = new ArrayList<String>();
                for (String line = br.readLine(); line != null; line = br.readLine()) {
                    lines.add(line);
                }
                return lines;
            } finally {
                br.close();
            }
        } finally {
            // Also covers the case where the reader could not be constructed
            // (e.g. unsupported charset); closing the stream twice is harmless.
            fis.close();
        }
    }

    /** Adds each word's occurrences in {@code lines} to {@code counts}; returns the total number of words seen. */
    private static int countWords(List<String> lines, Map<String, Integer> counts) {
        int total = 0;
        for (String line : lines) {
            // Locale.ROOT keeps the mapping locale-independent: under a Turkish
            // default locale "I" would lower-case to dotless "ı", which the
            // ASCII-only pattern does not match, splitting words apart.
            Matcher matcher = WORD.matcher(line.toLowerCase(Locale.ROOT));
            while (matcher.find()) {
                String word = matcher.group();
                Integer seen = counts.get(word);
                counts.put(word, seen == null ? 1 : seen + 1);
                total++;
            }
        }
        return total;
    }

    /** Prints the summary lines followed by one {@code word-count} line per distinct word. */
    private static void printReport(int total, Map<String, Integer> counts) {
        System.out.println("统计分析如下:");
        System.out.println("txt文章中单词总数" + total + "个");
        List<Map.Entry<String, Integer>> list =
                new ArrayList<Map.Entry<String, Integer>>(counts.entrySet());
        System.out.println("list=" + list.size());
        // Collections.sort is stable, so entries with equal counts keep the
        // alphabetical order they had in the sorted map.
        Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() {
            @Override
            public int compare(Map.Entry<String, Integer> zj, Map.Entry<String, Integer> zz) {
                // Descending by count; compareTo avoids subtraction-based comparison.
                return zz.getValue().compareTo(zj.getValue());
            }
        });
        for (Entry<String, Integer> entry : list) {
            System.out.println(entry.getKey() + "-" + entry.getValue());
        }
    }

    /**
     * Analyses the file named by the first command-line argument, or
     * {@code D:/abc.txt} when no argument is given.
     *
     * @param args optional: path of the file to analyse
     */
    public static void main(String[] args) {
        String path = args.length > 0 ? args[0] : DEFAULT_PATH;
        try {
            IO_Word.Io_word(path);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
 

 运行结果如下:

bubuko.com,布布扣

统计文本英文单词总个数,并列出每个单词的个数,布布扣,bubuko.com

统计文本英文单词总个数,并列出每个单词的个数

原文:http://blog.csdn.net/u012631267/article/details/19975307

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!