使用jdk 1.7的新api来统计jdk中英语单词的分布情况

时间：2015-03-25 21:40:09 阅读：380 评论：0 收藏：0 [点我收藏+]

哈哈，你想知道jdk源码中的英语单词分布情况么？下面我们介绍一种jdk1.7的新api来递归遍历文件夹。这个新的api位于java.nio.file.*下面，里面添加了更加符合语义的Path，也就是路径这个对象。然后为了更好地遍历目录，提供了一种访问者设计模式来递归访问目录。主要的框架代码如下：

<span style="font-size:18px;">public static class PrintFiles
    extends SimpleFileVisitor<Path> {

    /**
     * Reports the kind of the visited entry (symbolic link, regular
     * file or other) followed by its size in bytes.
     */
    @Override
    public FileVisitResult visitFile(Path file,
                                   BasicFileAttributes attr) {
        // Choose the message template first, then print once.
        final String template;
        if (attr.isSymbolicLink()) {
            template = "Symbolic link: %s ";
        } else if (attr.isRegularFile()) {
            template = "Regular file: %s ";
        } else {
            template = "Other: %s ";
        }
        System.out.format(template, file);
        System.out.println("(" + attr.size() + "bytes)");
        return CONTINUE;
    }

    /**
     * Reports each directory once all of its entries have been visited.
     */
    @Override
    public FileVisitResult postVisitDirectory(Path dir,
                                          IOException exc) {
        System.out.printf("Directory: %s%n", dir);
        return CONTINUE;
    }

    /**
     * Reports a file that could not be accessed and keeps walking.
     * Without this override the inherited implementation would
     * rethrow the IOException and abort the walk.
     */
    @Override
    public FileVisitResult visitFileFailed(Path file,
                                       IOException exc) {
        System.err.println(exc);
        return CONTINUE;
    }
}</span>

这个接口主要有三个方法，访问文件、访问目录、访问失败。很明显，我们只需要继承该类，然后实现相关业务逻辑即可。当需要遍历某个目录时，直接调用Files.walkFileTree(startingDir, SimpleFileVisitor)方法即可。其中上面的例子什么也不做，只是简单的打印出一些相关信息。

进入正题，怎么统计JDK源码中的英语单词分布呢？下面先分解下我们的任务。在这里面，需要一个类用于解析一个.java文件，并且将里面的单词都抽取出来，好吧，该类不难，无非就是读取类文件内容，然后使用相关的方式，将这些杂乱的内容变成单词即可。为了更加容易管理，这里将单词也抽象为一个类，具体代码如下：

package net.itaem.luohong.honghong;

/**
 * A single word extracted from a source file.
 *
 * Two words are equal when their text is equal. The natural ordering is the
 * ordering of the underlying strings, with a {@code null} text sorting before
 * every non-null text, and is consistent with {@link #equals(Object)} so that
 * instances can safely be used as keys of a sorted map.
 */
public class Word implements Comparable<Word> {

	private final String word;

	/**
	 * @param word the text of the word; may be {@code null}
	 */
	public Word(String word){
		this.word = word;
	}

	@Override
	public int hashCode() {
		final int prime = 31;
		int result = 1;
		result = prime * result + ((word == null) ? 0 : word.hashCode());
		return result;
	}

	@Override
	public boolean equals(Object obj) {
		if (this == obj)
			return true;
		if (obj == null)
			return false;

		if (getClass() != obj.getClass())
			return false;

		Word other = (Word) obj;
		if (word == null) {
			return other.word == null;
		}
		return word.equals(other.word);
	}

	@Override
	public String toString() {
		return word;
	}

	/**
	 * @return the text of this word; may be {@code null}
	 */
	public String getWord(){
		return word;
	}

	/**
	 * Orders words by their text; a {@code null} text sorts first.
	 *
	 * @param o the word to compare with
	 * @return a negative value, zero or a positive value as this word sorts
	 *         before, equal to or after {@code o}
	 */
	@Override
	public int compareTo(Word o) {
		String mine = this.getWord();
		String theirs = o.getWord();

		if(mine == null){
			// Two null texts must compare as equal, otherwise x.compareTo(x) != 0
			// and a sorted map would keep several "identical" keys.
			return (theirs == null) ? 0 : -1;
		}

		if(theirs == null){
			return 1;
		}

		return mine.compareTo(theirs);
	}
}

package net.itaem.luohong.honghong;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

/**
 * 读取一个文件的内容
 * */
/**
 * Reads a text file and breaks its content into {@link Word}s.
 *
 * The file is split on whitespace; every token is then stripped of
 * surrounding punctuation, operators and HTML/Javadoc markup, and tokens
 * that still contain a separator such as '.' or '/' are split further.
 */
public class WordReader {

	/**
	 * Extracts the words contained in a file.
	 *
	 * @param path path of the file to read
	 * @return the words found, in order of appearance. I/O errors are printed
	 *         to standard error and whatever was read before the error is used.
	 */
	public static List<Word> wordInFile(String path){
		return stringToWords(read(path));
	}

	/**
	 * Reads the whole file into one string, one '\n' after every line.
	 *
	 * @param path path of the file to read
	 * @return the file content, or what could be read before an I/O error
	 */
	private static String read(String path){
		StringBuilder sb = new StringBuilder();
		// try-with-resources closes the reader even when readLine() fails.
		// The charset is fixed so that results do not depend on the platform default.
		try (BufferedReader reader = new BufferedReader(
				new InputStreamReader(new FileInputStream(new File(path)), "UTF-8"))) {
			String line;
			while((line = reader.readLine()) != null){
				// readLine() drops the line terminator; put one back so the last
				// token of a line is not glued to the first token of the next.
				sb.append(line).append('\n');
			}
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
		return sb.toString();
	}

	/**
	 * Turns a text into a list of words.
	 *
	 * @param source the text to tokenize
	 * @return the cleaned words, in order of appearance
	 */
	private static List<Word> stringToWords(String source){
		List<Word> wordList = new ArrayList<Word>();
		StringTokenizer strToken = new StringTokenizer(source);

		while(strToken.hasMoreTokens()){
			String wordStr = strToken.nextToken();
			if(isInvalidString(wordStr)){
				continue;
			}

			// Clean twice: removing one layer of decoration can expose another.
			wordStr = removeInvalidStr(wordStr);
			wordStr = removeInvalidStr(wordStr);
			if(wordStr == null || "".equals(wordStr)){
				continue;
			}

			if(needSplitToWord(wordStr)){
				recurSplitToWord(wordStr, wordList);
			}else{
				addCleanedWord(wordStr, wordList);
			}
		}
		return wordList;
	}

	/**
	 * Cleans a candidate and appends it to the list unless cleaning rejects it.
	 * Rejected candidates (operators, numbers, nothing left) are skipped so the
	 * list never contains a word whose text is null or empty.
	 *
	 * @param candidate the token to clean
	 * @param wordList the list receiving the word
	 */
	private static void addCleanedWord(String candidate, List<Word> wordList){
		String cleaned = removeInvalidStr(candidate);
		if(cleaned != null && !"".equals(cleaned)){
			wordList.add(new Word(cleaned));
		}
	}

	/**
	 * Tells whether a token contains a separator and must be split into several words.
	 *
	 * @param wordStr the token to inspect
	 * @return true when the token contains one of . / - # ] , ( [ &gt; &lt; =
	 */
	private static boolean needSplitToWord(String wordStr){
		if(wordStr == null || "".equals(wordStr)) return false;

		return wordStr.contains(".") || wordStr.contains("/") || wordStr.contains("-")
				|| wordStr.contains("#") || wordStr.contains("]")
				|| wordStr.contains(",") || wordStr.contains("(")
				|| wordStr.contains("[") || wordStr.contains(">")
				|| wordStr.contains("<") || wordStr.contains("=");
	}

	/**
	 * Recursively splits a token into words.
	 * A token may contain several different separators, and each call splits
	 * on only one of them, hence the recursion on the resulting pieces.
	 *
	 * @param wordStr the token to split
	 * @param wordList the list receiving the words
	 */
	private static void recurSplitToWord(String wordStr, List<Word> wordList){
		if(wordStr == null || !needSplitToWord(wordStr)) return;

		for(String word: splitToWords0(wordStr)){
			if(needSplitToWord(word)){
				recurSplitToWord(word, wordList);
			}else{
				addCleanedWord(word, wordList);
			}
		}
	}

	/**
	 * Splits a token on the first separator kind it contains, tested in the
	 * order . / - # ] [ , ( &gt; &lt; =
	 *
	 * @param wordStr the token to split
	 * @return the pieces, or null when the token contains none of the separators
	 */
	private static String[] splitToWords0(String wordStr){
		String[] words = null;
		if(wordStr.contains(".")){
			words = wordStr.split("\\.");
		}else if(wordStr.contains("/")){
			words = wordStr.split("/");
		}else if(wordStr.contains("-")){
			words = wordStr.split("-");
		}else if(wordStr.contains("#")){
			words = wordStr.split("#");
		}else if(wordStr.contains("]")){
			words = wordStr.split("]");
		}else if(wordStr.contains("[")){
			words = wordStr.split("\\[");
		}else if(wordStr.contains(",")){
			words = wordStr.split(",");
		}else if(wordStr.contains("(")){
			words = wordStr.split("\\(");
		}else if(wordStr.contains(">")){
			words = wordStr.split(">");
		}else if(wordStr.contains("<")){
			words = wordStr.split("<");
		}else if(wordStr.contains("=")){
			words = wordStr.split("=");
		}

		return words;
	}

	/**
	 * Tells whether a token is not a word at all: null, empty, a bare operator
	 * or comment delimiter, or a run of digits.
	 *
	 * @param str the token to inspect
	 * @return true when the token must be discarded
	 */
	private static boolean isInvalidString(String str){
		return str == null || str.equals("") || str.equals("*") || str.equals("{") ||str.equals("}")  || str.equals("+")  ||str.equals("-")  || str.equals("/")
				|| str.equals("=") || str.equals("!=") || str.equals(">") || str.equals("<") || str.equals(">=") || str.equals("<=")
				|| str.equals("||") || str.equals("}}") || str.equals("/*") || str.equals("};") || str.equals("+=") || str.matches("\\d+")
				|| str.equals(".") || str.equals(":") || str.equals("/**") || str.equals("//") || str.endsWith("==") || str.equals("?")
				// intended to drop HTML-escaped expression fragments such as "o==null ? get"
				|| (str.contains("==") && str.contains("&") && str.contains(";"));
	}

	/**
	 * Tells whether a string is an optionally signed integer or decimal number.
	 * The pattern also accepts the empty string and a lone sign.
	 */
	private static boolean isNumber(String str){
		return str.matches("^[-+]?(([0-9]+)([.]([0-9]+))?|([.]([0-9]+))?)$");
	}

	/** Returns {@code s} without {@code prefix} when it starts with it, otherwise {@code s}. */
	private static String stripPrefix(String s, String prefix){
		return s.startsWith(prefix) ? s.substring(prefix.length()) : s;
	}

	/** Returns {@code s} without {@code suffix} when it ends with it, otherwise {@code s}. */
	private static String stripSuffix(String s, String suffix){
		return s.endsWith(suffix) ? s.substring(0, s.length() - suffix.length()) : s;
	}

	/**
	 * Strips punctuation, operators and markup surrounding a token.
	 * The rules are applied once each, in a fixed order; callers that need a
	 * deeper clean call this method again on the result.
	 *
	 * @param wordStr the token to clean
	 * @return the cleaned token, or null when the token is invalid or when
	 *         nothing but a number (or nothing at all) is left
	 */
	private static String removeInvalidStr(String wordStr){
		if(isInvalidString(wordStr)){
			return null;
		}

		// Trailing ';' and ',', trailing "()", then one opening and one closing parenthesis, trailing ':'.
		wordStr = stripSuffix(wordStr, ";");
		wordStr = stripSuffix(wordStr, ",");
		wordStr = stripSuffix(wordStr, "()");
		wordStr = stripPrefix(wordStr, "(");
		wordStr = stripSuffix(wordStr, ")");
		wordStr = stripSuffix(wordStr, ":");

		// Surrounding double quotes and a trailing '.'.
		wordStr = stripPrefix(wordStr, "\"");
		wordStr = stripSuffix(wordStr, "\"");
		wordStr = stripSuffix(wordStr, ".");

		// Comment delimiters: leading "*/", and the tail of e.g. "java.util;/**".
		wordStr = stripPrefix(wordStr, "*/");
		wordStr = stripSuffix(wordStr, ";/**");

		// Javadoc tag markers: leading "{@" and leading '@'.
		wordStr = stripPrefix(wordStr, "{@");
		wordStr = stripPrefix(wordStr, "@");

		// Tokens fully wrapped in angle brackets:
		// <tt>hello</tt> -> hello, <T> -> T, </pre> -> pre.
		if(wordStr.startsWith("<") && wordStr.endsWith(">")){
			if(!wordStr.contains("</")){  // form <T>
				wordStr = wordStr.substring(wordStr.indexOf("<") + 1, wordStr.lastIndexOf(">"));

			}else if(!wordStr.startsWith("</")){  // form <tt>hello</tt>
				int contentStart = wordStr.indexOf(">") + 1;
				int contentEnd = wordStr.lastIndexOf("</");
				// A malformed token such as "<a</b>" has no '>' before the closing
				// tag; leave it to the rules below instead of throwing.
				if(contentStart <= contentEnd){
					wordStr = wordStr.substring(contentStart, contentEnd);
				}

			}else{  // form </pre>
				wordStr = wordStr.substring(2, wordStr.lastIndexOf(">"));
			}
		}

		// Leading tag: "<li>time" -> "time".
		if(wordStr.contains("<") && wordStr.contains(">") && !wordStr.contains("/") && wordStr.startsWith("<")){
			wordStr = wordStr.substring(wordStr.lastIndexOf(">") + 1);
		}

		// Trailing opening tag: "time<li>" -> "time".
		if(wordStr.contains("<") && wordStr.contains(">") && !wordStr.contains("/") && wordStr.endsWith(">")){
			wordStr = wordStr.substring(0, wordStr.lastIndexOf("<"));
		}

		// Trailing closing tag: "time</li>" -> "time".
		if(wordStr.contains("</") && wordStr.contains(">")){
			wordStr = wordStr.substring(0, wordStr.lastIndexOf("</"));
		}

		// Stray angle brackets and braces at either end.
		wordStr = stripPrefix(wordStr, "<");
		wordStr = stripSuffix(wordStr, ">");
		wordStr = stripPrefix(wordStr, "{");
		wordStr = stripSuffix(wordStr, "}");

		// Leading "==" and trailing '='.
		wordStr = stripPrefix(wordStr, "==");
		wordStr = stripSuffix(wordStr, "=");

		// Trailing '{', array brackets at the end, then a leading ']'.
		wordStr = stripSuffix(wordStr, "{");
		wordStr = stripSuffix(wordStr, "[]");
		wordStr = stripSuffix(wordStr, "]");
		wordStr = stripSuffix(wordStr, "[");
		wordStr = stripPrefix(wordStr, "]");

		// Leading and trailing '+'.
		wordStr = stripPrefix(wordStr, "+");
		wordStr = stripSuffix(wordStr, "+");

		// Trailing "<?" left over from a wildcard generic.
		wordStr = stripSuffix(wordStr, "<?");

		// Any double quote still inside the token.
		wordStr = wordStr.replace("\"", "");

		// Numbers are not words; this also rejects a token that became empty.
		if(isNumber(wordStr)){
			return null;
		}

		return wordStr;
	}

}

当然，已经可以将每个文件的单词抽取出来了，下面就使用一个类汇总单词信息即可。这里面为了最后结果可以从a-z按照字典排序，使用了TreeMap来统计结果，不多说，见代码：

package net.itaem.luohong.honghong;

import java.util.List;
import java.util.Map;
import java.util.TreeMap;

/**
 * 用于统计每个单词出现的次数
 * 
 * @author luohong 846705189@qq.com
 * */
/**
 * Counts how many times each word occurs.
 * The counts are kept in a sorted map so that the result is printed in the
 * natural order of the words.
 *
 * @author luohong 846705189@qq.com
 */
public class WordCount {
	private Map<Word, Integer> wordCount;

	public WordCount(){
		wordCount = new TreeMap<Word, Integer>();
	}

	/**
	 * @return the number of distinct words counted so far
	 */
	public int size(){
		return wordCount.size();
	}

	/**
	 * Records one occurrence of a word.
	 *
	 * @param word the word to count
	 * @return the number of occurrences of the word, this one included
	 */
	public Integer count(Word word){
		Integer previous = wordCount.get(word);
		// Return the updated count: Map.put would hand back the value before
		// the update, which is null the first time a word is seen.
		Integer updated = (previous == null) ? 1 : previous + 1;
		wordCount.put(word, updated);
		return updated;
	}

	/**
	 * Records one occurrence of every word in the list.
	 *
	 * @param wordList the words to count
	 */
	public void count(List<Word> wordList){
		for(Word word: wordList){
			count(word);
		}
	}

	/**
	 * Prints every word with its count, one "word:count" pair per line,
	 * to standard output.
	 */
	public void printResult(){
		for(Map.Entry<Word, Integer> entry: wordCount.entrySet()){
			System.out.println(entry.getKey() + ":" + entry.getValue());
		}
	}
}

基本上整个架构代码就写完了，最后将这些业务逻辑添加到visitFile方法中即可。

<span style="font-size:18px;"><pre name="code" class="java">package net.itaem.luohong.honghong;
import static java.nio.file.FileVisitResult.CONTINUE;

import java.io.File;
import java.io.IOException;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.List;

/**
 * 递归遍历目录
 * 
 * */
public class PrintFiles extends SimpleFileVisitor<Path> {
    
	WordCount wordCount = new WordCount();
	
    // Print information about
    // each type of file.
    @Override
    public FileVisitResult visitFile(Path file,
                                   BasicFileAttributes attr) {
        
    	if (attr.isSymbolicLink()) {
            
        } else if (attr.isRegularFile()) {
            List<Word> words = WordReader.wordInFile(file.toString());
            System.out.println(file);
            for(Word word: words){
                wordCount.count(word);
            }
            //System.out.println(words);
        } else {
            //System.out.format("Other: %s ", file);
        }
    	
        return CONTINUE;
    }

    // Print each directory visited.
    @Override
    public FileVisitResult postVisitDirectory(Path dir,
                                          IOException exc) {
        //System.out.format("Directory: %s%n", dir);
        return CONTINUE;
    }

    // If there is some error accessing
    // the file, let the user know.
    // If you don't override this method
    // and an error occurs, an IOException 
    // is thrown.
    @Override
    public FileVisitResult visitFileFailed(Path file, IOException exc) {
        //System.err.println(exc);
        return CONTINUE;
    }
    
    public void count(String path){
    	Path startingDir = new File(path).toPath();
    	PrintFiles printFiles = new PrintFiles();
    	try {
			Files.walkFileTree(startingDir, printFiles);
			wordCount.printResult();
		} catch (IOException e) {
			e.printStackTrace();
		}
    }
    </span>

<span style="font-size:18px;">    
    public static void main(String[] args) {
    	String path = "E:\\jar集合\\java source code";
    	
    	System.out.println("enter the dictionary you want to count");
    	
    	path = new CommandReader().readCommand();
    	
    	new PrintFiles().count(path);
	}
    
    
}</span>

这里面使用了一个读取命令行的类，非常简单，如下

<span style="font-size:18px;">package net.itaem.luohong.honghong;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

/**
 * Reads commands typed on standard input.
 */
public class CommandReader{
	// One shared reader for the whole process. It is never closed: closing it
	// would close System.in and make every later read from stdin fail, and a
	// fresh BufferedReader per call could swallow input buffered by an earlier one.
	private static final BufferedReader STDIN = new BufferedReader(new InputStreamReader(System.in));

	public CommandReader(){

	}

	/**
	 * Reads one line from standard input.
	 *
	 * @return the line without its terminator, or null at end of input or
	 *         when reading fails (the error is printed to standard error)
	 */
	public String readCommand(){
		try {
			return STDIN.readLine();
		} catch (IOException e) {
			e.printStackTrace();
			return null;
		}
	}
}</span>

输出部分结果：

<span style="font-size:18px;">addExceptionDetailMessage:2
addFirst:2
addGuardedAction:2
addIDAtPosition:3
addIORComponentToProfileInternal:3
addInvocationHandler:3
addLast:14
addNodes:3
addObject:7
addPOA:2
addPoaInactive:9
addPoaManager:2
addPrefix:2
addRemoteContactInfos:2
addReply:12
addReplyServiceContextQueue:11
addServiceContext:3
addServiceContexts:2
addTaggedProfiles:3
addToErrorSet:7
addToIORTemplate:3
addToParser:2
addToProperties:2
addTypeCodeAtPosition:4
addWork:7
addXXX:1
add_client_request_interceptor:2
add_client_request_interceptor_with_policy:1
add_component:1
add_component_to_profile:1
add_exception:1
add_in_arg:1
add_inout_arg:1
add_ior_component:1
add_ior_component_to_profile:1
add_ior_interceptor:2
add_ior_interceptor_with_policy:1
add_item:4
add_named_in_arg:1
add_named_inout_arg:1
add_named_out_arg:1
add_out_arg:1
add_reply_service_context:2
add_request_service_context:1
add_server_request_interceptor:2
add_server_request_interceptor_with_policy:1
add_value:1
added:31
addedSerialOrExtern:2
adding:15
addition:5
additional:10
addr:29
addrDisp:12
addrDisposition:12
address:30
addresses:2
addressing:17
addressingDisposition:14
adds:9
adheres:1
adjust:3
admap:2
admin:1
advance:2
advancement:1
advancing:1
advertise:1
advertised:1
ae:10
af:1
affect:3
affected:1
affecting:2
affiliates:438
after:125
afterwards:8
again:23
against:10
againt:1
aggregate:1
agree:7
agreed:2
ahead:2
ai:1
aids:2
aka:1
alert:2
algorithm:11
alias:8
aliased:4
aliases:5
aliasing:2
align:50
alignAndCheck:15
alignAndReserve:16
alignIncr:2
alignOnBoundary:12
alignResult:2
aligned:17
alignment:43
alignmentForEncoding:2
alive:3
all:295
all*:1
all?:3
allMethodInfo:11
allMethodNames:3
allMethodsThrowRemoteException:2
allocComponents:7
allocate:15
allocateDirect:1
allocateServerRequestId:4
allocateSlotId:2
allocate_slot_id:4
allocated:13
allocating:3
allocation:3
allocations:2
allocator:1
allow:32
allowIndirection:2
allowLocalOptimization:3
allowed:23
allowing:8
allows:29
almost:3</span>

总结：查阅最新的JDK文档新特性是必要的，不然每次都自己写一个递归去访问目录，那肯定会带来不必要的麻烦。并且使用JDK的这个访问者模式去访问目录，非常简单...

使用jdk 1.7的新api来统计jdk中英语单词的分布情况

原文：http://blog.csdn.net/u010469003/article/details/44625489

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年09月23日 (328)
2021年09月24日 (313)
2021年09月17日 (191)
2021年09月15日 (369)
2021年09月16日 (411)
2021年09月13日 (439)
2021年09月11日 (398)
2021年09月12日 (393)
2021年09月10日 (160)
2021年09月08日 (222)