哈哈,你想知道jdk源码中的英语单词分布情况么?下面我们介绍一种jdk1.7的新api来递归遍历文件夹。这个新的api位于java.nio.file.*下面,里面添加了更加符合语义的Path,也就是路径这个对象。然后为了更好地遍历目录,提供了一种基于访问者设计模式的方式来递归访问目录。主要的框架代码如下:
<span style="font-size:18px;">public static class PrintFiles extends SimpleFileVisitor<Path> { // Print information about // each type of file. @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attr) { if (attr.isSymbolicLink()) { System.out.format("Symbolic link: %s ", file); } else if (attr.isRegularFile()) { System.out.format("Regular file: %s ", file); } else { System.out.format("Other: %s ", file); } System.out.println("(" + attr.size() + "bytes)"); return CONTINUE; } // Print each directory visited. @Override public FileVisitResult postVisitDirectory(Path dir, IOException exc) { System.out.format("Directory: %s%n", dir); return CONTINUE; } // If there is some error accessing // the file, let the user know. // If you don't override this method // and an error occurs, an IOException // is thrown. @Override public FileVisitResult visitFileFailed(Path file, IOException exc) { System.err.println(exc); return CONTINUE; } }</span>这个类主要有三个方法:访问文件、访问目录、访问失败。很明显,我们只需要继承该类,然后实现相关业务逻辑即可。当需要遍历某个目录时,直接调用Files.walkFileTree(startingDir, visitor)方法即可,其中visitor是SimpleFileVisitor子类的实例。上面的例子没有做任何实际处理,只是简单地打印出一些相关信息。
进入正题,怎么统计JDK源码中的英语单词分布呢?下面先分解下我们的任务。在这里面,需要一个类用于解析一个.java文件,并且将里面的单词都抽取出来,好吧,该类不难,无非就是读取类文件内容,然后使用相关的方式,将这些杂乱的内容变成单词即可。为了更加容易管理,这里将单词也抽象为一个类,具体代码如下:
package net.itaem.luohong.honghong; public class Word implements Comparable<Word> { private String word; public Word(String word){ this.word = word; } @Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result + ((word == null) ? 0 : word.hashCode()); return result; } @Override public boolean equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (getClass() != obj.getClass()) return false; Word other = (Word) obj; if (word == null) { if (other.word != null) return false; } else if (!word.equals(other.word)) return false; return true; } @Override public String toString() { return word; } public String getWord(){ return word; } public int compareTo(Word o) { if(this.getWord() == null){ return -1; } if(o.getWord() == null){ return 1; } return this.getWord().compareTo(o.getWord()); } }
package net.itaem.luohong.honghong; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.List; import java.util.StringTokenizer; /** * 读取一个文件的内容 * */ public class WordReader { public static List<Word> wordInFile(String path){ return stringToWords(read(path)); } /** * 读取一个文件的内容,然后将内容返回 * * */ private static String read(String path){ StringBuilder sb = new StringBuilder(); try { BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(new File(path)))); String line = null; while((line = reader.readLine()) != null){ sb.append(line); } reader.close(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } return sb.toString(); } /** * 将字符串变成单词数组 * */ private static List<Word> stringToWords(String source){ StringTokenizer strToken = new StringTokenizer(source); List<Word> wordList = new ArrayList<Word>(); while(strToken.hasMoreTokens()){ String wordStr = strToken.nextToken(); if(!isInvalidString(wordStr)){ wordStr = removeInvalidStr(wordStr); //确保去除干净 wordStr = removeInvalidStr(wordStr); if(wordStr != null && !"".equals(wordStr)){ if(needSplitToWord(wordStr)){ recurSplitToWord(wordStr, wordList); }else{ //不需要切割 if(wordStr != null && !"".equals(wordStr) && !isInvalidString(wordStr)){ wordList.add(new Word(removeInvalidStr(wordStr))); } } } } } return wordList; } /** * 判断字符串是否需要分割为多个单词 * @param wordStr * @return * */ private static boolean needSplitToWord(String wordStr){ if(wordStr == null || "".equals(wordStr)) return false; if(wordStr.contains(".") || wordStr.contains("/") || wordStr.contains("-") || wordStr.contains("#") || wordStr.contains("]") || wordStr.contains(",") || wordStr.contains("(") || wordStr.contains("[") || wordStr.contains(">") || wordStr.contains("<") || wordStr.contains("=")){ return true; }else{ 
return false; } } /** * 递归切割字符串为单词列表 * 因为一个字符串可能同时包含多个分割单词的字符,所以需要递归切割 * * @param wordStr 要切割的字符串 * @param wordList 将切割的单词放入该list * */ private static void recurSplitToWord(String wordStr, List<Word> wordList){ if(wordStr == null) return; if(needSplitToWord(wordStr)){ String[] words = splitToWords0(wordStr); for(String word: words){ if(needSplitToWord(word)){ recurSplitToWord(word, wordList); }else{ if(word != null && !"".equals(word) && !isInvalidString(word)){ wordList.add(new Word(removeInvalidStr(word))); } } } } } /** * 将一个字符串切割为单词数组 * @param str * @return * */ private static String[] splitToWords0(String wordStr){ String[] words = null; //split word if(wordStr.contains(".")){ words = wordStr.split("\\."); }else if(wordStr.contains("/")){ words = wordStr.split("/"); }else if(wordStr.contains("-")){ words = wordStr.split("-"); }else if(wordStr.contains("#")){ words = wordStr.split("#"); }else if(wordStr.contains("]")){ words = wordStr.split("]"); }else if(wordStr.contains("[")){ words = wordStr.split("\\["); }else if(wordStr.contains(",")){ words = wordStr.split(","); }else if(wordStr.contains("(")){ words = wordStr.split("\\("); }else if(wordStr.contains(">")){ words = wordStr.split(">"); }else if(wordStr.contains("<")){ words = wordStr.split("<"); }else if(wordStr.contains("=")){ words = wordStr.split("="); } return words; } /** * 去掉 + - * / >= <= =!= ||等无效字符 * @param str 字符串 * @return * */ private static boolean isInvalidString(String str){ if(str == null || str.equals("") || str.equals("*") || str.equals("{") ||str.equals("}") || str.equals("+") ||str.equals("-") || str.equals("/") || str.equals("=") || str.equals("!=") || str.equals(">") || str.equals("<") || str.equals(">=") || str.equals("<=") || str.equals("||") || str.equals("}}") || str.equals("/*") || str.equals("};") || str.equals("+=") || str.matches("\\d+") || str.equals(".") || str.equals(":") || str.equals("/**") || str.equals("//") || str.endsWith("==") || str.equals("?") || (str.contains("==") 
&& str.contains("&") && str.contains(";")) //去掉o==null ? get这类字符 ){ return true; }else{ return false; } } /** * 判断一个字符串是否是数字 * */ private static boolean isNumber(String str){ return str.matches("^[-+]?(([0-9]+)([.]([0-9]+))?|([.]([0-9]+))?)$"); } /** * 去掉一个字符串中的无效字符 * @param * @return * */ private static String removeInvalidStr(String wordStr){ if(isInvalidString(wordStr)){ return null; } //去掉结尾; if(wordStr.endsWith(";")){ wordStr = wordStr.substring(0, wordStr.length() - 1); } //去掉尾部, if(wordStr.endsWith(",")){ wordStr = wordStr.substring(0, wordStr.length() - 1); } //去掉结尾() if(wordStr.endsWith("()")){ wordStr = wordStr.substring(0, wordStr.length() - 2); } //去掉开头( if(wordStr.startsWith("(")){ wordStr = wordStr.substring(1); } //去掉结尾) if(wordStr.endsWith(")")){ wordStr = wordStr.substring(0, wordStr.length() - 1); } //去掉结尾: if(wordStr.endsWith(":")){ wordStr = wordStr.substring(0, wordStr.length() - 1); } //去掉开头" if(wordStr.startsWith("\"")){ wordStr = wordStr.substring(1); } //去掉结尾" if(wordStr.endsWith("\"")){ wordStr = wordStr.substring(0, wordStr.length() - 1); } //去掉结尾处. 
if(wordStr.endsWith(".")){ wordStr = wordStr.substring(0, wordStr.length() - 1); } //去掉开头*/ if(wordStr.startsWith("*/")){ wordStr = wordStr.substring(2); } //去掉java.util;/**结尾处的四个字符 if(wordStr.endsWith(";/**")){ wordStr = wordStr.substring(0, wordStr.length() - 4); } //去掉开头的{@ if(wordStr.startsWith("{@")){ wordStr = wordStr.substring(2); } //去掉开头的@ if(wordStr.startsWith("@")){ wordStr = wordStr.substring(1); } //取出下面该格式的单词,比如:<tt>hello</tt>取出hello,<li>world</li>取出为world,<T>取出为T,</pre>取出为pre,<pre>取出为pre if(wordStr.startsWith("<") && wordStr.endsWith(">")){ if(wordStr.startsWith("<") && !wordStr.contains("</")){ //格式为<T> wordStr = wordStr.substring(wordStr.indexOf("<") + 1, wordStr.lastIndexOf(">")); }else if(wordStr.contains("</") && !wordStr.startsWith("</")){ //格式为:<tt>hello</tt> wordStr = wordStr.substring(wordStr.indexOf(">") + 1, wordStr.lastIndexOf("</")); }else if(wordStr.startsWith("</")){ //格式为</pre> wordStr = wordStr.substring(2, wordStr.lastIndexOf(">")); } } //去掉<li>time中的<li> if(wordStr.contains("<") && wordStr.contains(">") && !wordStr.contains("/") && wordStr.startsWith("<")){ wordStr = wordStr.substring(wordStr.lastIndexOf(">") + 1); } //去掉time<li>中的<li> if(wordStr.contains("<") && wordStr.contains(">") && !wordStr.contains("/") && wordStr.endsWith(">")){ wordStr = wordStr.substring(0, wordStr.lastIndexOf("<")); } //去掉time</li> if(wordStr.contains("</") && wordStr.contains(">")){ wordStr = wordStr.substring(0, wordStr.lastIndexOf("</")); } //去掉开头的< if(wordStr.startsWith("<")){ wordStr = wordStr.substring(1); } //去掉结尾的> if(wordStr.endsWith(">")){ wordStr = wordStr.substring(0, wordStr.length() -1); } //去掉结尾的[ if(wordStr.startsWith("{")){ wordStr = wordStr.substring(1); } //去掉开头的[ if(wordStr.endsWith("}")){ wordStr = wordStr.substring(0, wordStr.length() - 1); } //去掉开头的== if(wordStr.startsWith("==")){ wordStr = wordStr.substring(2); } //去掉结尾的= if(wordStr.endsWith("=")){ wordStr = wordStr.substring(0, wordStr.length() - 1); } //去掉结尾的{ //去掉开头的[ 
if(wordStr.endsWith("{")){ wordStr = wordStr.substring(0, wordStr.length() - 1); } //去掉结尾的[ if(wordStr.endsWith("[]")){ wordStr = wordStr.substring(0, wordStr.length() - 2); } //去掉结尾的[ if(wordStr.endsWith("]")){ wordStr = wordStr.substring(0, wordStr.length() - 1); } //去掉结尾的[ if(wordStr.endsWith("[")){ wordStr = wordStr.substring(0, wordStr.length() - 1); } //去掉开头的[ if(wordStr.startsWith("]")){ wordStr = wordStr.substring(1); } //去掉开头的+ if(wordStr.startsWith("+")){ wordStr = wordStr.substring(1); } //去掉<? if(wordStr.endsWith("+")){ wordStr = wordStr.substring(0, wordStr.length() - 1); } //去掉<? if(wordStr.endsWith("<?")){ wordStr = wordStr.substring(0, wordStr.length() - 2); } //去掉" if(wordStr.contains("\"")){ wordStr = wordStr.replace("\"", ""); } //去掉开头的[ //去掉数字 if(isNumber(wordStr)){ return null; } return wordStr; } }当然,已经可以将每个文件的单词抽取出来了,下面就使用一个类汇总单词信息即可。这里面为了最后结果可以从a-z按照字典排序,使用了TreeMap来统计结果,不多说,见代码:
package net.itaem.luohong.honghong;

import java.util.List;
import java.util.Map;
import java.util.TreeMap;

/**
 * Tallies how many times each word has been seen.
 *
 * <p>The tallies are kept in a {@link TreeMap}, so {@link #printResult()}
 * lists the words in their natural ({@code compareTo}) order.
 *
 * @author luohong 846705189@qq.com
 */
public class WordCount {

    /** Occurrences seen so far, keyed by word. */
    private Map<Word, Integer> wordCount;

    public WordCount() {
        wordCount = new TreeMap<Word, Integer>();
    }

    /** @return the number of distinct words counted so far */
    public int size() {
        return wordCount.size();
    }

    /**
     * Records one more occurrence of a word.
     *
     * @param word the word to count
     * @return the number of times the word has been counted, including this
     *         call (so {@code 1} the first time a word is seen)
     */
    public Integer count(Word word) {
        Integer previous = wordCount.get(word);
        Integer updated = (previous == null) ? 1 : previous + 1;
        // Map.put answers the value it replaced, not the one just stored, so
        // the new tally is returned explicitly.
        wordCount.put(word, updated);
        return updated;
    }

    /**
     * Records one occurrence for every word in the list.
     *
     * @param wordList the words to count
     */
    public void count(List<Word> wordList) {
        for (Word word : wordList) {
            count(word);
        }
    }

    /**
     * Prints every word and its tally to standard output, one
     * {@code word:count} pair per line.
     */
    public void printResult() {
        for (Map.Entry<Word, Integer> entry : wordCount.entrySet()) {
            System.out.println(entry.getKey() + ":" + entry.getValue());
        }
    }
}
package net.itaem.luohong.honghong;

import static java.nio.file.FileVisitResult.CONTINUE;

import java.io.File;
import java.io.IOException;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.List;

/**
 * File visitor that walks a directory tree recursively and counts the words
 * of every regular file it meets.
 */
public class PrintFiles extends SimpleFileVisitor<Path> {

    /** Tallies accumulated by this visitor across all visited files. */
    WordCount wordCount = new WordCount();

    /**
     * Counts the words of a regular file and prints its path; symbolic links
     * and other kinds of entries are skipped.
     */
    @Override
    public FileVisitResult visitFile(Path file, BasicFileAttributes attr) {
        if (!attr.isSymbolicLink() && attr.isRegularFile()) {
            List<Word> words = WordReader.wordInFile(file.toString());
            System.out.println(file);
            for (Word word : words) {
                wordCount.count(word);
            }
        }
        return CONTINUE;
    }

    /**
     * Keeps walking after a directory has been processed. An error raised
     * while listing the directory is reported on standard error instead of
     * aborting the walk.
     */
    @Override
    public FileVisitResult postVisitDirectory(Path dir, IOException exc) {
        if (exc != null) {
            System.err.println(exc);
        }
        return CONTINUE;
    }

    /**
     * Reports a file that could not be accessed and keeps walking. Without
     * this override the exception would be rethrown and end the walk.
     */
    @Override
    public FileVisitResult visitFileFailed(Path file, IOException exc) {
        System.err.println(exc);
        return CONTINUE;
    }

    /**
     * Walks the tree rooted at {@code path} and prints the word tallies.
     * Tallies accumulate on this instance, so calling this method again adds
     * to the earlier results.
     *
     * @param path the directory (or file) to start from
     */
    public void count(String path) {
        Path startingDir = new File(path).toPath();
        try {
            // Walk with this instance: the result printed below is read from
            // this.wordCount, so a separate visitor would leave it empty.
            Files.walkFileTree(startingDir, this);
            wordCount.printResult();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Asks for a directory on standard input and prints its word tallies.
     */
    public static void main(String[] args) {
        System.out.println("enter the directory you want to count");
        String path = new CommandReader().readCommand();
        // readCommand() answers null on end of input or a read failure.
        if (path == null || path.trim().isEmpty()) {
            System.err.println("no directory given");
            return;
        }
        new PrintFiles().count(path.trim());
    }
}
package net.itaem.luohong.honghong;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

/**
 * Reads lines from standard input.
 *
 * <p>The instance wraps whatever {@code System.in} is at construction time
 * and keeps that reader for its whole life, so {@link #readCommand()} can be
 * called repeatedly. Standard input is deliberately never closed: closing it
 * would prevent any later read, and a fresh buffered reader per call would
 * lose input already buffered by the previous one.
 */
public class CommandReader {

    /** Buffered view of standard input shared by every call on this instance. */
    private final BufferedReader reader;

    public CommandReader() {
        reader = new BufferedReader(new InputStreamReader(System.in));
    }

    /**
     * Reads the next line from standard input.
     *
     * @return the line without its terminator, or {@code null} at end of
     *         input or if reading fails (the failure is reported on standard
     *         error)
     */
    public String readCommand() {
        try {
            return reader.readLine();
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }
}
输出部分结果:
<span style="font-size:18px;">addExceptionDetailMessage:2 addFirst:2 addGuardedAction:2 addIDAtPosition:3 addIORComponentToProfileInternal:3 addInvocationHandler:3 addLast:14 addNodes:3 addObject:7 addPOA:2 addPoaInactive:9 addPoaManager:2 addPrefix:2 addRemoteContactInfos:2 addReply:12 addReplyServiceContextQueue:11 addServiceContext:3 addServiceContexts:2 addTaggedProfiles:3 addToErrorSet:7 addToIORTemplate:3 addToParser:2 addToProperties:2 addTypeCodeAtPosition:4 addWork:7 addXXX:1 add_client_request_interceptor:2 add_client_request_interceptor_with_policy:1 add_component:1 add_component_to_profile:1 add_exception:1 add_in_arg:1 add_inout_arg:1 add_ior_component:1 add_ior_component_to_profile:1 add_ior_interceptor:2 add_ior_interceptor_with_policy:1 add_item:4 add_named_in_arg:1 add_named_inout_arg:1 add_named_out_arg:1 add_out_arg:1 add_reply_service_context:2 add_request_service_context:1 add_server_request_interceptor:2 add_server_request_interceptor_with_policy:1 add_value:1 added:31 addedSerialOrExtern:2 adding:15 addition:5 additional:10 addr:29 addrDisp:12 addrDisposition:12 address:30 addresses:2 addressing:17 addressingDisposition:14 adds:9 adheres:1 adjust:3 admap:2 admin:1 advance:2 advancement:1 advancing:1 advertise:1 advertised:1 ae:10 af:1 affect:3 affected:1 affecting:2 affiliates:438 after:125 afterwards:8 again:23 against:10 againt:1 aggregate:1 agree:7 agreed:2 ahead:2 ai:1 aids:2 aka:1 alert:2 algorithm:11 alias:8 aliased:4 aliases:5 aliasing:2 align:50 alignAndCheck:15 alignAndReserve:16 alignIncr:2 alignOnBoundary:12 alignResult:2 aligned:17 alignment:43 alignmentForEncoding:2 alive:3 all:295 all*:1 all?:3 allMethodInfo:11 allMethodNames:3 allMethodsThrowRemoteException:2 allocComponents:7 allocate:15 allocateDirect:1 allocateServerRequestId:4 allocateSlotId:2 allocate_slot_id:4 allocated:13 allocating:3 allocation:3 allocations:2 allocator:1 allow:32 allowIndirection:2 allowLocalOptimization:3 allowed:23 allowing:8 allows:29 almost:3</span>
使用jdk 1.7的新api来统计jdk中英语单词的分布情况
原文:http://blog.csdn.net/u010469003/article/details/44625489