一、
- // Open the page via the connection manager returned by Page.getConnectionManager()
- // and tell the parser to decode it as GBK.
- Parser parser = new Parser(Page.getConnectionManager()
-         .openConnection("http://www.verycd.com/topics/2760827/"));
- parser.setEncoding("GBK");
-
- // Match only "input" tags that also carry class="forminput".
- NodeFilter isInputTag = new TagNameFilter("input");
- NodeFilter hasFormInputClass = new HasAttributeFilter("class", "forminput");
- NodeList matches = parser.extractAllNodesThatMatch(new AndFilter(isInputTag, hasFormInputClass));
-
- // Dump the HTML of every matching node to standard output.
- for (NodeIterator it = matches.elements(); it.hasMoreNodes();) {
-     System.out.println(it.nextNode().toHtml());
- }
- }
二、
- public static void extracLinks(String url) {
- try {
- Parser parser = new Parser(url);
- parser.setEncoding("gb2312");
- NodeFilter frameFilter = new NodeFilter() {
- public boolean accept(Node node) {
- if (node.getText().startsWith("frame src=")) {
- return true;
- } else {
- return false;
- }
- }
- };
- OrFilte rorFilter = new OrFilter(new NodeClassFilter(LinkTag.class), new
- NodeClassFilter(ImageTag.class));
- OrFilter linkFilter = new OrFilter(orFilter, frameFilter);
-
- NodeList list = parser.extractAllNodesThatMatch(linkFilter);
- for (int i = 0; i < list.size(); i++) {
- Node tag = list.elementAt(i);
- if (tag instanceof LinkTag)
- {
- LinkTag link = (LinkTag) tag;
- String linkUrl = link.getLink();
- String text = link.getLinkText();
- System.out.println(linkUrl + "**********" + text);
- }
- else if (tag instanceof ImageTag)
- {
- ImageTag image = (ImageTag) list.elementAt(i);
- System.out.print(image.getImageURL() + "********");
- System.out.println(image.getText());
- }
- else
- {
- String frame = tag.getText();
- int start = frame.indexOf("src=");
- frame = frame.substring(start);
- int end = frame.indexOf(" ");
- if (end == -1)
- end = frame.indexOf(">");
- frame = frame.substring(5, end - 1);
- System.out.println(frame);
- }
- }
- } catch (ParserException e) {
- e.printStackTrace();
- }
- }
三、
- /**
-  * Prints, for a fixed VeryCD listing page, the {@code href} of every {@code a} tag
-  * accepted by an "a" tag-name filter combined with a has-parent "h3" filter,
-  * each prefixed with {@code http://www.verycd.com}.
-  *
-  * @throws ParserException propagated from the parser calls
-  */
- public void test2() throws ParserException {
-     Parser parser = new Parser(Page.getConnectionManager()
-             .openConnection("http://www.verycd.com/sto/datum/computer/page1"));
-
-     NodeFilter anchorTag = new TagNameFilter("a");
-     NodeFilter parentIsH3 = new HasParentFilter(new TagNameFilter("h3"));
-     NodeList nodes = parser.parse(new AndFilter(anchorTag, parentIsH3));
-
-     for (NodeIterator it = nodes.elements(); it.hasMoreNodes();) {
-         Node node = it.nextNode();
-         if (!(node instanceof LinkTag)) {
-             continue;
-         }
-         String href = ((LinkTag) node).getAttribute("href");
-         System.out.println("http://www.verycd.com" + href);
-     }
- }
四、
- // Print the HTML of the parent of every node accepted by a StringFilter for "减价幅度".
- ConnectionManager manager = Page.getConnectionManager();
- Parser parser = new Parser(manager.openConnection("http://huodong.sodao.com/39/info#"));
-
- NodeFilter filter = new StringFilter("减价幅度");
-
- NodeList nodes = parser.parse(filter);
- NodeIterator it = nodes.elements();
- while (it.hasMoreNodes()) {
-     Node node = it.nextNode();
-     Node parent = node.getParent();
-     // A matched node may have no parent; print the node itself then instead of
-     // failing with a NullPointerException.
-     System.out.println(parent != null ? parent.toHtml() : node.toHtml());
- }
五、
六、
- /**
-  * Prints {@code linkText:link} for every link tag found, searching recursively,
-  * inside the nodes whose {@code id} attribute is {@code word_more_con}.
-  *
-  * @param url location of the document handed to the {@code Parser} constructor
-  * @throws ParserException propagated from the parser calls
-  */
- public static void getWords(String url) throws ParserException {
-     Parser parser = new Parser(url);
-
-     // First narrow to the container nodes, then collect the links nested in them.
-     NodeList containers = parser.extractAllNodesThatMatch(
-             new HasAttributeFilter("id", "word_more_con"));
-     NodeList links = containers.extractAllNodesThatMatch(
-             new NodeClassFilter(LinkTag.class), true);
-
-     for (int idx = 0; idx < links.size(); idx++) {
-         LinkTag anchor = (LinkTag) links.elementAt(idx);
-         String label = anchor.getLinkText();
-         String target = anchor.getLink();
-         System.out.println(label + ":" + target);
-     }
- }
七、处理 HTML 里面的 table
- public List getChangyu(String id) throws HttpException, IOException, ParserException
- {
- HttpClient hc=new HttpClient();
- hc.getParams().setContentCharset("gb2312");
- PostMethod pm=new PostMethod("http://61.145.121.47/custSearch.jsp");
- pm.setParameter("bnos","111111111111");
- hc.executeMethod(pm);
- String temp=pm.getResponseBodyAsString(50000);
-
- Parser parser=new Parser(temp);
- NodeFilter filter=new HasAttributeFilter("cellpadding","-1");
- NodeList nodelist=parser.extractAllNodesThatMatch(filter);
-
- if(nodelist.size()>0)
- {
- NodeFilter filter1=new NodeClassFilter(TableTag.class);
- nodelist=nodelist.extractAllNodesThatMatch(filter1,true);
- if(nodelist.size()>0)
- {
- TableTag table=(TableTag) nodelist.elementAt(0);
- TableRow [] rows=table.getRows();
- if(rows.length>1)
- {
- for(int i=2;i<rows.length;i++)
- {
- TableRow row=rows[i];
- TableColumn td=row.getColumns()[0];
- TableColumn td1=row.getColumns()[1];
- System.out.println(td.toPlainTextString()+" "+td1.toPlainTextString());
- }
- }
- }
- }
- else
- {
-
- }
- return null;
- }
八、查询IP
- /**
-  * Queries the ip138 lookup page for {@code ip} and, when at least one node with
-  * {@code class="ul1"} is found, prints the queried address followed by the plain
-  * text of the first such node.
-  *
-  * @param ip address appended verbatim to the lookup URL
-  * @throws ParserException propagated from the parser calls
-  */
- public static void getIp(String ip) throws ParserException {
-     Parser parser = new Parser("http://www.ip138.com/ips.asp?ip=" + ip);
-     parser.setEncoding("gb2312");
-
-     NodeList matches = parser.extractAllNodesThatMatch(new HasAttributeFilter("class", "ul1"));
-     if (matches.size() < 1) {
-         return;
-     }
-     System.out.println("您查询的IP为:" + ip);
-     System.out.println(matches.elementAt(0).toPlainTextString());
- }
- /**
-  * Runs the IP lookup once for a fixed sample address.
-  *
-  * @param args ignored
-  * @throws ParserException propagated from {@code getIp}
-  */
- public static void main(String[] args) throws ParserException {
-     final String sampleIp = "125.33.192.180";
-     getIp(sampleIp);
- }
九、提取 HTML 中 p 标签里面的内容
- /**
-  * Parses {@code content} as HTML and returns the {@code getStringText()} value of
-  * every paragraph tag, in the order the parser reports them.
-  *
-  * <p>A {@link ParserException} is not propagated: its stack trace is printed and
-  * whatever was collected so far is returned.
-  *
-  * @param content HTML source to scan; {@code null} yields an empty list
-  * @return the paragraph texts, never {@code null}
-  */
- public static ArrayList<String> getParagraphList(String content) {
-     ArrayList<String> paraList = new ArrayList<String>();
-     if (content == null) {
-         return paraList;
-     }
-
-     Parser myParser = new Parser();
-     NodeFilter paraFilter = new NodeClassFilter(ParagraphTag.class);
-     try {
-         myParser.setInputHTML(content);
-         NodeList nodeList = myParser.parse(paraFilter);
-         // Strictly below size(): index size() is one past the last element.
-         for (int i = 0; i < nodeList.size(); i++) {
-             ParagraphTag tag = (ParagraphTag) nodeList.elementAt(i);
-             paraList.add(tag.getStringText());
-         }
-     } catch (ParserException e) {
-         e.printStackTrace();
-     }
-     return paraList;
- }
htmlparser 精确提取的一些代码
原文:http://www.cnblogs.com/Alex80/p/4775840.html