This is the first version of my crawler. It is quite limited and the algorithm behind it is simple, but building Crawler No.1 taught me a lot about how crawlers and search engines work, so it was well worth it ^_^. It is a rough sketch: I don't know how other people implement theirs, so I just wrote it the way I imagined it, in Java. Take a look if you are interested.
Project layout (the screenshot from the original post is not reproduced): the code lives in three packages, com.rgy.reptile (Entry), com.rgy.entity (PageInfo), and com.rgy.utils (PageUtils).
Here is the code:
Entry.java:

package com.rgy.reptile;

import com.rgy.utils.PageUtils;

public class Entry {
    public static void main(String[] args){
        //seed URL: prime the visited list and the parent stack
        //with it before the crawl starts
        String url = "http://www.youku.com";
        PageUtils.history_list.add(url);
        PageUtils.parent_stack.push(url);
        PageUtils.searchUrl(url);
        //PageUtils.hrefShow(url);
    }
}

PageInfo.java:

package com.rgy.entity;
import java.util.ArrayList;

//plain data holder for one crawled page
public class PageInfo {
    private String url;
    private String title;
    private String keywords;
    //stays null until at least one usable link is found
    private ArrayList<String> href_list;

    public PageInfo(){
        this.url="";
        this.title="";
        this.keywords="";
        this.href_list=null;
    }
    public void setUrl(String url){
        this.url = url;
    }
    public void setTitle(String title){
        this.title = title;
    }
    public void setKeywords(String keywords){
        this.keywords = keywords;
    }
    public void setHref_list(ArrayList<String> href_list){
        this.href_list = href_list;
    }
    public String getUrl(){
        return url;
    }
    public String getTitle(){
        return title;
    }
    public String getKeywords(){
        return keywords;
    }
    public ArrayList<String> getHref_list(){
        return href_list;
    }
}
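Before moving on to PageUtils, a quick sketch of how a PageInfo gets consumed; the demo class below is mine, not part of the original project, and the URL is just an example. The one subtle point is that href_list stays null when no usable link was found, and the crawler later relies on that null as its dead-end signal.

package com.rgy.demo; //hypothetical demo package

import java.util.ArrayList;

import com.rgy.entity.PageInfo;
import com.rgy.utils.PageUtils;

public class PageInfoDemo {
    public static void main(String[] args){
        //fetch and parse a single page
        PageInfo info = PageUtils.getPageInfo("http://www.youku.com");
        System.out.println("title   : " + info.getTitle());
        System.out.println("keywords: " + info.getKeywords());
        ArrayList<String> links = info.getHref_list();
        //null means the page produced no crawlable links
        System.out.println("links   : " + (links == null ? 0 : links.size()));
    }
}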
PageUtils.java:

package com.rgy.utils;

import java.util.ArrayList;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.rgy.entity.PageInfo;

public class PageUtils {
    //fetch a page with jsoup and collect its title, meta keywords and links
    public static PageInfo getPageInfo(String url){
        PageInfo info = new PageInfo();
        //normalize: drop a trailing slash so duplicates compare equal
        if(url.endsWith("/")){
            url = url.substring(0, url.length()-1);
        }
        info.setUrl(url);
        try{
            Document doc = Jsoup.connect(url).timeout(30000).get();
            info.setTitle(doc.title());
            String keywords = doc.getElementsByTag("meta").select("[name=keywords]").attr("content");
            info.setKeywords(keywords);
            Elements links = doc.getElementsByTag("a");
            ArrayList<String> href_list = new ArrayList<String>();
            for (Element link : links) {
                String linkHref = link.attr("href");
                if(linkHref.endsWith("/")){
                    linkHref = linkHref.substring(0, linkHref.length()-1);
                }
                //only keep crawlable links we have not collected yet
                if(linkIsAvailable(linkHref)&&!href_list.contains(linkHref)){
                    href_list.add(linkHref);
                }
            }
            //leave href_list null when nothing was collected; searchUrl
            //uses that null as its dead-end signal
            if(!href_list.isEmpty()){
                info.setHref_list(href_list);
            }
        }catch(Exception ex){
            ex.printStackTrace();
        }
        return info;
    }
    //follow only plain http:// links and skip direct file downloads
    public static boolean linkIsAvailable(String url){
        if(url.startsWith("http://")){
            //the dot before each extension must be escaped, otherwise
            //".*.exe" also matches any URL that merely ends in "exe"
            String regex = ".*\\.(exe|apk|zip|rar|pdf|doc)";
            Pattern pattern = Pattern.compile(regex);
            Matcher matcher = pattern.matcher(url);
            return !matcher.matches();
        }
        return false;
    }
    //does the page's keyword list mention any target topic
    //(youth, comedy, micro-film, short film, mini-series, ...)?
    public static boolean keywordsIsAvailable(String keywords){
        String regex = ".*青春.*|.*搞笑.*|.*微电影.*|.*短片.*|.*迷你剧.*|.*喜剧.*";
        Pattern pattern = Pattern.compile(regex);
        Matcher matcher = pattern.matcher(keywords);
        return matcher.matches();
    }
    //URLs that have already been visited
    public static ArrayList<String> history_list = new ArrayList<String>();
    //the chain of parent nodes along the current path
    public static Stack<String> parent_stack = new Stack<String>();
    //depth-first crawl: visit url, then recurse into each new link;
    //when a page yields no links, backtrack via parent_stack
    public static void searchUrl(String url){
        PageInfo info = getPageInfo(url);
        String keywords = info.getKeywords();
        int hlist_size = history_list.size();
        System.out.println(hlist_size+"-->"+history_list.get(hlist_size-1));
        // if(keywordsIsAvailable(keywords)){//if the keywords match,
        //     System.out.println(url+"===>"+keywords);
        // }
        ArrayList<String> href_list = info.getHref_list();
        if(href_list==null){//dead end: back up to the parent and go on
            parent_stack.pop();
            if(!parent_stack.empty()){//stack not yet empty
                searchUrl(parent_stack.peek());
            }else{//empty stack: the whole tree has been walked
                System.out.println("Yay, Crawler No.1 has finished its task!!!");
            }
        }else{//the node has links to follow
            int size = href_list.size();
            for(int i=0;i<size;i++){
                String strUrl = href_list.get(i);
                if(history_list.contains(strUrl)){//skip already-visited links
                    continue;
                }
                history_list.add(strUrl);
                parent_stack.push(strUrl);
                searchUrl(strUrl);
            }
        }
    }
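    //(Added sketch, not from the original post.) The recursive searchUrl
    //above keeps one stack frame per page, so a big site can end in a
    //StackOverflowError. The same traversal can be driven by an explicit
    //queue instead of the call stack; this reuses the helpers above.
    public static void searchUrlIterative(String startUrl){
        java.util.ArrayDeque<String> queue = new java.util.ArrayDeque<String>();
        queue.add(startUrl);
        history_list.add(startUrl);
        while(!queue.isEmpty()){
            String url = queue.poll();
            ArrayList<String> href_list = getPageInfo(url).getHref_list();
            if(href_list==null){//dead end: nothing to enqueue
                continue;
            }
            for(String strUrl : href_list){
                if(!history_list.contains(strUrl)){//skip visited links
                    history_list.add(strUrl);
                    queue.add(strUrl);
                }
            }
        }
        System.out.println("Yay, Crawler No.1 has finished its task!!!");
    }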
    //print every link found on one page
    public static void hrefShow(String url){
        PageInfo info = getPageInfo(url);
        ArrayList<String> href_list = info.getHref_list();
        if(href_list==null){//guard: the page may have produced no links
            return;
        }
        int size = href_list.size();
        for(int i=0;i<size;i++){
            System.out.println(href_list.get(i));
        }
    }
}
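Two closing notes. The code above assumes jsoup is on the classpath (the Maven coordinates are org.jsoup:jsoup). And here is a small sketch exercising the two filter predicates; the demo class, sample URLs and keyword strings are made up for illustration:

package com.rgy.demo; //hypothetical demo package

import com.rgy.utils.PageUtils;

public class FilterDemo {
    public static void main(String[] args){
        //made-up URLs for linkIsAvailable
        System.out.println(PageUtils.linkIsAvailable("http://example.com/page"));     //true
        System.out.println(PageUtils.linkIsAvailable("http://example.com/setup.exe"));//false: file download
        System.out.println(PageUtils.linkIsAvailable("https://example.com/page"));    //false: only http:// passes
        //made-up keyword strings for keywordsIsAvailable
        System.out.println(PageUtils.keywordsIsAvailable("优酷,微电影,短片"));//true: mentions 微电影
        System.out.println(PageUtils.keywordsIsAvailable("news,finance"));//false
    }
}

Note that only plain http:// links pass linkIsAvailable, so https pages are skipped entirely; that is one more limitation of this first version.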
Source download: http://download.csdn.net/detail/u011700203/8410597
Original post: http://blog.csdn.net/u011700203/article/details/43308269