首页 > 编程语言 > 详细

Java 爬虫案例

时间:2020-12-25 14:45:57      阅读:23      评论:0      收藏:0      [点我收藏+]
package com.zjazn;

import com.sun.org.apache.bcel.internal.generic.RETURN;
import com.sun.xml.internal.ws.api.server.InstanceResolver;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import sun.net.www.http.HttpClient;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;

/**
 * Crawler example: fetches the learning-path listing page, extracts every course's
 * name, course count and cover-image URL, and saves each cover image to a local directory.
 */
public class Data {

    /** Page whose course cards are scraped. */
    private static final String PATHS_URL = "https://www.lanqiao.cn/paths/";

    /** Local directory that receives the downloaded cover images. */
    private static final String IMAGE_DIR = "E://myimg";

    /** Marker in a card's last label; the text before it is parsed as the course count. */
    private static final String COUNT_MARKER = "门";

    /**
     * Entry point: scrapes the listing page, downloads the cover images and prints the
     * collected {@link MyData} records.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String html = getData();
        if (html == null) {
            // Jsoup.parse rejects null input, so stop here instead of crashing.
            System.err.println("No HTML received from " + PATHS_URL);
            return;
        }

        Document document = Jsoup.parse(html);
        List<MyData> myData = new ArrayList<>();
        for (Element course : document.select(".learn-path-container>div")) {
            Elements labels = course.select("a>div");
            if (labels.isEmpty()) {
                // first()/last() would return null for a container without labels.
                continue;
            }
            String courseName = labels.first().text();
            String courseNum = labels.last().text();

            int markerIndex = courseNum.indexOf(COUNT_MARKER);
            if (markerIndex < 0) {
                continue;
            }
            int num;
            try {
                num = Integer.parseInt(courseNum.substring(0, markerIndex).trim());
            } catch (NumberFormatException e) {
                // One malformed card must not abort the whole crawl.
                System.err.println("Skipping course with unparsable count: " + courseNum);
                continue;
            }

            String imgPath = course.select("a>img").attr("src");
            MyData item = new MyData();
            item.setName(courseName);
            item.setImgPath(imgPath);
            item.setNum(num);
            myData.add(item);

            if (!imgPath.isEmpty()) {
                downloadFile(imgPath, IMAGE_DIR, sanitizeFileName(courseName) + extensionOf(imgPath));
            }
        }
        System.out.println(myData.toString());
    }

    /**
     * Downloads the listing page as text.
     *
     * @return the response body decoded as UTF-8, or {@code null} when the request fails,
     *         the status code is not 200, or the response carries no entity
     */
    public static String getData() {
        HttpGet httpGet = new HttpGet(PATHS_URL);
        // try-with-resources releases both the client and the response on every path.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() != 200) {
                return null;
            }
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                return null;
            }
            return EntityUtils.toString(entity, "UTF-8");
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Downloads the resource at {@code urlStr} into {@code directory/fileName}, creating the
     * directory (including missing parents) when necessary. I/O failures are reported on
     * standard error and otherwise ignored, so one failed download does not stop the caller.
     *
     * @param urlStr    URL of the resource to download
     * @param directory target directory
     * @param fileName  name of the file to create inside {@code directory}
     */
    public static void downloadFile(String urlStr, String directory, String fileName) {
        File dir = new File(directory);
        if (!dir.isDirectory() && !dir.mkdirs()) {
            System.err.println("Cannot create directory: " + directory);
            return;
        }
        try {
            URLConnection urlConnection = new URL(urlStr).openConnection();
            // File(parent, child) joins with the platform separator instead of a fixed "\\".
            try (InputStream in = urlConnection.getInputStream();
                 FileOutputStream out = new FileOutputStream(new File(dir, fileName))) {
                byte[] buf = new byte[8192];
                int len;
                while ((len = in.read(buf)) != -1) {
                    out.write(buf, 0, len);
                }
            }
        } catch (IOException e) {
            // MalformedURLException is an IOException, so it is reported here as well.
            e.printStackTrace();
        }
    }

    /**
     * Returns the file extension (including the leading dot) of the last path segment of
     * {@code url}, ignoring any query string or fragment, or an empty string when the last
     * segment has no extension.
     */
    private static String extensionOf(String url) {
        int end = url.length();
        int query = url.indexOf('?');
        if (query >= 0) {
            end = query;
        }
        int fragment = url.indexOf('#');
        if (fragment >= 0 && fragment < end) {
            end = fragment;
        }
        String path = url.substring(0, end);
        int slash = path.lastIndexOf('/');
        int dot = path.lastIndexOf('.');
        // A dot before the last slash belongs to the host or a directory, not the file name.
        return dot > slash ? path.substring(dot) : "";
    }

    /** Replaces characters that are not allowed in file names on common file systems. */
    private static String sanitizeFileName(String name) {
        return name.replaceAll("[\\\\/:*?\"<>|]", "_");
    }

}

 

package com.zjazn;

import lombok.Data;

/**
 * One scraped course entry collected by the crawler.
 *
 * <p>Accessors, {@code equals}/{@code hashCode} and {@code toString} are generated by
 * Lombok's {@code @Data}; the crawler relies on the generated setters and {@code toString}.
 */
@Data
public class MyData {
    /** Course name, taken from the first label of the course card. */
    private String name;
    /** Value of the cover image's {@code src} attribute as found in the page. */
    private String imgPath;
    /** Number of courses, parsed from the label text preceding the "门" marker. */
    private Integer num;

}

 

Java 爬虫案例

原文:https://www.cnblogs.com/zjazn/p/14188395.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!