php爬虫 phpspider

时间：2016-12-09 18:29:57 阅读：195 评论：0 收藏：0 [点我收藏+]

<?php
/**
 * Created by PhpStorm.
 * User: brady
 * Date: 2016/12/9
 * Time: 17:32
 */
ini_set("memory_limit", "1024M");
require dirname(__FILE__).‘/../core/init.php‘;

$url = "http://www.epooll.com/archives/806/";
$html = requests::get($url);
// 抽取文章标题
$selector = "//*[@id=\"content\"]/div[1]/div[1]/h1/a";

$title = selector::select($html, $selector);
// 检查是否抽取到标题
// 抽取文章作者
$selector = "//*[@id=\"content\"]/div[1]/div[1]/h6/span[1]";
$author = selector::select($html, $selector);
// 检查是否抽取到作者
// 去掉 作者：
$author = str_replace("作者：", "", $author);
//发布时间
$selector = "//*[@id=\"content\"]/div[1]/div[1]/h6/span[2]";
$time = selector::select($html, $selector);
$time = str_replace("发布时间：",‘‘, $time);
$time  = date("Y-m-d H:i:s",strtotime($time));
// 抽取文章内容
$selector = "//*[@id=\"content\"]/div[1]/div[2]";
$content = selector::select($html, $selector);
// 检查是否抽取到内容
$data = array(
    ‘article_title‘ => $title,
    ‘article_author‘ => $author,
    ‘article_content‘ => $content,
);
// 查看数据是否正常
$res = db::insert("content", $data);
var_dump($res);

php爬虫 phpspider

原文：http://www.cnblogs.com/php-linux/p/6150558.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年09月23日 (328)
2021年09月24日 (313)
2021年09月17日 (191)
2021年09月15日 (369)
2021年09月16日 (411)
2021年09月13日 (439)
2021年09月11日 (398)
2021年09月12日 (393)
2021年09月10日 (160)
2021年09月08日 (222)