<?php
/**
* test new page crawler
*/
// Crawls a single Microsoft Store app page and prints the fields extracted from it.
// Required fields terminate the script when they cannot be found; optional fields
// only print a notice.
//
// Older store URL format, kept for reference:
// http://apps.microsoft.com/windows/en-us/app/fotor/6f797ba2-500d-4dee-9c5a-13c2d818c958

/**
 * Runs preg_match() and returns the match array; when the pattern does not match
 * (or preg_match() reports an error) prints $errorMessage followed by a newline
 * and terminates the script.
 *
 * @param string $pattern      PCRE pattern, including delimiters and modifiers.
 * @param string $subject      Text to search.
 * @param string $errorMessage Message printed before exiting.
 * @return array The match array filled in by preg_match().
 */
function require_match($pattern, $subject, $errorMessage)
{
    if (!preg_match($pattern, $subject, $match)) {
        echo $errorMessage, "\n";
        exit();
    }

    return $match;
}

$url = 'https://www.microsoft.com/en-us/store/apps/adobe-photoshop-express/9wzdncrfj27n';
$url = trim($url);

// get() appends a "[url:<final url>]" marker to the body; the whole string is entity-decoded.
$content = html_entity_decode(get($url), ENT_HTML5, 'UTF-8');

// pfn: value of the data-pfn attribute
$match = require_match('/data-pfn="(.*)">/isU', $content, 'pfn error:' . $url);
$pfn = $match[1];
echo "pfn:" . $pfn . "\n";

// new url: the marker appended by get() (final URL after redirects)
$match = require_match('/\[url:(.*)\]/is', $content, 'get no new url');
$newurl = $match[1];
echo "url:" . $newurl . "\n";

// icon and its background colour
$match = require_match(
    '/class="pull-left ph-logo">.*src="(.*)".*style="background-color:(.*);.*"/isU',
    $content,
    'get no icon'
);
$icon = $match[1];
$backgroundcolor = $match[2];
echo "icon:" . $icon . "\n";
echo "backgroundcolor:" . $backgroundcolor . "\n";

// name
// NOTE(review): no "s"/"U" modifiers here, so the capture is greedy within one line —
// confirm against the live markup that this does not over-capture.
$match = require_match('/id="page-title".*itemprop="name">(.*)<\//', $content, 'get no name');
$name = $match[1];
echo "name:" . $name . "\n";

// alias: path segment following "apps/" in the final URL
$match = require_match('/apps\/(.*)\//isU', $newurl, 'get no alias');
$alias = $match[1];
echo "alias:" . $alias . "\n";

// rating (greedy single-line pattern, see the note on "name")
$match = require_match(
    '/class="srv_ratingsScore win-rating-average">(.*)<\//',
    $content,
    'get no rating'
);
$rating = $match[1];
echo "rating:" . $rating . "\n";

// rating num: strip thousands separators and the "ratings" suffix
// (greedy single-line pattern, see the note on "name")
$match = require_match('/class="win-rating-total">(.*)<\//', $content, 'get no rating num');
$ratingcount = trim(str_replace(array(',', 'ratings'), '', $match[1]));
echo "Rating num:" . $ratingcount . "\n";

// price (greedy single-line pattern, see the note on "name")
$match = require_match(
    '/class="price srv_price"><span class="header-sub">(.*)<\//',
    $content,
    'get no price'
);
$price = $match[1];
echo "price:" . $price . "\n";

// category: prefer the ms.prod_sbcat meta tag, fall back to ms.prod_cat
if (!preg_match('/<meta name="ms.prod_sbcat" content="(.*)" \/>/isU', $content, $match)) {
    $match = require_match(
        '/<meta name="ms.prod_cat" content="(.*)" \/>/isU',
        $content,
        'get no category'
    );
}
$category = trim($match[1]);
echo "category:" . $category . "\n";

// content rating: optional, defaults to "all"
if (preg_match('/Content Rating: <a .*>(.*)<\//isU', $content, $match)) {
    $contentRating = trim($match[1]);
    echo "content rating:" . $contentRating . "\n";
} else {
    echo 'get no content rating' . "\n";
    $contentRating = 'all';
}

// publisher
$match = require_match(
    '/Publisher<\/dt>.*<div class="content.*".*>(.+)<\//isU',
    $content,
    'get no publisher'
);
$publisher = trim($match[1]);
echo "publisher:" . $publisher . "\n";

// works on: optional
if (preg_match('/Works on: (.*)</isU', $content, $match)) {
    $workson = trim($match[1]);
    echo 'works on:' . $workson . "\n";
} else {
    echo 'get no works platform' . "\n";
}

// size: optional
if (preg_match('/Approximate size<\/dt>.*<div class="content.*".*>(.+)<\//isU', $content, $match)) {
    $size = trim($match[1]);
    echo "size:" . $size . "\n";
} else {
    echo 'get no size' . "\n";
}

// supported processors: optional
if (preg_match('/Supported processors<\/dt>.*<div class="content.*".*>(.+)<\//isU', $content, $match)) {
    $processors = trim($match[1]);
    echo "processors:" . $processors . "\n";
} else {
    echo 'get no processors' . "\n";
}

// age rating: optional
if (preg_match('/Age rating<\/dt>.*<div class="content.*".*>(.+)<\//isU', $content, $match)) {
    $age = trim($match[1]);
    echo "age:" . $age . "\n";
} else {
    echo 'get no age' . "\n";
}

// languages: optional; each language sits in its own <div> inside the <dd>.
// Nothing is printed when the <dd> is found but contains no matching <div>.
if (preg_match('/Supported languages<\/dt>.*<dd .*>(.*)<\/dd>/isU', $content, $match)) {
    if (preg_match_all('/<div>([^<].*)<\/div>/', $match[1], $temp)) {
        $languages = implode(",", $temp[1]);
        echo "languages:" . $languages . "\n";
    }
} else {
    echo 'get no languages' . "\n";
}

// features: optional list of <li class="avoid-break"> items.
// Nothing is printed when the <ul> is found but contains no matching <li>.
if (preg_match('/class="section-title.*">Features.*<ul>(.*)<\/ul>/isU', $content, $match)) {
    if (preg_match_all('/<li class="avoid-break">(.*)<\/li>/isU', $match[1], $temp)) {
        $features = $temp[1];
        echo 'features:';
        print_r($features);
        echo "\n";
    }
} else {
    echo 'get no features' . "\n";
}

// release notes: optional
if (preg_match('/class="section-title.*">Version Notes.*<p>(.*)<\/p>/isU', $content, $match)) {
    $releasenotes = $match[1];
    echo "release Notes:" . $releasenotes . "\n";
} else {
    echo 'no version notes' . "\n";
}

// screenshots: required, all image sources in "media-img ratio-16-9" containers
if (!preg_match_all('/class="media-img ratio-16-9">.*<img src="(.*)".*\/>/isU', $content, $match)) {
    echo 'get no screenshots' . "\n";
    exit();
}
$screenshots = $match[1];
echo "screenshots:";
print_r($screenshots);
echo "\n";

// description
$match = require_match(
    '/<div class="showmore m-t-pdp">.*<p.*>(.*)<\//isU',
    $content,
    "get no description content"
);
$description = $match[1];
echo 'description:' . $description . "\n";

exit();
/**
 * Fetches $url with cURL using a desktop Chrome user agent and following redirects.
 *
 * The returned string is the response body followed by a "[url:<effective url>]"
 * marker, where the effective URL is the one cURL reports after any redirects.
 * When the transfer fails the body part is empty and only the marker is returned.
 *
 * @param string $url URL to fetch.
 * @return string Response body with the trailing URL marker.
 */
function get($url) {
    $ch = curl_init($url);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11');
    // Return the body from curl_exec() instead of printing it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);

    $output = curl_exec($ch);
    $lasturl = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
    curl_close($ch);

    // curl_exec() returns false on failure; make the empty body explicit
    // rather than relying on false-to-string coercion.
    if ($output === false) {
        $output = '';
    }

    return $output . "[url:$lasturl]";
}
// 成品站:www.topwindata.com ,windows 10 一发布,流量就翻番了,不过还是只有1000左右ip。
// 原文:http://my.oschina.net/u/1170277/blog/488977