2015-03-23

爬iPeen座標的小程式

昨天幫網友寫的一個爬iPeen座標的小程式。
http://goo.gl/R4EeIW

如果用網頁介面跑到一半會中斷,請檢查 Apache 的 httpd.conf 及 httpd-mpm.conf 中的下列設定:
...
Timeout 300000
KeepAlive On
MaxKeepAliveRequests 100
KeepAliveTimeout 0
...

   # default 150
   ThreadsPerChild    350
   # default 0
   MaxRequestsPerChild    10000
   # 8MB
   ThreadStackSize 8388608
...

// Crawl script: walks the iPeen search-result pages, follows every link the
// list pattern extracts, and prints one text record per detail page.

// Remove PHP's execution-time limit and raise the memory limit so a long
// crawl is not aborted part-way through.
ini_set('max_execution_time', 0);
ini_set('memory_limit', '2G');
// Flush output as it is produced so progress is visible while the crawl runs.
// "@" silences the notice raised when no output buffer is active.
@ob_end_flush();
@ob_implicit_flush();

// Number of search-result pages to crawl (pages 1..$page).
$page = 2;
// NOTE(review): both patterns look like they lost their HTML tag fragments
// when this snippet was published (e.g. the stray "]*>" and "]+?" pieces).
// As written they define fewer capture groups than the output below reads
// ($t[2], $t[3], $info[4]). They are kept unchanged here; restore them from
// the original script before relying on the output.
$regexp1 = '/(?:[\s\S]*?)]*>([\s\S]*?)<\/a>(?:[\s\S]*?)<\/h3>/i';
$regexp2 = '/(?:[\s\S]*?)longitude"\s+?content="(.*)"\s?\/?>(?:[\s\S]*?)]+?data-action="up_small_classify"[^>]*?>([\s\S]*?)<\/a>(?:[\s\S]*?)]+?data-action="up_address"[^>]*?>([\s\S]*?)<\/a>/i';
$www = 'http://www.ipeen.com.tw';
$www_page = $www.'/search/taiwan/000/4-7-0-0/?p=';

for($i=1;$i<=$page;$i++){
    // get_html() yields null/false when the download fails; skip the page
    // instead of running the pattern against a non-string.
    $list_html = get_html($www_page.$i);
    if(!is_string($list_html)){
        error_log('Failed to fetch list page: '.$www_page.$i);
        continue;
    }
    // preg_match_all() returns 0 (no match) or false (regex error).
    if(!preg_match_all($regexp1, $list_html, $t)){
        error_log('No entries matched on list page: '.$www_page.$i);
        continue;
    }
    foreach($t[1] as $key=>$url){
        $detail_html = get_html($www.$url);
        if(!is_string($detail_html) || !preg_match($regexp2, $detail_html, $info)){
            // Do not print an empty record for a page that could not be
            // downloaded or did not match the detail pattern.
            error_log('Failed to fetch or parse detail page: '.$www.$url);
            continue;
        }
        // Capture groups that the patterns did not produce default to ''
        // so a missing field yields an empty value rather than a PHP notice.
        $field1 = isset($t[3][$key]) ? trim($t[3][$key]) : '';
        $field2 = isset($t[2][$key]) ? $t[2][$key] : '';
        $field3 = isset($info[3]) ? trim($info[3]) : '';
        $field4 = isset($info[4]) ? trim($info[4]) : '';
        $coord1 = isset($info[1]) ? $info[1] : '';
        $coord2 = isset($info[2]) ? $info[2] : '';
        echo $field1."(".$field2.")\n".$field3."\n".$field4."\n".$coord1.", ".$coord2."\n=================================".PHP_EOL;
    }
}

/**
 * Download the body of a URL with an HTTP GET request.
 *
 * The request follows up to 200 redirects and sends a fixed User-Agent
 * header.
 *
 * @param string $url Address to fetch; an empty value short-circuits.
 * @return string|false|null Whatever file_get_contents() returns for the
 *                           URL, or null when $url is empty.
 */
function get_html($url=''){
    $body = null;
    if (!empty($url)) {
        $http_options = [
            'method'        => 'GET',
            'max_redirects' => 200,
            'header'        => "User-Agent: 安全衛士360+hao123\r\n",
        ];
        $context = stream_context_create(['http' => $http_options]);
        $body = file_get_contents($url, false, $context);
    }
    return $body;
}