2015-03-23

爬iPeen座標的小程式

昨天幫網友寫的一個爬iPeen座標的小程式。
http://goo.gl/R4EeIW

如果用網頁介面跑到一半會中斷,檢查Apache的 httpd.conf 及 httpd-mpm.conf
1
2
3
4
5
6
7
8
9
10
11
12
...
Timeout 300000
KeepAlive On
MaxKeepAliveRequests 100
KeepAliveTimeout 0
...
<IfModule mpm_winnt_module>
# default 150
ThreadsPerChild 350
# default 0
MaxRequestsPerChild 10000
# 8MB (for preg_match)
ThreadStackSize 8388608
...
</IfModule>

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
// Crawl iPeen search-result pages and print, for every shop found, its name,
// shop id, category, address and latitude/longitude.

// Long-running crawl: lift the execution-time limit and raise the memory cap.
ini_set('max_execution_time', 0);
ini_set('memory_limit', '2G');
// Stream output as it is produced (errors suppressed when no buffer is active).
@ob_end_flush();
@ob_implicit_flush();

// Number of search-result pages to walk.
$page = 2;

// Search-result page pattern. Captures:
//   1 = relative shop URL (/shop/<id>...), 2 = numeric shop id, 3 = shop name.
// The published pattern had been mangled by HTML extraction ("\/shop\/",
// "[\s\S]" and "[^>]" were corrupted), so it could never match; restored here.
$regexp1 = '/<h3\s+id="shop_h3_\d+"\s+class="name">(?:[\s\S]*?)<a\s+href="(\/shop\/(\d+)[^"]+)(?:[\s\S]*?)[^>]*>([\s\S]*?)<\/a>(?:[\s\S]*?)<\/h3>/i';

// Shop detail page pattern. Captures:
//   1 = latitude, 2 = longitude, 3 = category link text, 4 = address link text.
// The tag terminator "\/?>" after the latitude meta tag was corrupted; restored.
// Attribute values are captured with [^"]* instead of a greedy .* so a capture
// cannot run past the closing quote when more markup follows on the same line.
$regexp2 = '/<meta\s+property="place:location:latitude"\s+?content="([^"]*)"\s?\/?>(?:[\s\S]*?)longitude"\s+?content="([^"]*)"\s?\/?>(?:[\s\S]*?)<a[^>]+?data-action="up_small_classify"[^>]*?>([\s\S]*?)<\/a>(?:[\s\S]*?)<a[^>]+?data-action="up_address"[^>]*?>([\s\S]*?)<\/a>/i';

$www = 'http://www.ipeen.com.tw';
$www_page = $www.'/search/taiwan/000/4-7-0-0/?p=';

for($i=1;$i<=$page;$i++){
    // A failed download (get_html returns false/null) is skipped rather than
    // being fed to the regex engine.
    $listHtml = get_html($www_page.$i);
    if(!is_string($listHtml) || !preg_match_all($regexp1, $listHtml, $t)){
        continue;
    }
    foreach($t[1] as $key=>$url){
        $info = array();
        $shopHtml = get_html($www.$url);
        if(is_string($shopHtml)){
            preg_match($regexp2, $shopHtml, $info);
        }
        // When the detail page is missing or does not match, fall back to empty
        // fields so the record is still printed without undefined-index errors.
        $info += array('', '', '', '', '');
        echo trim($t[3][$key])."(".$t[2][$key].")\n".trim($info[3])."\n".trim($info[4])."\n".$info[1].", ".$info[2]."\n=================================".PHP_EOL;
    }
}
/**
 * Download the contents of a URL with an HTTP GET request.
 *
 * @param string $url Address to fetch; an empty value short-circuits.
 * @return string|false|null The response body, false when file_get_contents
 *                           fails, or null when $url is empty.
 */
function get_html($url=''){
    // Nothing to fetch.
    if (empty($url)) {
        return null;
    }

    // HTTP wrapper options: plain GET, generous redirect budget, custom User-Agent.
    $httpOptions = array(
        'method'        => 'GET',
        'max_redirects' => 200,
        'header'        => "User-Agent: 安全衛士360+hao123\r\n",
    );
    $context = stream_context_create(array('http' => $httpOptions));

    return file_get_contents($url, false, $context);
}
</a[^></a[^></meta\s+property="place:location:latitude"\s+?content="(.*)"\s?\></a\s+href="(\></h3\s+id="shop_h3_\d+"\s+class="name">