今天公司PHP牛人教了PHP采集系统的原理^_^,太牛了!
代码如下
<?php
/**
 * Fetch the content of a web page with a raw HTTP/1.0 GET over a socket.
 *
 * The response is read until the server closes the connection (HTTP/1.0,
 * no keep-alive) and is then split into headers and body at the first
 * blank line.
 *
 * @param string $url Absolute http:// or https:// URL to download.
 *
 * @return array Associative array with:
 *               - 'state': "ok", "jump" (response looks like a redirect),
 *                 "timeout", "NOHOST" (bad URL or connection failure) or
 *                 "Cannot send request";
 *               - 'file': the response body, present only for "ok" and "jump".
 */
function getFileContents($url) {
    $contents = array();
    // Value only: the "User-Agent: " header name is added when the request is built.
    $user_agent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; Windows 2000; Windows XP)";
    $fsocket_timeout = 60;

    $urlparts = parse_url($url);
    if ($urlparts === false || empty($urlparts['host'])) {
        // Without a host there is nothing to connect to.
        $contents['state'] = "NOHOST";
        print "Error: invalid URL";
        return $contents;
    }

    $host = $urlparts['host'];
    $scheme = isset($urlparts['scheme']) ? strtolower($urlparts['scheme']) : "http";

    // A URL such as "http://example.com" has no path component; request the root.
    $path = (isset($urlparts['path']) && $urlparts['path'] !== "") ? $urlparts['path'] : "/";
    if (isset($urlparts['query']) && $urlparts['query'] !== "") {
        $path .= "?".$urlparts['query'];
    }

    if (isset($urlparts['port'])) {
        $port = (int) $urlparts['port'];
    } elseif ($scheme == "https") {
        $port = 443;
    } elseif ($scheme == "http") {
        $port = 80;
    } else {
        // No explicit port and no known default for this scheme.
        $contents['state'] = "NOHOST";
        print "Error: unsupported URL scheme";
        return $contents;
    }

    // The Host header carries the port only when it is not the scheme's default.
    $is_default_port = ($scheme == "https") ? ($port == 443) : ($port == 80);
    $portq = $is_default_port ? "" : ":$port";

    $request = "GET $path HTTP/1.0\r\n"
        ."Host: $host$portq\r\n"
        ."Accept: */*\r\n"
        ."Accept-Encoding: identity\r\n"
        ."User-Agent: $user_agent\r\n"
        ."\r\n";

    $target = ($scheme == "https") ? "ssl://".$host : $host;
    $errno = 0;
    $errstr = "";
    // Warnings are suppressed on purpose: failure is reported through $errstr and 'state'.
    $fp = @fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
    if (!$fp) {
        $contents['state'] = "NOHOST";
        print "Error: $errstr";
        return $contents;
    }

    if (!fputs($fp, $request)) {
        fclose($fp);
        $contents['state'] = "Cannot send request";
        return $contents;
    }

    stream_set_timeout($fp, $fsocket_timeout);
    $data = "";
    $timed_out = false;
    while (!feof($fp)) {
        $chunk = fgets($fp, 8192);
        if ($chunk !== false) {
            $data .= $chunk;
        }
        // The timeout flag must be re-read after every read; a snapshot taken
        // before the loop would never change and a stalled server would hang us.
        $meta = stream_get_meta_data($fp);
        if (!empty($meta['timed_out'])) {
            $timed_out = true;
            break;
        }
        if ($chunk === false) {
            break;
        }
    }
    fclose($fp);

    if ($timed_out) {
        $contents['state'] = "timeout";
        return $contents;
    }

    // Headers and body are separated by the first empty line.
    $separator = strpos($data, "\r\n\r\n");
    if ($separator === false) {
        $headers = $data;
        $body = "";
    } else {
        $headers = substr($data, 0, $separator);
        $body = (string) substr($data, $separator + 4);
    }

    // Redirect heuristic: both markers must appear in the headers (not in the body).
    $is_jump = stripos($headers, "Location: ") !== false
        && stripos($headers, "Cache-Control: private") !== false;

    $contents['state'] = $is_jump ? "jump" : "ok";
    $contents['file'] = $body;
    return $contents;
}
/**
 * Check whether the file behind a URL is available and in a readable form.
 *
 * Sends an HTTP/1.1 HEAD request over a raw socket and inspects the status
 * line and the response headers.
 *
 * @param string $url Absolute http:// or https:// URL to probe.
 *
 * @return array Associative array with:
 *               - 'state': "ok", "NOHOST", "Unreachable: http <code>",
 *                 "Relocation: http <code>", "Not text or html" or
 *                 "Timed out (no reply from server)";
 *               - 'content': "text", "pdf" or "doc" (only when state is "ok");
 *               - 'path': redirect target (only for "Relocation" states);
 *               - 'date': Last-Modified value, when the server sent one.
 */
function url_status($url) {
    $status = array();
    // Value only: the "User-Agent: " header name is added when the request is built.
    $user_agent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; Windows 2000; Windows XP)";
    $fsocket_timeout = 60;

    $urlparts = parse_url($url);
    if ($urlparts === false || empty($urlparts['host'])) {
        $status['state'] = "NOHOST";
        return $status;
    }

    $host = $urlparts['host'];
    $scheme = isset($urlparts['scheme']) ? strtolower($urlparts['scheme']) : "http";

    // A URL such as "http://example.com" has no path component; request the root.
    $path = (isset($urlparts['path']) && $urlparts['path'] !== "") ? $urlparts['path'] : "/";
    if (isset($urlparts['query']) && $urlparts['query'] !== "") {
        $path .= "?".$urlparts['query'];
    }

    if (isset($urlparts['port'])) {
        $port = (int) $urlparts['port'];
    } elseif ($scheme == "https") {
        $port = 443;
    } elseif ($scheme == "http") {
        $port = 80;
    } else {
        // No explicit port and no known default for this scheme.
        $status['state'] = "NOHOST";
        return $status;
    }

    // The Host header carries the port only when it is not the scheme's default.
    $is_default_port = ($scheme == "https") ? ($port == 443) : ($port == 80);
    $portq = $is_default_port ? "" : ":$port";

    // "Connection: close" keeps an HTTP/1.1 server from holding the socket open
    // after the headers, which would otherwise stall the reads below.
    $request = "HEAD $path HTTP/1.1\r\n"
        ."Host: $host$portq\r\n"
        ."Accept: */*\r\n"
        ."Accept-Charset: iso-8859-1\r\n"
        ."Accept-Encoding: identity\r\n"
        ."User-Agent: $user_agent\r\n"
        ."Connection: close\r\n"
        ."\r\n";

    $target = ($scheme == "https") ? "ssl://".$host : $host;
    $errno = 0;
    $errstr = "";
    // Warnings are suppressed on purpose: failure is reported through 'state'.
    $fp = @fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
    if (!$fp) {
        $status['state'] = "NOHOST";
        return $status;
    }

    stream_set_timeout($fp, $fsocket_timeout);
    fputs($fp, $request);

    // Status line, e.g. "HTTP/1.1 200 OK": keep the full code and its first digit.
    $answer = fgets($fp, 4096);
    $httpcode = null;
    $full_httpcode = null;
    $regs = array();
    if ($answer !== false && preg_match('#HTTP/[0-9.]+ (([0-9])[0-9]{2})#', $answer, $regs)) {
        $httpcode = $regs[2];
        $full_httpcode = $regs[1];
        if ($httpcode !== "2" && $httpcode !== "3") {
            fclose($fp);
            $status['state'] = "Unreachable: http $full_httpcode";
            return $status;
        }
    }

    // Read the header lines up to the blank line that terminates them.
    $location = null;
    $content_type = null;
    while ($answer !== false) {
        $answer = fgets($fp, 4096);
        if ($answer === false || trim($answer) === "") {
            break;
        }
        // Patterns are anchored so that e.g. "Content-Location:" is not taken for "Location:".
        if ($location === null && preg_match('/^Location: *([^\n\r ]+)/i', $answer, $regs)) {
            $location = $regs[1];
        }
        if (preg_match('/^Last-Modified: *([a-z0-9,: ]+)/i', $answer, $regs)) {
            $status['date'] = $regs[1];
        }
        if ($content_type === null && preg_match('#^Content-Type: *([a-z/]*)#i', $answer, $regs)) {
            $content_type = strtolower($regs[1]);
        }
    }
    $socket_status = stream_get_meta_data($fp);
    fclose($fp);

    // A 3xx answer other than 302 that names a new location is reported as a relocation.
    if ($location !== null && $httpcode === "3" && $full_httpcode !== "302") {
        $status['path'] = $location;
        $status['state'] = "Relocation: http $full_httpcode";
        return $status;
    }

    // MIME types that are considered readable, mapped to the reported content kind.
    $readable_types = array(
        'text/html' => 'text',
        'text/' => 'text',
        'text/plain' => 'text',
        'application/pdf' => 'pdf',
        'application/msword' => 'doc',
    );

    if ($content_type !== null) {
        if (isset($readable_types[$content_type])) {
            $status['content'] = $readable_types[$content_type];
            $status['state'] = 'ok';
        } else {
            $status['state'] = "Not text or html";
        }
    } elseif (!empty($socket_status['timed_out'])) {
        $status['state'] = "Timed out (no reply from server)";
    } else {
        $status['state'] = "Not text or html";
    }
    return $status;
}
// ---------------------------------------------------------------------------
// Crawl driver: walks the list pages from the last one down to the first,
// extracts the article links from each list page and fetches the article.
// ---------------------------------------------------------------------------

// Site root; list-page and article URLs are built by appending to it.
$host = 'http://www.admin5.com';
// Marker that starts every article entry on a list page.
$list_exp = '<div class="itembox"';
// Markers surrounding the article link inside an entry.
$url_start = '<a href="';
$url_end = '" target=';
// Markers for the title and summary on an article page (not used below).
$detail_title_start = '<h1>';
$detail_title_end = '</h1>';
$detail_summary_start = '<div id="arctext">';
// NOTE(review): identical to $detail_summary_start - looks like a copy/paste
// slip; the real closing marker should be confirmed against the page markup.
$detail_summary_end = '<div id="arctext">';
$max_page = 179;

for ($page = $max_page; $page > 0; $page--) {
    $url = $host."/browse/26/list_".$page.".shtml";
    $status = url_status($url);

    // 'content' is only present when the probe succeeded, so test for it first.
    if (!isset($status['content'], $status['state'])
        || $status['content'] != 'text'
        || $status['state'] != 'ok') {
        continue;
    }

    $files = getFileContents($url);
    if (!isset($files['file'])) {
        // Download failed or timed out: nothing to parse on this page.
        continue;
    }
    $contents = $files['file'];

    // Element 0 is the markup before the first entry, so start at 1.
    $arr = explode($list_exp, $contents);
    $entry_count = count($arr);
    for ($i = 1; $i < $entry_count; $i++) {
        $link = strstr($arr[$i], $url_start);
        if ($link === false) {
            continue; // entry without a link
        }
        $link = (string) substr($link, strlen($url_start));
        $pos = strpos($link, $url_end);
        if ($pos === false) {
            continue; // link is not terminated the expected way
        }
        $detail_url = $host.substr($link, 0, $pos);

        $summary = getFileContents($detail_url);
        print_r($summary);
        // Stops after the first fetched article (demo/debug behaviour kept as is).
        exit;
    }
}
?>