PHP curl() 获取网页内容

技术笔记 - PHP - PHP类库

发布时间:2020-02-03 14:43:55

admin 于  2020-02-03 14:43:55 编辑

/**
 * Fetch the body of a URL with cURL, following HTTP redirects manually.
 *
 * Redirects are followed by hand (instead of CURLOPT_FOLLOWLOCATION) and are
 * limited to three hops per call so a redirect loop cannot run forever.
 *
 * Usage:
 *   $html  = curl_get_file_contents($url);
 *   $title = get_title_contents($html);
 *   var_dump($title);
 *
 * @param string $url     URL to request.
 * @param string $referer Value sent as the Referer header; it is kept for
 *                        every redirect hop. Empty by default.
 * @return string|false   Response body of the final response (headers
 *                        stripped). If a redirect response carries no usable
 *                        http(s) target, the body of that redirect response
 *                        is returned. false when the transfer fails or more
 *                        than three redirects are chained.
 */
function curl_get_file_contents($url,$referer='') {
	// Upper bound on redirect hops; required to avoid infinite redirect loops.
	// Counted per call, so one call's redirects never affect a later call.
	$max_redirects = 3;
	$redirects = 0;
	$redirect_codes = array(301, 302, 303, 307, 308);
	$useragent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36";

	while (true) {
		$ch = curl_init();
		if ($ch === false) {
			return false;
		}
		curl_setopt($ch,CURLOPT_URL,$url);
		// Headers are included in the output so their size can be measured
		// and stripped below.
		curl_setopt($ch,CURLOPT_HEADER,true);
		// Certificate and host name verification are disabled.
		// NOTE(review): this makes HTTPS requests open to man-in-the-middle
		// attacks; kept only because existing callers may rely on it.
		curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
		curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
		curl_setopt($ch,CURLOPT_USERAGENT,$useragent);
		curl_setopt($ch,CURLOPT_RETURNTRANSFER,true);
		curl_setopt($ch,CURLOPT_REFERER,$referer);

		$response = curl_exec($ch);
		$http_code = curl_getinfo($ch,CURLINFO_HTTP_CODE);
		// Total size of all header blocks (including "100 Continue" or proxy
		// CONNECT responses), which splitting on the first blank line misses.
		$header_size = curl_getinfo($ch,CURLINFO_HEADER_SIZE);
		// With FOLLOWLOCATION off, libcurl reports the absolute URL the
		// Location header points to, resolving relative targets and ports.
		$next_url = curl_getinfo($ch,CURLINFO_REDIRECT_URL);
		curl_close($ch);

		if ($response === false) {
			return false;
		}
		$data = (string) substr($response, (int) $header_size);

		if (!in_array($http_code, $redirect_codes, true)) {
			return $data;
		}
		// Only follow redirects that stay on http/https; a redirect to
		// file://, gopher:// etc. must not be fetched on the server's behalf.
		if (!is_string($next_url) || !preg_match('#^https?://#i', $next_url)) {
			return $data;
		}
		if ($redirects >= $max_redirects) {
			return false;
		}
		$redirects++;
		$url = $next_url;
	}
}
/**
 * Extract the page title from an HTML document and return it as UTF-8.
 *
 * The <head> section is located first; the charset declared in a <meta> tag
 * (either <meta charset="..."> or the http-equiv Content-Type form) is used
 * to convert the <title> text to UTF-8, after which HTML entities are decoded.
 *
 * @param string $html Raw HTML of the page.
 * @return string The decoded title, or a fixed message string when the
 *                <head> section or the <title> element cannot be found.
 */
function get_title_contents($html) {
	// Cast so that a failed fetch (false/null) is treated as an empty document.
	$html = (string) $html;

	// Isolate the <head> section. "\b" keeps <header> from matching.
	if (!preg_match('/<head\b[^>]*>(.*?)<\/head>/is', $html, $htmlHeaders)) {
		return "无法解析数据中的 <*head> 区段";
	}
	$head = $htmlHeaders[1];

	// Read the text of <title>; attributes and multi-line titles are allowed.
	if (!preg_match('/<title\b[^>]*>(.*?)<\/title>/is', $head, $htmlTitles)) {
		return "无法解析 <*title> 的内容";
	}
	$title = $htmlTitles[1];

	// Read the charset declared by a <meta> tag, e.g.
	//   <meta charset="gb2312">
	//   <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
	// Quotes of either kind around the value are optional.
	$charset = "None";
	if (preg_match('/<meta\b[^>]*\bcharset\s*=\s*["\']?([a-zA-Z0-9_-]+)/i', $head, $results)) {
		$charset = $results[1];
	}

	// Convert the title to UTF-8 unless it is already declared as UTF-8.
	$is_utf8 = (strcasecmp($charset, 'UTF-8') === 0 || strcasecmp($charset, 'UTF8') === 0);
	if ($charset !== "None" && !$is_utf8) {
		$converted = iconv($charset, "UTF-8", $title);
		// iconv returns false for an unknown charset or illegal input;
		// keep the raw title rather than returning nothing.
		if ($converted !== false) {
			$title = $converted;
		}
	}

	// Explicit flags and encoding: the defaults differ between PHP versions.
	return html_entity_decode($title, ENT_QUOTES | ENT_HTML5, 'UTF-8');
}

转载声明:本站文章无特别说明,皆为原创,版权所有,转载请注明:Dy大叔的日常

转载自 PHP curl() 获取网页内容 | XDY.ME@Dy大叔的日常

XDY.ME@Dy大叔的日常