首页>>技术分享>>php杂谈>php curl爬取和xpath筛选

php curl爬取和xpath筛选

大路 php杂谈 2023-10-14 158
<?php



	//header("Content-type:text/html;charset=utf-8");

	



	// $baseurl = "https://avcarhjw765.vip/";

	// $url = "https://avcarhjw765.vip/forum.php?mod=viewthread&tid=82161&extra=page%3D30";

	// 

	// $html = curlget($url);

	// $regular = '//*[@class="t_f"]';

	// $content = xpathregular($html,$regular);

	// foreach($content as $k=>$v)

	// {

	// 	print_r($v->nodeValue);

	// 	break;

	// }

	// 

	// $titleregular = '//*[@id="thread_subject"]';

	// $title = xpathregular($html,$titleregular);

	// foreach($title as $k=>$v)

	// {

	// 	print_r($v->textContent);

	// }

	// 

	// die;











	//header("Content-type:text/html;charset=gb18030");

	// Inclusive range of forum list pages to crawl.
	$start = 1;
	$end = 30;

	// Site root (prefixed to relative thread links) and the list-page URL
	// to which the page number is appended.
	$baseurl = "https://avcarhjw765.vip/";
	$findurl = "https://avcarhjw765.vip/forum.php?mod=forumdisplay&fid=86&page=";

	// XPath expressions used on the list page and on each thread page.
	$listregular  = "//*[@id='threadlisttableid']/tbody/tr/th/a/@href";
	$postregular  = '//*[@class="t_f"]';
	$titleregular = '//*[@id="thread_subject"]';

	for ($page = $start; $page <= $end; $page++)
	{
		$listhtml = curlget($findurl.$page);
		if ($listhtml === false || $listhtml === '')
		{
			// curlget() has already printed the cURL error; try the next page.
			continue;
		}

		$contentlist = xpathregular($listhtml, $listregular);
		if (!$contentlist)
		{
			continue;
		}

		foreach ($contentlist as $link)
		{
			$crawl = $link->textContent;

			// Skip empty hrefs and javascript pseudo-links
			// ("javascript:;", "javascript:void(0);", ...).
			if ($crawl === '' || stripos($crawl, 'javascript:') === 0)
			{
				continue;
			}

			// Relative links are resolved against the site root; absolute
			// links are used unchanged.
			$detailurl = preg_match('#^https?://#i', $crawl) ? $crawl : $baseurl.$crawl;

			$detailhtml = curlget($detailurl);
			if ($detailhtml === false || $detailhtml === '')
			{
				// Do not store a row for a thread that could not be fetched.
				continue;
			}

			// Reset per thread so a page without a match never inherits the
			// previous thread's title or content.
			$contentdetail = '';
			$contentitle   = '';

			// Only the first post body (first "t_f" node) is stored.
			$content = xpathregular($detailhtml, $postregular);
			if ($content && $content->length > 0)
			{
				$contentdetail = $content->item(0)->nodeValue;
			}

			// If several nodes match the title expression the last one wins.
			$title = xpathregular($detailhtml, $titleregular);
			if ($title)
			{
				foreach ($title as $node)
				{
					$contentitle = $node->textContent;
				}
			}

			// Scraped text is untrusted: bind it as parameters instead of
			// interpolating it into the SQL string.
			$query = "insert into lianwu(title,content,url)values(?,?,?)";
			if (db($query, array($contentitle, $contentdetail, $crawl)))
			{
				echo 'success';
			}
			else
			{
				echo 'failed';
			}
			echo "\r\n";
		}
	}

	/**
	 * Execute one SQL statement against the local "demo" MySQL database.
	 *
	 * A connection is opened and closed on every call. When $params is empty
	 * the query text is sent as-is (the original behaviour); otherwise the
	 * query is prepared and each "?" placeholder is bound, in order, to the
	 * corresponding value as a string.
	 *
	 * @param string $query  SQL text, with "?" placeholders when $params is given.
	 * @param array  $params Optional values to bind to the placeholders.
	 * @return bool True when the statement executed, false on any failure
	 *              (the error is echoed).
	 */
	function db($query, $params = array())
	{
		// Depending on the mysqli report mode a failure is either a false
		// return value or a mysqli_sql_exception; handle both.
		try
		{
			$con = mysqli_connect("localhost","root","root","demo");
		}
		catch (mysqli_sql_exception $e)
		{
			$con = false;
		}

		if (!$con)
		{
			echo "连接 MySQL 失败: " . mysqli_connect_error();
			return false;
		}

		$ok = false;
		try
		{
			mysqli_set_charset($con,"utf8");

			if (empty($params))
			{
				$ok = (mysqli_query($con,$query) !== false);
			}
			else
			{
				$stmt = mysqli_prepare($con,$query);
				if ($stmt !== false)
				{
					$values = array_values($params);
					$types  = str_repeat('s', count($values));
					mysqli_stmt_bind_param($stmt, $types, ...$values);
					$ok = mysqli_stmt_execute($stmt);
					mysqli_stmt_close($stmt);
				}
			}

			if (!$ok)
			{
				echo "MySQL error: " . mysqli_error($con) . "\r\n";
			}
		}
		catch (mysqli_sql_exception $e)
		{
			echo "MySQL error: " . $e->getMessage() . "\r\n";
			$ok = false;
		}

		mysqli_close($con);

		return $ok;
	}



	/**
	 * Parse an HTML string and evaluate an XPath expression against it.
	 *
	 * Empty, whitespace-only or non-string input (for example the false that
	 * curlget() returns on failure) is treated as an empty document, so the
	 * result is an empty node list rather than an error from loadHTML().
	 *
	 * @param string|false $html    HTML markup to parse.
	 * @param string       $regular XPath expression to evaluate.
	 * @return DOMNodeList|false Matching nodes, or whatever DOMXPath::query()
	 *                           returns for an expression it cannot evaluate.
	 */
	function xpathregular($html,$regular)
	{
		$dom = new DOMDocument();

		if (is_string($html) && trim($html) !== '')
		{
			// Real-world HTML is rarely well-formed; collect libxml's parse
			// warnings internally instead of silencing them with "@", then
			// restore the caller's previous setting.
			$previous = libxml_use_internal_errors(true);
			$dom->loadHTML($html);
			libxml_clear_errors();
			libxml_use_internal_errors($previous);
		}

		// Normalize the document (merges adjacent text nodes).
		$dom->normalize();

		// Wrap the document in a DOMXPath object and run the query.
		$xpath = new DOMXPath($dom);

		return $xpath->query($regular);
	}

	/**
	 * Fetch a URL with cURL and return the response body.
	 *
	 * Cookies are read from and written to "tmp.cookie" next to this script.
	 * On failure the cURL error is echoed and false is returned.
	 *
	 * @param string $url URL to request.
	 * @return string|false Response body, or false when the request failed.
	 */
	function curlget($url)
	{
		$ch = curl_init();
		if ($ch === false)
		{
			echo "CURL Error:curl_init failed";
			return false;
		}

		$cookiefile = dirname(__FILE__)."/tmp.cookie";

		curl_setopt_array($ch, array(
			CURLOPT_URL            => $url,
			CURLOPT_TIMEOUT        => 5, // whole-request timeout in seconds
			CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36',
			CURLOPT_COOKIEFILE     => $cookiefile,
			CURLOPT_COOKIEJAR      => $cookiefile,
			// NOTE(review): TLS certificate and host verification are turned
			// off, as in the original. This exposes the crawl to
			// man-in-the-middle tampering; enable both if the target site
			// serves a valid certificate.
			CURLOPT_SSL_VERIFYPEER => 0,
			CURLOPT_SSL_VERIFYHOST => 0,
			CURLOPT_RETURNTRANSFER => 1, // return the body instead of printing it
			CURLOPT_HEADER         => 0, // exclude response headers from the body
		));

		$output = curl_exec($ch);

		if ($output === false)
		{
			echo "CURL Error:".curl_error($ch);
		}

		// Release the handle before returning; the original call sat after
		// the return statement and could never run. From PHP 8.0 on
		// curl_close() has no effect and the handle is freed when $ch goes
		// out of scope, so the call is only made on older versions.
		if (PHP_VERSION_ID < 80000)
		{
			curl_close($ch);
		}

		return $output;
	}

?>


标签: