PHP cURL 爬取和 XPath 筛选

<?php

	//header("Content-type:text/html;charset=utf-8");
	

	// $baseurl = "https://avcarhjw765.vip/";
	// $url = "https://avcarhjw765.vip/forum.php?mod=viewthread&tid=82161&extra=page%3D30";
	// 
	// $html = curlget($url);
	// $regular = '//*[@class="t_f"]';
	// $content = xpathregular($html,$regular);
	// foreach($content as $k=>$v)
	// {
	// 	print_r($v->nodeValue);
	// 	break;
	// }
	// 
	// $titleregular = '//*[@id="thread_subject"]';
	// $title = xpathregular($html,$titleregular);
	// foreach($title as $k=>$v)
	// {
	// 	print_r($v->textContent);
	// }
	// 
	// die;





	//header("Content-type:text/html;charset=gb18030");
	// Crawl settings: forum list pages $start..$end (inclusive) are visited in order.
	$start = 1;
	$end = 30;

	// $baseurl is prefixed to every thread link found on a list page;
	// $findurl is the list-page URL, the page number is appended to it.
	$baseurl = "https://avcarhjw765.vip/";
	$findurl = "https://avcarhjw765.vip/forum.php?mod=forumdisplay&fid=86&page=";

	for ($page = $start; $page <= $end; $page++)
	{
		// Download one page of the thread list; skip the page when the download failed.
		$listhtml = curlget($findurl.$page);
		if (!is_string($listhtml) || $listhtml === '')
		{
			continue;
		}

		// Collect the href attribute of every thread link in the list table.
		$contentlist = xpathregular($listhtml, "//*[@id='threadlisttableid']/tbody/tr/th/a/@href");
		if ($contentlist === false)
		{
			continue;
		}

		foreach ($contentlist as $link)
		{
			$crawl = $link->textContent;

			// Placeholder links do not point at a thread.
			if ($crawl === 'javascript:;' || $crawl === 'javascript:void(0);')
			{
				continue;
			}

			// Download the thread page; without it there is nothing to store.
			$detailhtml = curlget($baseurl.$crawl);
			if (!is_string($detailhtml) || $detailhtml === '')
			{
				continue;
			}

			// Reset for every thread so that a page without a match never
			// inherits the title/content of the previously crawled thread.
			$contentdetail = '';
			$contentitle = '';

			// Only the first node with class "t_f" is stored as the content.
			$posts = xpathregular($detailhtml, '//*[@class="t_f"]');
			if ($posts !== false)
			{
				foreach ($posts as $post)
				{
					$contentdetail = $post->nodeValue;
					break;
				}
			}

			// The title is the text of the element with id "thread_subject"
			// (the last match wins if there are several).
			$titles = xpathregular($detailhtml, '//*[@id="thread_subject"]');
			if ($titles !== false)
			{
				foreach ($titles as $titlenode)
				{
					$contentitle = $titlenode->textContent;
				}
			}

			// Scraped text is untrusted: pass it as bound parameters instead of
			// interpolating it into the SQL string.
			$query = "insert into lianwu(title,content,url)values(?,?,?)";
			if (db($query, [$contentitle, $contentdetail, $crawl]))
			{
				echo 'success';
			}
			else
			{
				echo 'fail';
			}
			echo "\r\n";
		}
	}

	/**
	 * Run one SQL statement against the local "demo" database.
	 *
	 * A new connection is opened for the call and closed before returning.
	 *
	 * @param string $query  SQL text. When $params is non-empty it must contain
	 *                       one "?" placeholder per parameter.
	 * @param array  $params Optional values bound (as strings) to the placeholders.
	 *                       When empty, $query is executed as-is.
	 * @return bool true when the statement executed, false on any connection,
	 *              prepare or execution failure (an error message is echoed).
	 */
	function db($query, array $params = [])
	{
		$con = null;
		try
		{
			$con = mysqli_connect("localhost","root","root","demo");
			if (!$con)
			{
				echo "连接 MySQL 失败: " . mysqli_connect_error();
				return false;
			}
			mysqli_set_charset($con,"utf8");

			// No parameters: run the raw statement exactly as it was given.
			if (empty($params))
			{
				return mysqli_query($con,$query) !== false;
			}

			$stmt = mysqli_prepare($con,$query);
			if ($stmt === false)
			{
				echo "MySQL error: " . mysqli_error($con);
				return false;
			}

			// Bind every value as a string; array_values() drops any custom keys
			// so the values can be unpacked positionally.
			$params = array_values($params);
			$types = str_repeat('s', count($params));
			mysqli_stmt_bind_param($stmt, $types, ...$params);
			$ok = mysqli_stmt_execute($stmt);
			mysqli_stmt_close($stmt);

			return $ok;
		}
		catch (mysqli_sql_exception $e)
		{
			// mysqli may be configured to throw instead of returning false;
			// a missing connection means the connect call itself failed.
			echo ($con ? "MySQL error: " : "连接 MySQL 失败: ") . $e->getMessage();
			return false;
		}
		finally
		{
			if ($con)
			{
				mysqli_close($con);
			}
		}
	}

	/**
	 * Parse an HTML string and evaluate an XPath expression against it.
	 *
	 * @param string|false $html    HTML markup. An empty or non-string value
	 *                              (e.g. false from a failed download) is treated
	 *                              as an empty document.
	 * @param string       $regular XPath expression to evaluate.
	 * @return DOMNodeList|false Matching nodes (an empty list when nothing matches
	 *                           or no markup was given); false when the
	 *                           expression itself is invalid.
	 */
	function xpathregular($html,$regular)
	{
		$dom = new DOMDocument();

		// loadHTML() rejects an empty source, so only parse when there is markup.
		if (is_string($html) && trim($html) !== '')
		{
			// Real-world pages are rarely valid HTML: collect libxml parse
			// warnings internally instead of emitting them, then restore the
			// previous error-handling mode.
			$previous = libxml_use_internal_errors(true);
			$dom->loadHTML($html);
			libxml_clear_errors();
			libxml_use_internal_errors($previous);

			// Normalize the parsed document before querying it.
			$dom->normalize();
		}

		// Evaluate the expression against the (possibly empty) document.
		$xpath = new DOMXPath($dom);

		return $xpath->query($regular);
	}
	/**
	 * Download a URL with cURL and return the response body.
	 *
	 * The request uses a 5 second timeout, a desktop Chrome user agent and a
	 * cookie file named "tmp.cookie" stored next to this script.
	 *
	 * @param string $url Address to fetch.
	 * @return string|false Response body without headers, or false when the
	 *                      transfer failed (the cURL error is echoed).
	 */
	function curlget($url)
	{
		$ch = curl_init();
		if ($ch === false)
		{
			echo "CURL Error:"."curl_init failed";
			return false;
		}

		curl_setopt($ch, CURLOPT_TIMEOUT, 5); // overall timeout in seconds
		curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36');
		// Read cookies from, and write them back to, the same file.
		curl_setopt($ch, CURLOPT_COOKIEFILE, dirname(__FILE__)."/tmp.cookie");
		curl_setopt($ch, CURLOPT_COOKIEJAR, dirname(__FILE__)."/tmp.cookie");
		curl_setopt($ch,CURLOPT_URL,$url);

		// NOTE(review): certificate and host verification are switched off, so
		// HTTPS responses are not authenticated. Kept as in the original;
		// enable both if the target site has a valid certificate.
		curl_setopt($ch,CURLOPT_SSL_VERIFYPEER,0);
		curl_setopt($ch,CURLOPT_SSL_VERIFYHOST,0);

		// Return the body as a string instead of printing it, without headers.
		curl_setopt($ch,CURLOPT_RETURNTRANSFER,1);
		curl_setopt($ch,CURLOPT_HEADER,0);

		$output = curl_exec($ch);
		if ($output === FALSE)
		{
			echo "CURL Error:".curl_error($ch);
		}

		// Release the handle before returning; this call used to sit after the
		// return statement and was never reached.
		curl_close($ch);

		return $output;
	}
?>
© 版权声明
THE END
喜欢就支持一下吧
点赞15 分享
评论 抢沙发
头像
欢迎您留下宝贵的见解!
提交
头像

昵称

取消
昵称表情代码图片