<?php
//header("Content-type:text/html;charset=utf-8");
// $baseurl = "https://avcarhjw765.vip/";
// $url = "https://avcarhjw765.vip/forum.php?mod=viewthread&tid=82161&extra=page%3D30";
//
// $html = curlget($url);
// $regular = '//*[@class="t_f"]';
// $content = xpathregular($html,$regular);
// foreach($content as $k=>$v)
// {
// print_r($v->nodeValue);
// break;
// }
//
// $titleregular = '//*[@id="thread_subject"]';
// $title = xpathregular($html,$titleregular);
// foreach($title as $k=>$v)
// {
// print_r($v->textContent);
// }
//
// die;
//header("Content-type:text/html;charset=gb18030");
// Crawl the forum list pages numbered $start..$end, follow every thread link
// found in the thread table and store, for each thread page, its title, the
// text of the first element with class "t_f" and the link's href in `lianwu`.
$start = 1;
$end = 30;
$baseurl = "https://avcarhjw765.vip/";
$findurl = "https://avcarhjw765.vip/forum.php?mod=forumdisplay&fid=86&page=";
for (; $start <= $end; $start++) {
    $url = $findurl . $start;
    $html = curlget($url);
    // curlget() returns false on a transfer error; skip the page rather than
    // handing a non-document to the parser.
    if ($html === false || $html === '') {
        continue;
    }
    $contentlist = xpathregular($html, "//*[@id='threadlisttableid']/tbody/tr/th/a/@href");
    if ($contentlist === false) {
        continue;
    }
    foreach ($contentlist as $link) {
        $crawl = $link->textContent;
        // Script pseudo-links and empty hrefs do not lead to a thread page.
        if ($crawl === '' || $crawl === 'javascript:;' || $crawl === 'javascript:void(0);') {
            continue;
        }
        $detailurl = $baseurl . $crawl;
        $detailhtml = curlget($detailurl);
        if ($detailhtml === false || $detailhtml === '') {
            continue;
        }
        // Reset per thread so a page without a match never inherits the
        // values extracted from the previously crawled thread.
        $contentdetail = '';
        $contentitle = '';
        $content = xpathregular($detailhtml, '//*[@class="t_f"]');
        if ($content !== false && $content->length > 0) {
            // Only the first matching element is stored.
            $contentdetail = $content->item(0)->nodeValue;
        }
        $title = xpathregular($detailhtml, '//*[@id="thread_subject"]');
        if ($title !== false && $title->length > 0) {
            // If several elements match, the last one wins.
            $contentitle = $title->item($title->length - 1)->textContent;
        }
        // Nothing was extracted from this page, so there is nothing worth storing.
        if ($contentitle === '' && $contentdetail === '') {
            continue;
        }
        // Scraped text is untrusted: bind it as parameters instead of
        // interpolating it into the SQL string.
        $query = "insert into lianwu(title,content,url)values(?,?,?)";
        if (db($query, array($contentitle, $contentdetail, $crawl))) {
            echo 'success';
        } else {
            echo 'fail';
        }
        echo "\r\n";
    }
}

/**
 * Run one SQL statement against the local `demo` database.
 *
 * A new connection is opened and closed on every call. Connection settings
 * (host, user, password, schema) are hard-coded below.
 *
 * @param string $query  SQL text. When $params is non-empty it must contain
 *                       one `?` placeholder per parameter.
 * @param array  $params Optional values bound, in order, as strings to the
 *                       placeholders. When empty, $query is executed as-is.
 * @return bool True when the statement executed, false when connecting,
 *              preparing or executing failed (a message is echoed).
 */
function db($query, array $params = array())
{
    // Depending on the PHP version / report mode, mysqli either returns
    // false or throws on failure; both cases are handled here.
    try {
        $con = mysqli_connect("localhost", "root", "root", "demo");
    } catch (mysqli_sql_exception $e) {
        $con = false;
    }
    if (!$con) {
        echo "连接 MySQL 失败: " . mysqli_connect_error();
        return false;
    }

    $error = null;
    try {
        mysqli_set_charset($con, "utf8");
        if (count($params) === 0) {
            if (mysqli_query($con, $query) === false) {
                $error = mysqli_error($con);
            }
        } else {
            $stmt = mysqli_prepare($con, $query);
            if ($stmt === false) {
                $error = mysqli_error($con);
            } else {
                // Every value is sent as a string; bind_param takes its
                // arguments by reference, hence the intermediate array.
                $values = array_map('strval', array_values($params));
                $types = str_repeat('s', count($values));
                if (!mysqli_stmt_bind_param($stmt, $types, ...$values)
                    || !mysqli_stmt_execute($stmt)) {
                    $error = mysqli_stmt_error($stmt);
                }
                mysqli_stmt_close($stmt);
            }
        }
    } catch (mysqli_sql_exception $e) {
        $error = $e->getMessage();
    } finally {
        mysqli_close($con);
    }

    if ($error !== null) {
        echo "MySQL query failed: " . $error . "\r\n";
        return false;
    }
    return true;
}
/**
 * Parse an HTML string and evaluate an XPath expression against it.
 *
 * @param string|false $html    HTML markup. An empty string or a non-string
 *                              value (such as the false returned by a failed
 *                              download) is treated as an empty document.
 * @param string       $regular XPath expression (not a regular expression,
 *                              despite the parameter name).
 * @return DOMNodeList|false The matching nodes, which is an empty list when
 *                           nothing matches or $html was empty; false when
 *                           the XPath expression itself is rejected.
 */
function xpathregular($html,$regular)
{
    $dom = new DOMDocument();
    if (is_string($html) && $html !== '') {
        // Real-world HTML is rarely well-formed: collect libxml's parse
        // complaints internally instead of emitting them, then restore the
        // caller's error-handling mode.
        $previous = libxml_use_internal_errors(true);
        $dom->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
        // Merge adjacent text nodes before querying.
        $dom->normalize();
    }
    // Querying an empty document simply yields an empty node list.
    $xpath = new DOMXPath($dom);
    return $xpath->query($regular);
}
/**
 * Fetch a URL with cURL and return the response body.
 *
 * Cookies are read from and written to "tmp.cookie" next to this file, a
 * fixed desktop-browser User-Agent is sent, and the whole transfer is
 * limited to 5 seconds.
 *
 * NOTE(review): TLS peer and host verification are switched off, so the
 * identity of the remote server is never checked. This is kept as-is to
 * preserve behaviour; re-enable verification if the target has a valid
 * certificate.
 *
 * @param string $url Address to request.
 * @return string|false Response body without headers, or false when the
 *                      transfer failed (a "CURL Error:" message is echoed).
 */
function curlget($url)
{
    $ch = curl_init();
    if ($ch === false) {
        echo "CURL Error:" . "curl_init failed";
        return false;
    }

    $cookiefile = dirname(__FILE__) . "/tmp.cookie";
    curl_setopt_array($ch, array(
        CURLOPT_TIMEOUT        => 5,
        CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36',
        CURLOPT_COOKIEFILE     => $cookiefile,
        CURLOPT_COOKIEJAR      => $cookiefile,
        CURLOPT_URL            => $url,
        CURLOPT_SSL_VERIFYPEER => 0,
        CURLOPT_SSL_VERIFYHOST => 0,
        // Return the body from curl_exec() instead of printing it.
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_HEADER         => 0,
    ));

    $output = curl_exec($ch);
    if ($output === false) {
        echo "CURL Error:" . curl_error($ch);
    }

    // Release the handle before returning. From PHP 8.0 on, curl_close()
    // has no effect (handles are objects freed automatically), so it is
    // only called on older versions.
    if (PHP_VERSION_ID < 80000) {
        curl_close($ch);
    }
    return $output;
}
?>
© 版权声明
文章版权归作者所有,未经允许请勿转载。
THE END