与大家分享一些php常用的采集函数,专业页面专用了
最近想给自己的网站增加个内容抓取功能,有现成函数的朋友发几个给我
最近想给自己的网站增加个内容抓取功能,有现成函数的朋友发几个给我
答案:
// ####################### Read a file or URL into a string #######################
/**
 * Read a local file or URL and return its whole contents as one string.
 *
 * @param string $url Path or URL that file() can open.
 * @return string The concatenated contents; an empty string for an empty file.
 *
 * Terminates the script with die() when the source cannot be read.
 */
function openfile($url)
{
    // Read exactly once: calling file() twice would fetch a remote URL twice.
    $lines = file($url);

    // Compare against false explicitly. file() returns an empty array for an
    // empty but readable file, which must not be reported as a failure.
    if ($lines === false) {
        die("文件打开失败!");
    }

    // file() keeps the line terminators, so joining the lines restores the
    // original content byte for byte.
    return implode('', $lines);
}
// ####################### Split a string #######################
/**
 * Return the text that sits between two markers.
 *
 * The text is split on every occurrence of $start; the piece that follows the
 * first occurrence is then cut off at the first occurrence of $end. If $end is
 * not present in that piece, the whole piece is returned.
 *
 * @param string $start Opening marker.
 * @param string $end   Closing marker.
 * @param string $file  Text to search.
 * @return string The extracted text, or an empty string when $start does not
 *                occur or either marker is empty.
 */
function cut($start,$end,$file)
{
    // explode() rejects an empty delimiter, so there is nothing to extract.
    if ((string) $start === '' || (string) $end === '') {
        return '';
    }

    $pieces = explode($start, $file);

    // No opening marker: avoid reading an undefined offset.
    if (!isset($pieces[1])) {
        return '';
    }

    $beforeEnd = explode($end, $pieces[1]);
    return $beforeEnd[0];
}
// ####################### Remove junk code #######################
/**
 * Remove a marker-delimited region (markers included) from a text.
 *
 * The inner text is located with cut(). The region is removed as a single
 * unit ($start + inner text + $end), so identical text that merely happens to
 * appear elsewhere in $content (or inside the markers) is left untouched.
 * Empty "$start$end" pairs are removed as well.
 *
 * @param string $start   Opening marker.
 * @param string $end     Closing marker.
 * @param string $content Text to clean.
 * @return string The text without the delimited region.
 */
function del($start,$end,$content)
{
    $inner = cut($start, $end, $content);

    // Replace the whole region at once. Removing $inner on its own would also
    // delete every other occurrence of that text in the document.
    $content = str_replace($start . $inner . $end, "", $content);

    // Drop marker pairs that enclose nothing.
    $content = str_replace($start . $end, "", $content);

    return $content;
}
// ####################### Extract the domain name #######################
/**
 * Extract the host part of an http(s) URL, without a leading "www.".
 *
 * The scheme and everything from the first "/" after the host are dropped.
 * Input that contains no http(s) URL is returned unchanged apart from the
 * "www." prefix removal.
 *
 * @param string $url URL to analyse.
 * @return string Host name (including a port, if one was given).
 */
function getname($url)
{
    $host = preg_replace("/https?:\/\/([^\/]+).*/i", "\\1", $url);

    // Strip "www." only as a prefix; removing it anywhere would mangle hosts
    // such as "awww.example.com".
    if (strncmp($host, "www.", 4) === 0) {
        $host = (string) substr($host, 4);
    }

    return $host;
}
// ####################### Strip HTML <table> blocks #######################
/**
 * Delete every <table ...>...</table> span from the given markup.
 *
 * Matching is case-insensitive, spans line breaks, and is non-greedy: each
 * opening tag is paired with the nearest following closing tag.
 *
 * @param string $content HTML markup.
 * @return string|null Markup without the matched spans (whatever
 *                     preg_replace() yields).
 */
function clstable($content)
{
    $tablePattern = "/<table[^>]*?>.*?<\/table>/si";

    return preg_replace($tablePattern, "", $content);
}
// ####################### Strip HTML <script> blocks #######################
/**
 * Delete every <script ...>...</script> span from the given markup.
 *
 * Matching is case-insensitive, spans line breaks, and is non-greedy.
 *
 * @param string $content HTML markup.
 * @return string|null Markup without the matched spans (whatever
 *                     preg_replace() yields).
 */
function clsscript($content)
{
    $clscontent = preg_replace("/<script[^>]*?>.*?<\/script>/si", "", $content);
    return $clscontent;
}
// ####################### Strip HTML <div> blocks #######################
/**
 * Delete every <div ...>...</div> span from the given markup.
 *
 * Matching is case-insensitive, spans line breaks, and is non-greedy: each
 * opening tag is paired with the nearest following closing tag.
 *
 * @param string $content HTML markup.
 * @return string|null Markup without the matched spans (whatever
 *                     preg_replace() yields).
 */
function clsdiv($content)
{
    $divPattern = "/<div[^>]*?>.*?<\/div>/si";

    return preg_replace($divPattern, "", $content);
}
// ####################### Strip HTML <iframe> blocks #######################
/**
 * Delete every <iframe ...>...</iframe> span from the given markup.
 *
 * Matching is case-insensitive, spans line breaks, and is non-greedy.
 *
 * @param string $content HTML markup.
 * @return string|null Markup without the matched spans (whatever
 *                     preg_replace() yields).
 */
function clsifr($content)
{
    $clscontent = preg_replace("/<IFRAME[^>]*?>.*?<\/IFRAME>/si", "", $content);
    return $clscontent;
}
// ####################### Strip HTML <tr> / <td> markup #######################
/**
 * Remove table rows and cells from the given markup.
 *
 * Complete <td>...</td> spans are deleted first (content included), then
 * complete <tr>...</tr> spans, and finally any unpaired opening or closing
 * <tr>/<td> tags that are left over. Matching is case-insensitive.
 *
 * @param string $content HTML markup.
 * @return string|null Markup without row/cell markup (whatever
 *                     preg_replace() yields).
 */
function clstrtd($content)
{
    // Order matters: paired elements go first, stray tags afterwards.
    $patterns = array(
        "/<td[^>]*?>.*?<\/td>/si",
        "/<tr[^>]*?>.*?<\/tr>/si",
        "/<tr[^>]*?>/si",
        "/<td[^>]*?>/si",
        "/<\/tr>/si",
        "/<\/td>/si",
    );

    $clscontent = $content;
    foreach ($patterns as $pattern) {
        $clscontent = preg_replace($pattern, "", $clscontent);
    }

    return $clscontent;
}
// ####################### Strip HTML hyperlinks #######################
/**
 * Delete every hyperlink (<a ...>...</a>, link text included) from the markup.
 *
 * Matching is case-insensitive, spans line breaks, and is non-greedy.
 *
 * @param string $content HTML markup.
 * @return string|null Markup without hyperlinks (whatever preg_replace()
 *                     yields).
 */
function clsa($content)
{
    // The lookahead requires whitespace or ">" right after "<a", so tags that
    // merely start with "a" (<abbr>, <address>, <article>, ...) are not treated
    // as link openers and swallowed up to the next </a>.
    $clscontent = preg_replace("/<a(?=[\s>])[^>]*?>.*?<\/a>/si", "", $content);
    return $clscontent;
}
// ####################### Strip all HTML markup #######################
/**
 * Reduce an HTML fragment to plain text.
 *
 * Steps, applied in order over the whole string:
 *   1. remove <script> blocks,
 *   2. remove all remaining tags,
 *   3. collapse whitespace that follows a line break into that line break,
 *   4. decode a fixed set of named/numeric entities,
 *   5. decode any remaining decimal numeric entity (&#NNN;).
 *
 * NOTE(review): entities are decoded to single bytes via chr(), as before;
 * code points above 255 wrap modulo 256. Confirm the expected output encoding
 * before relying on this for non-Latin-1 text.
 *
 * @param string $content HTML markup.
 * @return string|null Plain text (NULL if the regex engine reports an error).
 */
function clearhtml($content)
{
    $search = array(
        "'<script[^>]*?>.*?</script>'si", // strip javascript
        "'<[\/\!]*?[^<>]*?>'si",          // strip HTML tags
        "'([\r\n])[\s]+'",                // strip whitespace after a line break
        "'&(quot|#34);'i",                // decode HTML entities
        "'&(amp|#38);'i",
        "'&(lt|#60);'i",
        "'&(gt|#62);'i",
        "'&(nbsp|#160);'i",
        "'&(iexcl|#161);'i",
        "'&(cent|#162);'i",
        "'&(pound|#163);'i",
        "'&(copy|#169);'i",
    );

    $replace = array(
        "",
        "",
        "\\1",
        "\"",
        "&",
        "<",
        ">",
        " ",
        chr(161),
        chr(162),
        chr(163),
        chr(169),
    );

    $text = preg_replace($search, $replace, $content);
    if ($text === null) {
        return $text;
    }

    // The /e (eval) modifier no longer exists in PHP 7+, so the remaining
    // numeric entities are decoded through a callback instead. This also
    // avoids evaluating generated code.
    $text = preg_replace_callback(
        "'&#(\d+);'",
        function ($match) {
            // Mask to one byte: chr() only represents values 0..255.
            return chr(((int) $match[1]) & 0xFF);
        },
        $text
    );

    return $text;
}
// ####################### Write a cache file #######################
function writetocache($cachedir,$cachename, $cachedata = '') {$cachedir = './'.$cachedir.'/';
$cachefile = $cachedir.$cachename.'.php';
if(!is_dir($cachedir)) {
@mkdir($cachedir, 0777);
}
if(!is_dir($cachedir)) {
@mkdir($cachedir, 0777);
}
if(@$fp = fopen($cachefile, 'wb')) {
@fwrite($fp, $cachedata);
 
上一个:怎么查php脚本速度慢的原因?
下一个:PHP多数数组循环方面的问题请教