当前位置:编程问答 > php >

与大家分享一些php常用的采集函数,专业页面专用了

问题:最近想给自己的网站增加个内容抓取功能,有现成函数的朋友发几个给我
答案:

// ####################### Read a file/URL into a single string #######################
/**
 * Reads the whole resource at $url and returns it as one string.
 *
 * The resource is read exactly once. A readable but empty resource yields
 * an empty string rather than being reported as a failure.
 *
 * @param string $url Local path or URL readable through PHP's stream wrappers.
 * @return string The complete contents.
 *
 * Terminates the script with an error message when the resource cannot be read.
 */
function openfile($url)
{
$file = file_get_contents($url);

// Strict comparison: '' and '0' are valid contents, only false means failure.
if ($file === false) {
       die("文件打开失败!");
}

return $file;
}

// ####################### Extract the text between two markers #######################
/**
 * Returns the text that follows the first occurrence of $start, up to the
 * next occurrence of $end (or of $start, whichever comes first; if neither
 * occurs again, up to the end of $file).
 *
 * @param string $start Opening marker.
 * @param string $end   Closing marker.
 * @param string $file  Text to search.
 * @return string The extracted fragment, or '' when $start does not occur
 *                or either marker is empty.
 */
function cut($start,$end,$file){
        // explode() rejects an empty separator, so treat it as "nothing to cut".
        if ((string) $start === '' || (string) $end === '') {
                return '';
        }

        $pieces = explode($start, $file);
        // No second piece means $start never occurred in $file.
        if (!isset($pieces[1])) {
                return '';
        }

        $pieces = explode($end, $pieces[1]);
        return $pieces[0];
}

// ####################### Remove a delimited junk fragment #######################
/**
 * Removes the fragment delimited by $start and $end, markers included,
 * from $content.
 *
 * The fragment is located with cut(). Only the full "$start + inner text +
 * $end" sequence is removed, so text that merely equals the inner text
 * elsewhere in $content is left alone. If the sequence does not occur
 * (for example, $end never follows $start), $content is returned unchanged.
 *
 * @param string $start   Opening marker.
 * @param string $end     Closing marker.
 * @param string $content Text to clean.
 * @return string $content without the delimited fragment.
 */
function del($start,$end,$content){
       $inner = cut($start,$end,$content);
       // When $inner is '' this still strips an empty "$start$end" pair.
       return str_replace($start.$inner.$end, "", $content);
}
// ####################### Extract the domain name from a URL #######################

/**
 * Reduces an http/https URL to its host part and drops every "www."
 * substring from the result.
 *
 * Input that contains no "http://" or "https://" prefix is not rewritten by
 * the pattern; it only has "www." removed.
 *
 * @param string $url URL to analyse.
 * @return string Host name without "www.".
 */
function getname($url)
{
return str_replace(
    "www.",
    "",
    preg_replace("/https?:\/\/([^\/]+).*/i", "\\1", $url)
);
}


// ####################### Strip HTML <table> blocks #######################
/**
 * Deletes every <table ...> ... </table> span from $content.
 *
 * Matching is case-insensitive, spans line breaks, and is non-greedy, so
 * each match ends at the first closing </table> after the opening tag.
 *
 * @param string $content HTML text.
 * @return string $content with the matched spans removed.
 */
function clstable($content)
{
return preg_replace("/<table[^>]*?>.*?<\/table>/si", "", $content);
}
// ####################### Strip HTML <script> blocks #######################

/**
 * Deletes every <script ...> ... </script> span from $content.
 *
 * Matching is case-insensitive, spans line breaks, and is non-greedy, so
 * each match ends at the first closing </script> after the opening tag.
 *
 * @param string $content HTML text.
 * @return string $content with the matched spans removed.
 */
function clsscript($content)
{
return preg_replace("/<script[^>]*?>.*?<\/script>/si", "", $content);
}

// ####################### Strip HTML <div> blocks #######################

/**
 * Deletes every <div ...> ... </div> span from $content.
 *
 * Matching is case-insensitive, spans line breaks, and is non-greedy: with
 * nested divs a match stops at the first </div>, so the outer closing tag
 * and any text before it are left behind.
 *
 * @param string $content HTML text.
 * @return string $content with the matched spans removed.
 */
function clsdiv($content)
{
return preg_replace("/<div[^>]*?>.*?<\/div>/si", "", $content);
}


// ####################### Strip HTML <iframe> blocks #######################

/**
 * Deletes every <iframe ...> ... </iframe> span from $content.
 *
 * The pattern is written in upper case but carries the case-insensitive
 * flag, so lower-case and mixed-case tags are matched as well. Matching
 * spans line breaks and is non-greedy.
 *
 * @param string $content HTML text.
 * @return string $content with the matched spans removed.
 */
function clsifr($content)
{
return preg_replace("/<IFRAME[^>]*?>.*?<\/IFRAME>/si", "", $content);
}


// ####################### Strip HTML <tr> / <td> markup #######################

/**
 * Removes table rows and cells from $content.
 *
 * The patterns run in order: complete <td>...</td> spans, then complete
 * <tr>...</tr> spans, then any leftover unpaired opening or closing
 * <tr>/<td> tags. Matching is case-insensitive and spans line breaks.
 *
 * The tag name must be followed by whitespace or ">" so that other
 * elements whose names merely begin with "tr" or "td" (e.g. <track>) are
 * not treated as rows or cells.
 *
 * @param string $content HTML text.
 * @return string $content without <tr>/<td> markup.
 */
function clstrtd($content)
{
$patterns = array(
    '/<td(?:\s[^>]*)?>.*?<\/td>/si',  // whole cells, content included
    '/<tr(?:\s[^>]*)?>.*?<\/tr>/si',  // whole rows, content included
    '/<tr(?:\s[^>]*)?>/si',           // stray opening <tr>
    '/<td(?:\s[^>]*)?>/si',           // stray opening <td>
    '/<\/tr>/si',                     // stray closing </tr>
    '/<\/td>/si',                     // stray closing </td>
);

// With an array of patterns preg_replace applies them one after another,
// each to the result of the previous one.
return preg_replace($patterns, "", $content);
}

// ####################### Strip HTML hyperlinks #######################

/**
 * Deletes every <a ...> ... </a> span (link text included) from $content.
 *
 * Matching is case-insensitive, spans line breaks, and is non-greedy. The
 * tag name must be exactly "a" (followed by whitespace or ">"), so other
 * elements whose names start with "a" - <abbr>, <article>, <address>, ... -
 * are left untouched.
 *
 * @param string $content HTML text.
 * @return string $content with the anchors removed.
 */
function clsa($content)
{
return preg_replace('/<a(?:\s[^>]*)?>.*?<\/a>/si', "", $content);
}
// ####################### Strip all HTML markup #######################
/**
 * Converts an HTML fragment to plain text.
 *
 * Steps, applied in order:
 *   1. remove <script> blocks together with their content;
 *   2. remove all remaining tags;
 *   3. collapse a line break followed by whitespace into that line break;
 *   4. decode a fixed set of named/numeric entities;
 *   5. decode any other decimal numeric entity (&#NNN;) to a single byte.
 *
 * Step 5 used to rely on the "e" (eval) pattern modifier, which no longer
 * exists in PHP 7+; it is now done with a callback, which also avoids
 * evaluating text taken from the input as PHP code.
 *
 * Entities are decoded to single bytes via chr(), i.e. code points above
 * 255 are reduced modulo 256 exactly as chr() has always done; the output
 * is therefore not UTF-8 aware.
 *
 * @param string $content HTML text.
 * @return string|null Plain text, or null if the regex engine reports an error.
 */
function clearhtml($content)
{
$search = array ("'<script[^>]*?>.*?</script>'si",  // drop javascript
                 "'<[\/\!]*?[^<>]*?>'si",           // drop HTML tags
                 "'([\r\n])[\s]+'",                 // drop whitespace after a line break
                 "'&(quot|#34);'i",                 // decode HTML entities
                 "'&(amp|#38);'i",
                 "'&(lt|#60);'i",
                 "'&(gt|#62);'i",
                 "'&(nbsp|#160);'i",
                 "'&(iexcl|#161);'i",
                 "'&(cent|#162);'i",
                 "'&(pound|#163);'i",
                 "'&(copy|#169);'i");

$replace = array ("",
                  "",
                  "\\1",
                  "\"",
                  "&",
                  "<",
                  ">",
                  " ",
                  chr(161),
                  chr(162),
                  chr(163),
                  chr(169));

$text = preg_replace ($search, $replace, $content);
if ($text === null) {
    return null;
}

// Remaining decimal entities: keep chr()'s historical modulo-256 behaviour
// explicitly instead of passing out-of-range integers to it.
return preg_replace_callback(
    "'&#(\d+);'",
    function ($match) {
        return chr(((int) $match[1]) & 0xFF);
    },
    $text
);
}

// ####################### Write a cache file #######################
/**
 * Writes $cachedata to "./<cachedir>/<cachename>.php", creating the
 * directory (including missing parent directories) when necessary.
 *
 * On any failure to open or write the file an error message naming the
 * directory is printed and the script terminates.
 *
 * NOTE(review): the directory and the file are created with mode 0777
 * (world-writable), as before; tighten this if the deployment allows.
 * NOTE(review): $cachedir and $cachename are concatenated into the path
 * unchecked - callers must not pass untrusted values.
 *
 * @param string $cachedir  Directory name, relative to the current working directory.
 * @param string $cachename File name without the ".php" extension.
 * @param string $cachedata Contents to write.
 * @return void
 */
function writetocache($cachedir,$cachename, $cachedata = '') {

    $cachedir = './'.$cachedir.'/';
    $cachefile = $cachedir.$cachename.'.php';

    if(!is_dir($cachedir)) {
      // Recursive so that nested cache directories work; errors are
      // suppressed because a failure is reported by the open check below.
      @mkdir($cachedir, 0777, true);
    }

    $fp = @fopen($cachefile, 'wb');
    $written = false;
    if($fp !== false) {
      $written = @fwrite($fp, $cachedata);
      @fclose($fp);
    }

    if($written === false) {
      echo 'Can not write to cache files, please check directory '.$cachedir.' .';
      exit;
    }

    @chmod($cachefile, 0777);

}

// ####################### Collect the links found in a document #######################
/**
 * Collects everything captured by the first group of $re in $ufile,
 * replaces $rep1 with $rep2 in each capture, and passes the list to
 * resetar().
 *
 * resetar() is defined outside this block; according to the original
 * comment it merges identical links and re-indexes the list - confirm
 * against its definition.
 *
 * When nothing matches (or $re has no capture group) an empty array is
 * handed to resetar() instead of an undefined variable.
 *
 * @param string       $re    PCRE pattern with at least one capture group.
 * @param string       $ufile Text to scan.
 * @param string|array $rep1  Search value(s) for str_replace().
 * @param string|array $rep2  Replacement value(s) for str_replace().
 * @return mixed Whatever resetar() returns for the collected links.
 */
function geturl($re,$ufile,$rep1,$rep2){

$out = array();
preg_match_all ($re,
    $ufile,
    $out, PREG_PATTERN_ORDER);

$outs = array();
if (isset($out[1])) {
    foreach ($out[1] as $i => $link) {
        $outs[$i] = str_replace($rep1, $rep2, $link);
    }
}

// Merge identical links and re-index (delegated to resetar()).
return resetar($outs);
}

上一个:怎么查php脚本速度慢的原因?
下一个:PHP多数数组循环方面的问题请教

CopyRight © 2012 站长网 编程知识问答 www.zzzyk.com All Rights Reserved
部分技术文章来自网络,