PHP文章采集URL补全函数

2024-05-04 21:53:01

字体：大中小

来源：转载

供稿：网友

编写采集程序时必用的函数：URL 补全函数，也可叫做 FormatUrl。

编写此函数是为了开发采集程序：采集文章时，页面里的链接经常是“相对路径”或“绝对根路径”，而不是“绝对全路径”，这样就无法直接收集到可用的 URL。

所以，需要用本函数对采集到的 HTML 代码进行格式化，把其中所有的超链接（以及图片地址）都补全为“绝对全路径”，这样就可以直接收集到正确的 URL 了。

路径知识普及

相对路径：以 “../” 或 “./” 开头，或者前面什么都不加（例如 path/xxx.html）

绝对根路径：/path/xxx.html

绝对全路径：http://www.xxx.com/path/xxx.html

使用实例：

<?php
// Base URL of the page the HTML fragment was collected from.
$surl = "http://www.soqi.cc/";

// Collected HTML fragment whose links are root-relative.
$gethtm = '<a href="/">首页</a><a href="/daxue/">大学排行</a>';

// Complete every link against the base URL, then print the result.
$formatted = formaturl($gethtm, $surl);
echo $formatted;
?>

输出：

<a href="http://www.soqi.cc/">首页</a><a href="http://www.soqi.cc/daxue/">大学排行</a>

函数代码如下：

<?php    
 function formaturl($l1, $l2) {    
    if (preg_match_all ( "/(<img[^>]+src=/"([^/"]+)/"[^>]*>)|(<a[^>]+href=/"([^/"]+)/"[^>]*>)|(<img[^>]+src='([^']+)'[^>]*>)|(<a[^>]+href='([^']+)'[^>]*>)/i", $l1, $regs )) {    
        foreach ( $regs [0] as $num => $url ) {    
            $l1 = str_replace ( $url, lIIIIl ( $url, $l2 ), $l1 );    
        }    
    }    
    return $l1;    
}    
 function lIIIIl($l1, $l2) {    
    if (preg_match ( "/(.*)(href|src)/=(.+?)( |///>|/>).*/i", $l1, $regs )) {    
        $I2 = $regs [3];    
    }    
    if (strlen ( $I2 ) > 0) {    
        $I1 = str_replace ( chr ( 34 ), "", $I2 );    
        $I1 = str_replace ( chr ( 39 ), "", $I1 );    
    } else {    
        return $l1;    
    }    
    $url_parsed = parse_url ( $l2 );    
    $scheme = $url_parsed ["scheme"];    
    if ($scheme != "") {    
        $scheme = $scheme . "://";    
    }    
    $host = $url_parsed ["host"];    
    $l3 = $scheme . $host;    
    if (strlen ( $l3 ) == 0) {    
        return $l1;    
    }    
    $path = dirname ( $url_parsed ["path"] );    
    if ($path [0] == "//") {  
        $path = "";  
    }  
    $pos = strpos ( $I1, "#" );  
    if ($pos > 0)  
        $I1 = substr ( $I1, 0, $pos );  
      
        //判断类型  
    if (preg_match ( "/^(http|https|ftp):(////|////)(([/w/////+/-~`@:%])+/.)+([/w/////./=/?/+/-~`@/':!%#]|(&amp;)|&)+/i", $I1 )) {  
        return $l1;  
    } //http开头的url类型要跳过  
elseif ($I1 [0] == "/") {  
        $I1 = $l3 . $I1;  
    } //绝对路径  
elseif (substr ( $I1, 0, 3 ) == "../") { //相对路径  
        while ( substr ( $I1, 0, 3 ) == "../" ) {  
            $I1 = substr ( $I1, strlen ( $I1 ) - (strlen ( $I1 ) - 3), strlen ( $I1 ) - 3 );  
            if (strlen ( $path ) > 0) {  
                $path = dirname ( $path );  
            }  
        }  
        $I1 = $l3 . $path . "/" . $I1;  
    } elseif (substr ( $I1, 0, 2 ) == "./") {  
        $I1 = $l3 . $path . substr ( $I1, strlen ( $I1 ) - (strlen ( $I1 ) - 1), strlen ( $I1 ) - 1 );  
    } elseif (strtolower ( substr ( $I1, 0, 7 ) ) == "mailto:" || strtolower ( substr ( $I1, 0, 11 ) ) == "javascript:") {  
        return $l1;  
    } else {  
        $I1 = $l3 . $path . "/" . $I1;  
    }  
    return str_replace ( $I2, "/"$I1/"", $l1 );    
}    
?>