PHP从一个文本字符串中提取关键字的函数
这个函数接收一个字符串作为参数(以及其他可选的配置参数),找出该字符串中的所有关键字(即出现次数最多的词),并按出现次数从多到少返回:可以返回一个关键字数组,也可以返回一个由逗号分隔的关键字字符串。
01 |
/**
 * Finds the keywords (the words that appear most often) in $str and returns
 * them ordered from most occurrences to fewest.
 *
 * Before counting, every character that is neither matched by \w nor a space
 * is replaced with a space, runs of whitespace are collapsed, and each word is
 * lower-cased, so "Lorem," and "lorem" are counted as the same keyword.
 *
 * @param string  $str                The text to search for keywords.
 * @param int     $minWordLen         [optional] Length threshold (as measured by strlen()).
 *                                    A word is counted only when it is strictly longer than
 *                                    this value; words of exactly this length are skipped.
 * @param int     $minWordOccurrences [optional] The minimum number of times a word has to
 *                                    appear in $str to be considered a keyword.
 * @param boolean $asArray            [optional] Selects the return shape: true returns an
 *                                    array of keywords, false returns a single string.
 * @return mixed An array of keywords if $asArray is true, otherwise a string with the
 *               keywords separated by ", ". Empty array / empty string when nothing qualifies.
 */
function extract_keywords($str, $minWordLen = 3, $minWordOccurrences = 2, $asArray = false)
{
    // Turn punctuation and other non-word characters into separators, then
    // collapse the resulting whitespace so explode() yields clean tokens.
    $str = preg_replace('/[^\\w0-9 ]/', ' ', $str);
    $str = trim(preg_replace('/\s+/', ' ', $str));

    // Map of word => array(word, count). The word is stored in the value as well
    // because usort() discards the keys.
    $keywords = array();
    foreach (explode(' ', $str) as $word) {
        if (strlen($word) <= $minWordLen) {
            continue;
        }

        $word = strtolower($word);
        if (isset($keywords[$word])) {
            $keywords[$word][1]++;
        } else {
            $keywords[$word] = array($word, 1);
        }
    }

    // An anonymous comparator is used instead of a named function declared in
    // this body: a nested named function is (re)declared on every call, which
    // makes the second call fail with "Cannot redeclare ...".
    usort($keywords, function ($first, $sec) {
        return $sec[1] - $first[1];
    });

    $final_keywords = array();
    foreach ($keywords as $keyword_det) {
        // The list is sorted by count, descending, so the first entry below
        // the threshold means every remaining entry is below it too.
        if ($keyword_det[1] < $minWordOccurrences) {
            break;
        }
        $final_keywords[] = $keyword_det[0];
    }

    return $asArray ? $final_keywords : implode(', ', $final_keywords);
}
42 |
|
43 |
// How to use

// Sample lorem ipsum text to extract the keywords from.
// NOTE(review): the scraped copy of this sample contained a corrupted word after
// "Nulla"; it is restored here as "porttitor", which is presumably the original.
// The word occurs only once, so it does not affect the result either way.
$text = "
Lorem ipsum dolor sit amet, consectetur adipiscing elit.
Curabitur eget ipsum ut lorem laoreet porta a non libero.
Vivamus in tortor metus. Suspendisse potenti. Curabitur
metus nisi, adipiscing eget placerat suscipit, suscipit
vitae felis. Integer eu odio enim, sed dignissim lorem.
In fringilla molestie justo, vitae varius risus lacinia ac.
Nulla porttitor justo a lectus iaculis ut vestibulum magna
egestas. Ut sed purus et nibh cursus fringilla at id purus.
";

// With the default arguments this prints the words that are longer than three
// characters and occur at least twice, most frequent first: "lorem" (3 times),
// followed by the ten words that occur twice.
// On PHP 8.0+ (stable sorting) equally frequent words keep their order of first
// appearance, so the output is:
//   lorem, ipsum, adipiscing, curabitur, eget, metus, suscipit, vitae, fringilla, justo, purus
// Older PHP versions may print the two-occurrence words in a different order.
echo extract_keywords($text);
58 |
|