求大牛看一下我字符串搜索算法的问题吧(BF,BM,KMP三种对比)
用了BF,BM,KMP算法,发现跑出来的结果跟想象的不一样啊,开始BM都没有BF快,到最后,BM也只跟BF比较持平,求大神拍砖哈:)注:该算法常用语在一个xml文本中查找一个xml标签,所以子串里面ascii码会占绝大多数
################################华丽丽的分割线##############################################
import java.util.HashMap;
import java.util.Random;
public class Test {
/**
* 在一个char数组中寻找一个string首次出现的位置,与String.indexOf输入及输出相同
*
* @param str
* @param sub
* @param from
* @return
*/
public static final int indexOf(char[] str, CharSequence sub, int from) {
if (str == null
|| sub == null)
return -1;
if (from < 0)
return -1;
int len = str.length;
if (from >= len)
return -1;
int sublen = sub.length();
if (sublen == 0)
return -1;
if (from + sublen > len)
return -1;
for (int i = from; i <= len - sublen; i++) {
int l = i;
int r = 0;
while (r < sublen
&& (str[l++] == sub.charAt(r))) {
r++;
}
if (r == sublen) {
return i;
}
}
return -1;
}
public static int bm(char[] str, CharSequence sub, int from){
if (str == null
|| sub == null)
return -1;
if (from < 0)
return -1;
int n = str.length, m = sub.length(),
i = 0, j = 0, k = 0;
if (m == 0)
return -1;
if (from + m > n)
return -1;
/**
* Pre-compute bmBc
*/
HashMap<Character, Integer> bmBc = new HashMap<Character, Integer>();
for( i = 0 ; i < m ; i ++ ){
bmBc.put(sub.charAt(i), m - 1 - i );
}
int[] bmbc0 = new int[128], bmbc1 = new int[ m * 2 ];
char[] c_bmbc = new char[ m * 2 ];
for( i = 0 ; i < m ; i ++ ){
bmbc0[i] = 0;
}
for( i = 0 ; i < m * 2 ; i ++ ){
bmbc1[i] = 0;
c_bmbc[i] = 0;
}
for( i = 0 ; i < m ; i ++ ){
// saveBmbc( bmbc0, bmbc1, c_bmbc, sub.charAt(i), m - 1 - i );
char c = sub.charAt(i);
if( c < 128 ){
bmbc0[c] = m - 1 -i;
} else {
int l = bmbc1.length;
int p = c % l;
for( j = 0 ; j<l && (bmbc1[p] != 0 || c_bmbc[p] == c) ; j ++ ){
p = (p+1) % l;
}
bmbc1[p] = m - 1 - i;
}
}
/**
* Pre-compute bmGs
* 当坏字符出现在Pattern中时,该字符(位置相对P为i)移动到和坏字符(位置相对P为m-1)重合需要的距离,即m-1-i
* 实际计算移动时,坏字符的移动距离为 bmBc['k'] - (m-1-q)
*/
// compute suff
int[] suff = new int[m];
suff[m-1] = m;
for( i = m - 2 ; i >= 0 ; i -- ){
int q = i;
for( ; q >=0 && sub.charAt(q) == sub.charAt(m-1-(i-q)) ; q --);
suff[i] = i - q;
}
// compute bmGs
// case 1: 没有和后缀匹配的前缀
int[] bmGs = new int[m];
for( i = 0 ; i <= m-1 ; i ++ ){
bmGs[i] = m ;
}
// case 2 : 有前缀和后缀匹配的情况
for( i = m - 2 ; i >= 0 ; i -- ){
if( suff[i] == i + 1 ){
for( ; j < m - 1 - i ; j ++ ){
bmGs[j] = m - 1 - i;
}
}
}
// case 3 : 有子串和后缀匹配
for( i = 0 ; i < m-1 ; i ++){
bmGs[ m - 1 - suff[i] ] = m - 1 - i ;
}
/**
* Match
*/
j = from;
int q = 0;
while( j <= n - m){
q = m - 1;
for(; q >= 0 && sub.charAt(q) == str[q+j]; q--);
if( q < 0 )
return j;
else{
// Integer b = bmBc.get( sub.charAt(q) );
//int b = findBmbc(bmbc0, bmbc1, c_bmbc, sub.charAt(q));
char c = sub.charAt(q);
int b = -1;
if( c < 128 ){
b = bmbc0[c];
} else {
int l = bmbc1.length;
int p = c % l;
for( k = 0 ; k < l && c_bmbc[p] != c ; k ++ ){
p = ( p + 1 ) % l ;
}
if( k == l) b = -1;
b = p;
}
i = b==-1? q + 1 : b - ( m - 1 - q) ;
j += bmGs[q] < i ? i : bmGs[q];
}
}
return -1;
}
public static int kmp(char[] str, CharSequence sub, int from){
if (str == null
|| sub == null)
return -1;
if (from < 0)
return -1;
int n = str.length, m = sub.length(), i = 0;
if (m == 0)
return -1;
if (from + m > n)
return -1;
/**
* Pre-compute next
*/
int[] next = new int[m];
int k = -1;
next[0] = -1;
for( i = 0 ; i < m - 1 ; ){
if( k == -1 || sub.charAt(k) == sub.charAt(i)){
next[i+1] = k + 1;
i ++;
k ++;
}else {
k = next[k];
}
}
/**
* Match
*/
i = from;
k = -1;
int j = 0;
while( i < n){
if( j == -1 || str[i] == sub.charAt(j) )
{
i++;
j++;
} else if( j == 0){
i++;
} else{
j = next[j];
}
if( j == m)
return i - m;
}
return -1;
}
/**
* @param args
*/
public static void main(String[] args) {
// TODO Auto-generated method stub
// 构造测试数据
Random rand = new Random();
char[] carr = "<>/_-:0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".toCharArray();
int n = 100,
tl = 1000000;
int[] pl = new int[]{3, 6, 8, 10, 15, 20, 25, 30, 50, 80, 100};
char[][] T = new char[n][];
String[] P = new String[n];
for( int ii = 0 ; ii < pl.length ; ii ++ ){
for( int i = 0 ; i < n ; i ++ ){
T[i] = new char[tl];
for( int j = 0 ; j < tl ; j ++){
T[i][j] = carr[rand.nextInt(carr.length)];
}
P[i] = "";
for( int j = 0 ; j < pl[ii] ; j ++){
P[i] += carr[rand.nextInt(carr.length)];;
}
}
// 跑数据
long start=0, end =0;
System.out.println("n: " + n + ", text-len: " + tl + ", pattern-len: " + pl[ii]);
start = System.currentTimeMillis();
for( int i = 0 ; i < n ; i ++ ){
int ret1 = indexOf(T[i], P[i], 0);
// int ret2 = bm(T[i], P[i], 0);
// int ret3 = kmp(T[i], P[i], 0);
// if( ret1 != ret2)
// System.out.println("...");
// if( ret1 != ret3)
// System.out.println("333");
}
end = System.currentTimeMillis();
System.out.println("BF: " + (end - start) );
start = System.currentTimeMillis();
for( int i = 0 ; i < n ; i ++ ){
bm(T[i], P[i], 0);
}
end = System.currentTimeMillis();
System.out.println("BM: " + (end - start) );
start = System.currentTimeMillis();
for( int i = 0 ; i < n ; i ++ ){
kmp(T[i], P[i], 0);
}
end = System.currentTimeMillis();
System.out.println("KMP: " + (end - start) );
}
}
} --------------------编程问答-------------------- Run之后控制台的数据:
n: 100, text-len: 1000000, pattern-len: 3
BF: 87
BM: 130
KMP: 95
n: 100, text-len: 1000000, pattern-len: 6
BF: 336
BM: 493
KMP: 304
n: 100, text-len: 1000000, pattern-len: 8
BF: 335
BM: 490
KMP: 304
n: 100, text-len: 1000000, pattern-len: 10
BF: 337
BM: 479
KMP: 302
n: 100, text-len: 1000000, pattern-len: 15
BF: 334
BM: 453
KMP: 303
n: 100, text-len: 1000000, pattern-len: 20
BF: 335
BM: 430
KMP: 302
n: 100, text-len: 1000000, pattern-len: 25
BF: 336
BM: 413
KMP: 303
n: 100, text-len: 1000000, pattern-len: 30
BF: 336
BM: 404
KMP: 302
n: 100, text-len: 1000000, pattern-len: 50
BF: 337
BM: 357
KMP: 302
n: 100, text-len: 1000000, pattern-len: 80
BF: 335
BM: 351
KMP: 304
n: 100, text-len: 1000000, pattern-len: 100
BF: 336
BM: 335
KMP: 301
--------------------编程问答-------------------- 自己顶。。。。 --------------------编程问答-------------------- 再顶一下....
补充:Java , Java相关