算法擂台——比较字符串
--------------------编程问答-------------------- --------------------编程问答-------------------- 这个很难得啊对那个相似度完全没概念
允许谷歌吗? --------------------编程问答-------------------- 好像牵涉到人工智能,模糊数学之类的东西了,太难。
不过精通这些技术,以后可以识别验证码,识别人脸,等等。 --------------------编程问答-------------------- --------------------编程问答-------------------- 不会,期待提示。。。。。。。。。 --------------------编程问答-------------------- 有点像?A?B --------------------编程问答-------------------- 其他3个我可以认为有比较长的子序列者 比较忧
但是最后一个就不符合了.
和 abcdefg 匹配: adcdefg 要优于 accdefg --------------------编程问答-------------------- 先贴结果:
--------------------编程问答-------------------- 计算两个字符串匹配过程中的两类错误:插入错误、删除错误
a => an 差异度:1
a => as 差异度:1
a => at 差异度:1
again => gain 差异度:1
all => call 差异度:1
all => fall 差异度:1
all => hall 差异度:1
all => ll 差异度:1
allow => allows 差异度:1
an => and 差异度:1
an => any 差异度:1
an => can 差异度:1
any => many 差异度:1
api => apis 差异度:1
appear => appears 差异度:1
application => applications 差异度:1
are => area 差异度:1
as => has 差异度:1
associate => associated 差异度:1
bar => bars 差异度:1
base => based 差异度:1
become => becomes 差异度:1
begin => begins 差异度:1
benefit => benefits 差异度:1
bit => bite 差异度:1
bit => bits 差异度:1
bit => it 差异度:1
bits => its 差异度:1
book => books 差异度:1
button => buttons 差异度:1
c => cd 差异度:1
call => calls 差异度:1
change => changed 差异度:1
change => changes 差异度:1
chapter => chapters 差异度:1
command => commands 差异度:1
compile => compiler 差异度:1
compiler => compilers 差异度:1
computer => computers 差异度:1
concept => concepts 差异度:1
contain => contains 差异度:1
create => creates 差异度:1
define => defined 差异度:1
define => defines 差异度:1
developer => developers 差异度:1
device => devices 差异度:1
dictate => dictates 差异度:1
did => didn 差异度:1
disk => disks 差异度:1
display => displays 差异度:1
dll => dlls 差异度:1
dll => ll 差异度:1
do => don 差异度:1
do => dos 差异度:1
does => doesn 差异度:1
does => dos 差异度:1
don => done 差异度:1
download => downloads 差异度:1
driver => drivers 差异度:1
edition => editions 差异度:1
emerged => merged 差异度:1
ems => ms 差异度:1
environment => environments 差异度:1
executable => executables 差异度:1
file => files 差异度:1
fit => it 差异度:1
font => fonts 差异度:1
for => form 差异度:1
function => functions 差异度:1
give => given 差异度:1
give => gives 差异度:1
gui => guis 差异度:1
help => helps 差异度:1
i => if 差异度:1
i => in 差异度:1
i => is 差异度:1
i => it 差异度:1
ill => ll 差异度:1
implement => implements 差异度:1
in => int 差异度:1
in => isn 差异度:1
include => included 差异度:1
include => includes 差异度:1
indicate => indicated 差异度:1
int => into 差异度:1
int => it 差异度:1
inte易做图ce => inte易做图ces 差异度:1
invoke => invoked 差异度:1
is => isn 差异度:1
is => its 差异度:1
it => its 差异度:1
it => kit 差异度:1
item => items 差异度:1
job => jobs 差异度:1
k => kb 差异度:1
know => known 差异度:1
late => later 差异度:1
level => levels 差异度:1
limit => limits 差异度:1
look => looks 差异度:1
m => mb 差异度:1
m => ms 差异度:1
machine => machines 差异度:1
make => makes 差异度:1
many => may 差异度:1
megabyte => megabytes 差异度:1
menu => menus 差异度:1
microprocessor => microprocessors 差异度:1
mode => model 差异度:1
mode => modes 差异度:1
model => modela 差异度:1
a => add 差异度:2
a => all 差异度:2
a => and 差异度:2
a => any 差异度:2
a => api 差异度:2
a => are 差异度:2
a => bar 差异度:2
a => c 差异度:2
a => can 差异度:2
a => far 差异度:2
a => h 差异度:2
a => had 差异度:2
a => has 差异度:2
a => i 差异度:2
a => j 差异度:2
a => k 差异度:2
a => m 差异度:2
a => mac 差异度:2
a => may 差异度:2
able => be 差异度:2
about => but 差异度:2
access => accessed 差异度:2
add => added 差异度:2
add => and 差异度:2
add => did 差异度:2
add => had 差异度:2
address => addresses 差异度:2
after => later 差异度:2
again => against 差异度:2
ahead => had 差异度:2
all => allow 差异度:2
all => calls 差异度:2
请按任意键继续. . .
代码:
public class StringComparer--------------------编程问答-------------------- 跟计算机谈"直观相似度",不如直接给出相似度规则。 --------------------编程问答--------------------
{
// 存储字符串关系的数组
public int[,] relation;
public int firstLen;
public int secondLen;
// 构造函数
public StringComparer(int firstLen, int secondLen)
{
relation = new int[firstLen, secondLen];
}
// 比较字符串
public int CompareString(string first, string second)
{
//first = first.ToLower();
//second = second.ToLower();
//if (first.CompareTo(second) > 0)
//{
// string temp = first;
// first = second;
// second = temp;
//}
InitializeRelation(first, second);
//ShowRelation();
List<List<int>> pathList = new List<List<int>>(firstLen);
FindAllPath(pathList, firstLen - 1);
//ShowPathList(pathList);
int minDistance = int.MaxValue;
//List<int> minPath = null;
foreach (List<int> path in pathList)
{
int distance = CalculateDistance(path);
if (distance < minDistance)
{
minDistance = distance;
//minPath = path;
}
}
//foreach (int i in minPath)
// Console.Write("{0,3}", i);
//Console.WriteLine();
//Console.WriteLine(minDistance);
return minDistance;
}
// 计算指定路径的误差
private int CalculateDistance(List<int> path)
{
int distance = 0;
bool insertState = false;
int insertCount = 0;
int lastPos = -1;
for (int i = 0; i < path.Count; i++)
{
if (path[i] == -1)
{
if (insertState)
insertCount++;
else
{
insertCount = 1;
insertState = true;
}
}
else
{
if (insertState)
{
insertState = false;
distance += insertCount;
}
distance += path[i] - (lastPos + 1);
lastPos = path[i];
}
}
if (insertState)
distance += insertCount;
distance += (secondLen - 1) - lastPos;
return distance;
}
// 找到所有可行路径
private void FindAllPath(List<List<int>> pathList, int processLen)
{
if (processLen < 0)
{
pathList.Add(new List<int>());
return;
}
FindAllPath(pathList, processLen - 1);
List<List<int>> pathListNew = new List<List<int>>();
foreach (List<int> path in pathList)
{
int limit = GetPathMaxValue(path);
for (int i = 0; i < secondLen; i++)
{
List<int> pathNew = new List<int>(path.ToArray());
if (relation[processLen, i] > limit)
{
pathNew.Add(relation[processLen, i]);
pathListNew.Add(pathNew);
}
else
{
pathNew.Add(-1);
pathListNew.Add(pathNew);
break;
}
}
}
pathList.Clear();
pathList.AddRange(pathListNew);
}
// 获得当前路径的最大值
private int GetPathMaxValue(List<int> path)
{
for (int i = path.Count - 1; i >= 0; i--)
if (path[i] != -1)
return path[i];
return -1;
}
// 初始化关系数组
private void InitializeRelation(string first, string second)
{
firstLen = first.Length;
secondLen = second.Length;
for (int i = 0; i < firstLen; i++)
{
int index = 0;
for (int j = secondLen - 1; j >= 0; j--)
{
if (second[j] == first[i])
{
relation[i, index] = j;
index++;
}
}
relation[i, index] = -1;
}
}
// 显示关系数组
private void ShowRelation()
{
for (int i = 0; i < firstLen; i++)
{
for (int j = 0; j < secondLen; j++)
{
Console.Write("{0, 4}", relation[i, j]);
if (relation[i, j] == -1) break;
}
Console.WriteLine();
}
}
// 显示所有路线
private void ShowPathList(List<List<int>> pathList)
{
foreach (List<int> path in pathList)
{
foreach (int i in path)
Console.Write("{0,3}", i);
Console.WriteLine();
}
}
}
class Program
{
static void Main(string[] args)
{
StringComparer cmp = new StringComparer(1000, 1000);
List<string> list = new List<string>();
using (StreamReader reader = new StreamReader("D:\\dict.txt"))
for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
list.Add(line);
Dictionary<string, int> result = new Dictionary<string, int>();
for (int i = 0; i < list.Count; i++)
{
Console.Write("Compare: " + list[i]);
for (int j = i + 1; j < list.Count; j++)
{
int score = cmp.CompareString(list[i], list[j]);
if (score < 3) // 筛选差异度小于3的
result.Add(list[i].PadRight(15) + "=> " + list[j], score);
}
Console.Write("\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b");
}
Console.BufferHeight = 3000;
var sortedResult = from p in result
orderby p.Value ascending
select p;
foreach (KeyValuePair<string, int> p in sortedResult)
Console.WriteLine("{0,-40}差异度:{1}", p.Key, p.Value);
}
}
有规则当然就好说了 --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- 貌似这东西,就是要我们建立这个相似度的规则模型。
--------------------编程问答--------------------
我估计是算字符串值的欧式距离! --------------------编程问答--------------------
维度都不一样,欧式距离不太好用啊
我在9楼贴的代码的思想是:
将字符串str1和str2比较,
其实就是寻找str1中每个字符在str2中的对应位置,当然可能有的字符在str2中无法找到对应的
当然这种对应关系很多
比如abc和acb
abc
| |
a cb
或
a bc
| |
acb
就是要找出所有的可能存在的对应,然后计算每种对应的误差
比如
a bc
| |
acb
相对于abc,用acb来匹配的话,
acb在第二个位置有一个插入错误,多个'c',在第四个位置有一个删除错误,少个'c'
不过我目前待代码还没加入另一类错误,替换错误
如
abc
| |
adc
如用adc去匹配abc,用9楼的代码得到的结果就是第二位置有一个插入错误,多'd',还有一个删除错误,少'c',当然这种错误其实可以合并为一个替换错误
所以9楼的代码会认为 (adc和acb)跟abc具有相同的差异度,而实际上adc更像
另外,还欠缺一点的是
a跟abc的差异是2
abcdefg和abcde的差异也是2
但实际上a跟abc的差异要比 abcdefg跟abcde的差异要打
这里需要引入一种相对差异度的概念
--------------------编程问答--------------------
赞。我们的算法应该一样。 --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答--------------------
从举例看,直观相似度的规则似乎是这样的,先是比对同序排列子项的长度,长度大者优,长度相同者,同位比较排列序号小者优。 --------------------编程问答-------------------- --------------------编程问答-------------------- 两个字符串相似度 = 最小编辑距离/最小长度 --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- 用DP动态规划,求A词变化到B词的最少“增删改”步骤即可,是个NP问题,速度不会太快。
我做过比较14万单词的相似度,不过没采用这个办法。
我的办法是记录单词的原子,假设单词长度是n,就是先生成2-n长度的原子表,参考这个
http://topic.csdn.net/u/20070119/09/98F87AC4-06D7-41F6-8F58-1395EC63E7E3.html --------------------编程问答-------------------- --------------------编程问答-------------------- 呃……这不就是编辑距离么…… --------------------编程问答-------------------- 我想知道
和 QUEUE 匹配: OUEUE , GUEUE 哪个优胜...
和 QUEUE 匹配: QUEUe , QUEuE 哪个优胜... --------------------编程问答-------------------- --------------------编程问答-------------------- 恩 给出相似度规则! --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- 每次弄这种字符串的模糊匹配,我都会写一遍lcs(最长公共子序列)的程序,每次用到最后又都会把lcs给删掉,因为经常需要用精确匹配。真是悲剧,只好再写一遍了。
比如abcac同bbdac的lcs是bac,另外还有一种利用dp求编辑距离的方法,也是n^2的复杂度
public static string LCS(string strA, string strB)
{
int m = strA.Length, n = strB.Length;
int[,] matrix = new int[m + 1, n + 1];
for (int i = 1; i <= m; i++)
{
for (int j = 1; j <= n; j++)
{
if (strA[i - 1] == strB[j - 1])
matrix[i, j] = matrix[i - 1, j - 1] + 1;
else
matrix[i, j] = Math.Max(matrix[i, j - 1], matrix[i - 1, j]);
}
}
char[] result = new char[matrix[m, n]];
int currentIndex = result.Length - 1;
while (matrix[m, n] != 0)
{
if (matrix[m, n] == matrix[m, n - 1])
n--;
else if (matrix[m, n] == matrix[m - 1, n])
m--;
else
{
result[currentIndex] = strA[m - 1];
currentIndex--; n--; m--;
}
}
return new string(result);
}
--------------------编程问答-------------------- 和 QUEUE 匹配: OUEUE , GUEUE 哪个优胜...
--------------------编程问答-------------------- --------------------编程问答-------------------- 题的含义不明确啊
lcs吗? --------------------编程问答-------------------- --------------------编程问答-------------------- shortest edit distance 算法
上网自己google吧
用dynamic programming
O(n)复杂度 --------------------编程问答-------------------- +1
--------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- 编程珠玑上有一章讲后缀数组的....好像可以处理相似程度吧。 --------------------编程问答-------------------- --------------------编程问答-------------------- 得照着lz的例子看出匹配规则啊.
23楼得解释挺清楚的。 --------------------编程问答-------------------- 字典下不下来。。 --------------------编程问答-------------------- 各位达人,现在有这么一个随机的字符串,只由字母和数字组成
比如"aa44aa49a44aaaa"
如何取出里面的数字 然后再求和呢? 注意连在一起的数字算做一个
上面的例子"aa44aa49a45aaaa"
最终结果是44+49+45=148
字符串的组成是完全随机的 "44aa49a49aass"..."zs151as12a51s12"..
求解答! --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答-------------------- 看得真晕 --------------------编程问答--------------------
这个貌似不难。 --------------------编程问答-------------------- 就是字符串的相似度问题,下面讲的很好,推荐看!
http://www.cnblogs.com/liushannet/archive/2010/10/25/1860358.html --------------------编程问答-------------------- --------------------编程问答-------------------- 对这个是一点概念都没有.
貌似Beyond Compare这个软件做的比较好 --------------------编程问答-------------------- --------------------编程问答--------------------
用C/C++来做不难
现在用C#来做 有点晕 --------------------编程问答-------------------- 字符串的相似度 --------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答--------------------
正则。。。
--------------------编程问答-------------------- import java.util.regex.*;
public class RegexTest
{
public static void main(String[] args)
{
String str = "zs151as12a51s12";
Pattern p = Pattern.compile("[0-9]{1,}");
Matcher m = p.matcher(str);
int sum = 0;
while(m.find())
{
int start = m.start();
int end = m.end();
System.out.println(start+"\t"+end);
String s = str.substring(start,end);
sum += Integer.parseInt(s);
}
System.out.println(sum);
}
} --------------------编程问答-------------------- 这位仁兄,谢了
java的pattern和macther的确很好用 很简洁
今天用C#解决了 但是看起来依然很别扭
用C#有没有更简洁的?
--------------------编程问答-------------------- --------------------编程问答-------------------- --------------------编程问答--------------------
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace ConsoleApplication1
{
class Program
{
static void Main(string[] args)
{
String str = "a444a44asd343asd41a";
int num = 0;
for (int i = 0; i < str.Length; i++)
{
if (char.IsNumber(str[i]))
{
int j = 0;
String temp = null;
for (j = i; j < str.Length; j++)
{
if (char.IsNumber(str[j]) == false)
break;
temp += str[j].ToString();
}
i = j;
num += Int32.Parse(temp);
Console.Write("提取出来的数字为:");
Console.WriteLine(temp);
}
}
Console.WriteLine("和为:"+num);
Console.Read();
}
}
}
解决了,但总觉得很罗嗦..
有没有牛人来点简洁的? --------------------编程问答-------------------- --------------------编程问答--------------------
补充:.NET技术 , C#