
批處理計算字符串相似度：看到這個帖子 http://www.bathome.net/thread-69920-1-1.html 有字符串相似度的相關問題，於是結合編輯距離和最長公共子序列等相關算法，寫了三種方法計算字符串相似度，各有側重。
複製代碼
- @echo off
- rem Demo driver: compare two sample string pairs, then wait for a key press.
- rem The arguments are quoted so each string reaches :sim as one parameter;
- rem :sim removes the quotes again through its tilde parameter modifiers.
- call :sim "abcd你好123" "bd好運1314"
- call :sim "abcd你好123" "abcd你好132"
- pause
- exit
- :sim
- rem Compare two strings and print three similarity percentages on one line.
- rem   Argument 1 and argument 2 are the strings; enclosing quotes are stripped.
- rem   Score 1: LCS length * 100 / length of the longer string.
- rem   Score 2: LCS length * 100 / [edit distance + LCS length].
- rem   Score 3: Jaro similarity * 100.
- rem All arithmetic is integer arithmetic, so every score is truncated.
- rem Uses the helper label :cut to fetch single characters into s1 and s2.
- rem
- rem Trivial cases: exactly one empty string scores 0, two empty strings score 100.
- if "%~1"=="" if not "%~2"=="" (echo,---0%%---&exit /b)
- if "%~2"=="" if not "%~1"=="" (echo,---0%%---&exit /b)
- if "%~2"=="" if "%~1"=="" (echo,---100%%---&exit /b)
- set "str1=%~1"&set "_str1=%~1#"
- set "str2=%~2"&set "_str2=%~2#"
- setlocal enabledelayedexpansion
- rem Measure both lengths by binary search over the probe copies.
- rem The appended # makes offset N exist exactly when the string has N characters.
- set /a len1=0,len2=0
- for %%i in (2048 1024 512 256 128 64 32 16 8 4 2 1) do (
-   if not "!_str1:~%%i,1!"=="" (set /a len1+=%%i&set "_str1=!_str1:~%%i!")
-   if not "!_str2:~%%i,1!"=="" (set /a len2+=%%i&set "_str2=!_str2:~%%i!")
- )
- rem ---- Edit distance, kept in the pseudo-array count[i][j] ----
- setlocal
- for /l %%i in (0,1,%len1%) do set /a count[%%i][0]=%%i
- for /l %%i in (0,1,%len2%) do set /a count[0][%%i]=%%i
- for /l %%i in (1,1,%len1%) do (
-   for /l %%j in (1,1,%len2%) do (
-     set /a ii=%%i-1,jj=%%j-1
-     rem c1 is deletion, c2 is insertion, c3 is substitution or plain copy
-     set /a c1=count[!ii!][%%j]+1,c2=count[%%i][!jj!]+1,c3=count[!ii!][!jj!]
-     call :cut !ii! !jj!
-     if not "!s1!"=="!s2!" set /a c3+=1
-     set /a min=c1
-     if !c2! lss !min! set /a min=c2
-     if !c3! lss !min! set /a min=c3
-     set /a count[%%i][%%j]=min
-   )
- )
- set /a dist=count[%len1%][%len2%]
- rem The percent reference is expanded before endlocal runs, carrying the value out.
- endlocal&set /a dist=%dist%
- rem ---- Longest common subsequence length ----
- rem Row 0 and column 0 are never written; set /a reads undefined cells as 0.
- setlocal
- for /l %%i in (1,1,%len1%) do (
-   for /l %%j in (1,1,%len2%) do (
-     set /a ii=%%i-1,jj=%%j-1
-     call :cut !ii! !jj!
-     if "!s1!"=="!s2!" (
-       set /a count[%%i][%%j]=count[!ii!][!jj!]+1
-     ) else (
-       rem The diagonal cell never exceeds its neighbours, so two candidates suffice
-       set /a up=count[!ii!][%%j],left=count[%%i][!jj!]
-       if !up! geq !left! (set /a count[%%i][%%j]=up) else set /a count[%%i][%%j]=left
-     )
-   )
- )
- set /a LCS=count[%len1%][%len2%]
- endlocal&set /a LCS=%LCS%
- rem ---- Jaro similarity ----
- setlocal
- if %len1% geq %len2% (
-   set "maxStr=!str1!"&set "minStr=!str2!"
-   set /a maxLen=len1,minLen=len2
- ) else (
-   set "maxStr=!str2!"&set "minStr=!str1!"
-   set /a maxLen=len2,minLen=len1
- )
- rem win is the match window: half the longer length minus one, never negative.
- set /a matches=0,trans=0,win=maxLen/2-1,lastMin=minLen-1,lastMax=maxLen-1
- if !win! lss 0 set /a win=0
- for /l %%i in (0,1,%lastMin%) do (
-   set "minChar=!minStr:~%%i,1!"
-   set /a lo=%%i-win,hi=%%i+win
-   if !lo! lss 0 set /a lo=0
-   if !hi! geq !maxLen! set /a hi=maxLen-1
-   for /l %%j in (!lo!,1,!hi!) do (
-     rem Each position may be matched once; the minflag test acts as the loop break
-     if not defined minflag[%%i] if not defined maxflag[%%j] if "!minChar!"=="!maxStr:~%%j,1!" (
-       set /a maxflag[%%j]=1,minflag[%%i]=1,matches+=1
-     )
-   )
- )
- if %matches% equ 0 (endlocal&set /a simPCT3=0&goto :end)
- rem Collect the matched characters of each string in order, then count the
- rem positions where the two sequences disagree; half of that is the transposition count.
- set "seqMin="&set "seqMax="
- for /l %%i in (0,1,%lastMin%) do if defined minflag[%%i] set "seqMin=!seqMin!!minStr:~%%i,1!"
- for /l %%j in (0,1,%lastMax%) do if defined maxflag[%%j] set "seqMax=!seqMax!!maxStr:~%%j,1!"
- set /a lastMatch=matches-1
- for /l %%k in (0,1,%lastMatch%) do if not "!seqMin:~%%k,1!"=="!seqMax:~%%k,1!" set /a trans+=1
- set /a trans/=2
- set /a jaro=(matches*100/len1+matches*100/len2+((matches-trans)*100/matches))/3
- endlocal&set /a simPCT3=%jaro%
- :end
- if %len1% geq %len2% (set /a maxLen=len1) else set /a maxLen=len2
- set /a simPCT1=LCS*100/maxLen
- set /a simPCT2=LCS*100/(%dist%+%LCS%)
- echo,---%simPCT1%%%---%simPCT2%%%---%simPCT3%%%---!str1!與!str2!相似度
- exit /b
- :cut
- rem Fetch one character from each input string for the caller to compare.
- rem   Argument 1: zero-based offset into str1; the character is stored in s1.
- rem   Argument 2: zero-based offset into str2; the character is stored in s2.
- rem Needs delayed expansion, which :sim enables before calling this label.
- rem The assignments are quoted so trailing whitespace on the line cannot leak into the value.
- set "s1=!str1:~%1,1!"
- set "s2=!str2:~%2,1!"
- rem Return explicitly so the routine never falls through into code placed after it.
- exit /b

