sample\algorithms\01_string_distance.hsp » Plain Format
;============================================================
; アルゴリズム 01〜05: 文字列距離 / 類似度
; 01. Levenshtein (編集距離, DP)
; 02. Hamming 距離
; 03. Jaro-Winkler 類似度
; 04. SoundEx (英語発音コード)
; 05. Bigram Jaccard 類似度
;============================================================
#include "hsp3cl_net_64.as"
; ----- 01. Levenshtein 距離 -----
#module
#defcfunc levenshtein var _a, var _b, \
local _i, local _j, local _la, local _lb, local _ca, local _cb, local _cost, local _m, local _n
_la = strlen(_a) : _lb = strlen(_b)
if _la == 0 : return _lb
if _lb == 0 : return _la
dim dp, (_la + 1) * (_lb + 1)
repeat _la + 1 : dp(cnt) = cnt : loop
repeat _lb + 1 : dp(cnt * (_la + 1)) = cnt : loop
repeat _lb
_j = cnt + 1
repeat _la
_i = cnt + 1
_ca = peek(_a, _i - 1)
_cb = peek(_b, _j - 1)
_cost = 0 : if _ca != _cb : _cost = 1
_m = dp((_j) * (_la + 1) + _i - 1) + 1
_n = dp((_j - 1) * (_la + 1) + _i) + 1
if _n < _m : _m = _n
_n = dp((_j - 1) * (_la + 1) + _i - 1) + _cost
if _n < _m : _m = _n
dp(_j * (_la + 1) + _i) = _m
loop
loop
return dp(_lb * (_la + 1) + _la)
#global
; ----- 02. Hamming 距離 -----
#module
#defcfunc hamming var _a, var _b, local _n, local _d
_n = strlen(_a)
if _n != strlen(_b) : return -1
_d = 0
repeat _n
if peek(_a, cnt) != peek(_b, cnt) : _d++
loop
return _d
#global
; ----- 03. Jaro-Winkler (×1000 で返却) -----
#module
#defcfunc jaro_winkler_1000 var _a, var _b, \
local _la, local _lb, local _i, local _j, local _match, local _trans, \
local _half, local _lo, local _hi, local _prefix, local _jaro, local _found
_la = strlen(_a) : _lb = strlen(_b)
if (_la == 0) | (_lb == 0) : return 0
_half = _la
if _lb > _half : _half = _lb
_half = (_half / 2) - 1 : if _half < 0 : _half = 0
dim am, _la : dim bm, _lb
_match = 0
repeat _la
_i = cnt
_lo = _i - _half : if _lo < 0 : _lo = 0
_hi = _i + _half + 1 : if _hi > _lb : _hi = _lb
_found = 0
repeat _hi - _lo
_j = _lo + cnt
if _found : continue
if bm(_j) == 0 {
if peek(_a, _i) == peek(_b, _j) {
am(_i) = 1 : bm(_j) = 1 : _match++
_found = 1
}
}
loop
loop
if _match == 0 : return 0
_trans = 0 : _j = 0
repeat _la
_i = cnt
if am(_i) {
repeat : if bm(_j) : break : else : _j++
loop
if peek(_a, _i) != peek(_b, _j) : _trans++
_j++
}
loop
_trans /= 2
_jaro = ((1000 * _match / _la) + (1000 * _match / _lb) + (1000 * (_match - _trans) / _match)) / 3
_prefix = 0
repeat 4
if cnt >= _la : break
if cnt >= _lb : break
if peek(_a, cnt) != peek(_b, cnt) : break
_prefix++
loop
return _jaro + _prefix * (1000 - _jaro) / 10
#global
; ----- 04. SoundEx -----
#module
#defcfunc soundex var _s, local _c, local _i, local _prev, local _digit, local _out, local _j
if strlen(_s) == 0 : return ""
_c = peek(_s, 0)
if (_c >= 'a') & (_c <= 'z') : _c -= 32
sdim _out, 8
poke _out, 0, _c
_prev = -1
_i = 1
_j = 1
repeat strlen(_s) - 1
if _i >= 4 : break
_c = peek(_s, _j) : _j++
if (_c >= 'a') & (_c <= 'z') : _c -= 32
if (_c == 'A') | (_c == 'E') | (_c == 'I') | (_c == 'O') | (_c == 'U') | (_c == 'H') | (_c == 'W') | (_c == 'Y') {
_prev = -1
continue
}
_digit = 0
if (_c == 'B') | (_c == 'F') | (_c == 'P') | (_c == 'V') : _digit = 1
if (_c == 'C') | (_c == 'G') | (_c == 'J') | (_c == 'K') | (_c == 'Q') | (_c == 'S') | (_c == 'X') | (_c == 'Z') : _digit = 2
if (_c == 'D') | (_c == 'T') : _digit = 3
if _c == 'L' : _digit = 4
if (_c == 'M') | (_c == 'N') : _digit = 5
if _c == 'R' : _digit = 6
if _digit > 0 {
if _digit != _prev {
poke _out, _i, _digit + '0'
_i++
}
_prev = _digit
} else {
_prev = -1
}
loop
repeat 4 - _i
poke _out, _i, '0' : _i++
loop
poke _out, 4, 0
return _out
#global
; ----- 05. Bigram Jaccard 類似度 (×1000) -----
#module
#defcfunc jaccard_1000 var _a, var _b, \
local _la, local _lb, local _common, local _uni, local _i, local _j, local _found
_la = strlen(_a) - 1 : if _la < 0 : _la = 0
_lb = strlen(_b) - 1 : if _lb < 0 : _lb = 0
if (_la == 0) | (_lb == 0) : return 0
dim used_b, _lb
_common = 0
repeat _la
_i = cnt
_found = 0
repeat _lb
_j = cnt
if _found : continue
if used_b(_j) == 0 {
if (peek(_a, _i) == peek(_b, _j)) & (peek(_a, _i + 1) == peek(_b, _j + 1)) {
_common++ : used_b(_j) = 1 : _found = 1
}
}
loop
loop
_uni = _la + _lb - _common
if _uni == 0 : return 1000
return 1000 * _common / _uni
#global
; ===== デモ =====
mes "=== 文字列距離 / 類似度 ==="
s1 = "kitten" : s2 = "sitting"
mes strf("Levenshtein(kitten, sitting) = %d", levenshtein(s1, s2))
s1 = "hello" : s2 = "world"
mes strf("Levenshtein(hello, world) = %d", levenshtein(s1, s2))
s1 = "karolin" : s2 = "kathrin"
mes strf("Hamming(karolin, kathrin) = %d", hamming(s1, s2))
s1 = "martha" : s2 = "marhta"
jw1 = 1.0 * jaro_winkler_1000(s1, s2) / 1000
mes strf("Jaro-Winkler(martha, marhta) = %.3f", jw1)
s1 = "dixon" : s2 = "dicksonx"
jw2 = 1.0 * jaro_winkler_1000(s1, s2) / 1000
mes strf("Jaro-Winkler(dixon, dicksonx) = %.3f", jw2)
s1 = "Robert" : mes strf("SoundEx(Robert) = %s", soundex(s1))
s1 = "Rupert" : mes strf("SoundEx(Rupert) = %s", soundex(s1))
s1 = "Ashcraft" : mes strf("SoundEx(Ashcraft) = %s", soundex(s1))
s1 = "night" : s2 = "nacht"
jc = 1.0 * jaccard_1000(s1, s2) / 1000
mes strf("Bigram Jaccard(night, nacht) = %.3f", jc)
end 0