159 lines
3.4 KiB
Go
159 lines
3.4 KiB
Go
package utils
|
|
|
|
import (
|
|
"math"
|
|
"strings"
|
|
"unicode"
|
|
)
|
|
|
|
// CalculateStringSimilarity calculates similarity between two strings using Levenshtein distance
|
|
func CalculateStringSimilarity(s1, s2 string) float64 {
|
|
// Normalize strings
|
|
s1 = normalizeString(s1)
|
|
s2 = normalizeString(s2)
|
|
|
|
if s1 == s2 {
|
|
return 1.0
|
|
}
|
|
|
|
if len(s1) == 0 || len(s2) == 0 {
|
|
return 0.0
|
|
}
|
|
|
|
// Calculate Levenshtein distance
|
|
distance := levenshteinDistance(s1, s2)
|
|
maxLen := math.Max(float64(len(s1)), float64(len(s2)))
|
|
|
|
similarity := 1.0 - (float64(distance) / maxLen)
|
|
return math.Max(0, similarity)
|
|
}
|
|
|
|
// levenshteinDistance calculates the Levenshtein distance between two strings
|
|
func levenshteinDistance(s1, s2 string) int {
|
|
len1 := len(s1)
|
|
len2 := len(s2)
|
|
|
|
// Create a 2D slice for dynamic programming
|
|
dp := make([][]int, len1+1)
|
|
for i := range dp {
|
|
dp[i] = make([]int, len2+1)
|
|
}
|
|
|
|
// Initialize first row and column
|
|
for i := 0; i <= len1; i++ {
|
|
dp[i][0] = i
|
|
}
|
|
for j := 0; j <= len2; j++ {
|
|
dp[0][j] = j
|
|
}
|
|
|
|
// Fill the dp table
|
|
for i := 1; i <= len1; i++ {
|
|
for j := 1; j <= len2; j++ {
|
|
cost := 0
|
|
if s1[i-1] != s2[j-1] {
|
|
cost = 1
|
|
}
|
|
|
|
dp[i][j] = min3(
|
|
dp[i-1][j]+1, // deletion
|
|
dp[i][j-1]+1, // insertion
|
|
dp[i-1][j-1]+cost, // substitution
|
|
)
|
|
}
|
|
}
|
|
|
|
return dp[len1][len2]
|
|
}
|
|
|
|
// ExtractKeywords extracts keywords from a string
|
|
func ExtractKeywords(text string) []string {
|
|
// Normalize and split text
|
|
text = normalizeString(text)
|
|
words := strings.Fields(text)
|
|
|
|
// Filter stopwords and short words
|
|
var keywords []string
|
|
stopwords := getStopwords()
|
|
|
|
for _, word := range words {
|
|
if len(word) > 2 && !contains(stopwords, word) {
|
|
keywords = append(keywords, word)
|
|
}
|
|
}
|
|
|
|
return keywords
|
|
}
|
|
|
|
// FindMatchedKeywords finds common keywords between two lists
|
|
func FindMatchedKeywords(keywords1, keywords2 []string) []string {
|
|
var matched []string
|
|
|
|
for _, k1 := range keywords1 {
|
|
for _, k2 := range keywords2 {
|
|
if strings.EqualFold(k1, k2) || CalculateStringSimilarity(k1, k2) > 0.8 {
|
|
if !contains(matched, k1) {
|
|
matched = append(matched, k1)
|
|
}
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
return matched
|
|
}
|
|
|
|
// normalizeString returns a canonical form of s for comparison: the text is
// lowercased, every rune that is not a letter, digit or whitespace is
// replaced by a space, and runs of whitespace are collapsed into single
// spaces with no leading or trailing whitespace.
func normalizeString(s string) string {
	// Letters, digits and whitespace pass through; anything else (such as
	// punctuation) becomes a separator.
	keepOrBlank := func(r rune) rune {
		switch {
		case unicode.IsLetter(r), unicode.IsDigit(r), unicode.IsSpace(r):
			return r
		default:
			return ' '
		}
	}

	cleaned := strings.Map(keepOrBlank, strings.ToLower(s))

	// Fields splits on any whitespace run and drops empty pieces, so
	// rejoining leaves exactly one space between words and none at the ends.
	return strings.Join(strings.Fields(cleaned), " ")
}
|
|
|
|
// getStopwords returns the words that are ignored during keyword
// extraction: common Indonesian stopwords followed by a few English
// articles and prepositions. A new slice is built on every call.
func getStopwords() []string {
	indonesian := []string{
		"dan", "atau", "dengan", "untuk", "dari", "ke", "di", "yang", "ini", "itu",
		"ada", "adalah", "akan", "telah", "sudah", "pada", "oleh", "sebagai", "dalam",
		"juga", "saya", "kamu", "dia", "kita", "mereka", "kami",
	}
	english := []string{
		"the", "a", "an",
		"of", "to", "in", "for", "on", "at", "by", "with", "from",
	}

	all := make([]string, 0, len(indonesian)+len(english))
	all = append(all, indonesian...)
	return append(all, english...)
}
|
|
|
|
// contains reports whether slice holds an element equal to str, compared
// case-insensitively with strings.EqualFold. It returns false for a nil or
// empty slice.
func contains(slice []string, str string) bool {
	for i := 0; i < len(slice); i++ {
		if strings.EqualFold(slice[i], str) {
			return true
		}
	}
	return false
}
|
|
|
|
// min3 returns the smallest of the three integers a, b and c.
func min3(a, b, c int) int {
	smallest := a
	if b < smallest {
		smallest = b
	}
	if c < smallest {
		smallest = c
	}
	return smallest
}