// 2025-11-23 22:49:46 +07:00
//
// 160 lines
// 3.4 KiB
// Go

// internal/utils/similarity.go
package utils
import (
"math"
"strings"
"unicode"
)
// CalculateStringSimilarity returns a similarity score in [0, 1] for two
// strings, derived from their Levenshtein distance.
//
// Both inputs are first passed through normalizeString. Strings that are equal
// after normalization score 1.0; if exactly one of them is empty after
// normalization the score is 0.0. Otherwise the score is
// 1 - distance/maxLen, where distance and maxLen are both measured in runes
// (Unicode code points) rather than bytes, so a multi-byte character weighs
// the same as an ASCII one.
func CalculateStringSimilarity(s1, s2 string) float64 {
	s1 = normalizeString(s1)
	s2 = normalizeString(s2)

	if s1 == s2 {
		return 1.0
	}
	if len(s1) == 0 || len(s2) == 0 {
		return 0.0
	}

	// The denominator must use the same unit as levenshteinDistance (runes);
	// dividing a rune-based distance by a byte length would inflate the score
	// for non-ASCII text.
	maxLen := len([]rune(s1))
	if n := len([]rune(s2)); n > maxLen {
		maxLen = n
	}

	similarity := 1.0 - float64(levenshteinDistance(s1, s2))/float64(maxLen)
	// The distance never exceeds maxLen, so this clamp is only a safety net.
	return math.Max(0, similarity)
}

// levenshteinDistance returns the Levenshtein edit distance between s1 and s2:
// the minimum number of single-character insertions, deletions, and
// substitutions needed to turn one string into the other.
//
// Characters are compared as runes, not bytes, so replacing one multi-byte
// UTF-8 character costs a single edit. Only two rows of the dynamic-programming
// table are kept at a time, so memory use grows with the rune length of s2
// instead of with the product of both lengths.
func levenshteinDistance(s1, s2 string) int {
	a := []rune(s1)
	b := []rune(s2)

	// prev is the finished row for the previous prefix of a; curr is the row
	// currently being filled.
	prev := make([]int, len(b)+1)
	curr := make([]int, len(b)+1)
	for j := range prev {
		prev[j] = j // empty prefix of a -> first j runes of b: j insertions
	}

	for i := 1; i <= len(a); i++ {
		curr[0] = i // first i runes of a -> empty prefix of b: i deletions
		for j := 1; j <= len(b); j++ {
			cost := 0
			if a[i-1] != b[j-1] {
				cost = 1
			}
			curr[j] = min3(
				prev[j]+1,      // deletion
				curr[j-1]+1,    // insertion
				prev[j-1]+cost, // substitution
			)
		}
		// The row just filled becomes the "previous" row of the next pass.
		prev, curr = curr, prev
	}
	return prev[len(b)]
}
// ExtractKeywords returns the words of text that are worth matching on.
//
// The text is normalized with normalizeString and split on whitespace. Words
// of two characters or fewer and words found in getStopwords are dropped.
// Word length is counted in runes rather than bytes, so a short word written
// with multi-byte letters (for example "ñu") is filtered just like a short
// ASCII word. The result keeps the input word order, may contain duplicates,
// and is nil when no word qualifies.
func ExtractKeywords(text string) []string {
	words := strings.Fields(normalizeString(text))
	stopwords := getStopwords()

	var keywords []string
	for _, word := range words {
		// len(word) would count bytes and let two-letter non-ASCII words
		// through; converting to runes counts characters.
		if len([]rune(word)) <= 2 || contains(stopwords, word) {
			continue
		}
		keywords = append(keywords, word)
	}
	return keywords
}
// FindMatchedKeywords returns the entries of keywords1 that match at least one
// entry of keywords2.
//
// Two keywords match when they are equal ignoring case or when
// CalculateStringSimilarity rates them above 0.8. The result follows the order
// of keywords1 and lists each keyword at most once (duplicates are detected
// with contains). It is nil when nothing matches.
func FindMatchedKeywords(keywords1, keywords2 []string) []string {
	var common []string
	for _, candidate := range keywords1 {
		// Stop scanning keywords2 at the first counterpart that matches.
		hit := false
		for _, other := range keywords2 {
			if strings.EqualFold(candidate, other) || CalculateStringSimilarity(candidate, other) > 0.8 {
				hit = true
				break
			}
		}
		if hit && !contains(common, candidate) {
			common = append(common, candidate)
		}
	}
	return common
}
// normalizeString canonicalizes s for comparison: it lowercases the text,
// replaces every rune that is not a letter, digit, or whitespace with a space,
// and collapses each run of whitespace into a single space with no leading or
// trailing whitespace.
func normalizeString(s string) string {
	// keepOrBlank passes word characters and whitespace through and blanks
	// out everything else (punctuation, symbols, ...).
	keepOrBlank := func(r rune) rune {
		if unicode.IsLetter(r) || unicode.IsDigit(r) || unicode.IsSpace(r) {
			return r
		}
		return ' '
	}
	cleaned := strings.Map(keepOrBlank, strings.ToLower(s))

	// Fields splits on runs of whitespace and discards leading/trailing
	// whitespace, so re-joining with one space gives the collapsed form.
	return strings.Join(strings.Fields(cleaned), " ")
}
// getStopwords returns the stopword list: common Indonesian function words
// followed by a handful of English ones. A new slice is built on every call,
// so the caller may modify the result without affecting later calls.
func getStopwords() []string {
	indonesian := []string{
		"dan", "atau", "dengan", "untuk", "dari", "ke", "di", "yang", "ini", "itu",
		"ada", "adalah", "akan", "telah", "sudah", "pada", "oleh", "sebagai", "dalam",
		"juga", "saya", "kamu", "dia", "kita", "mereka", "kami",
	}
	english := []string{
		"the", "a", "an",
		"of", "to", "in", "for", "on", "at", "by", "with", "from",
	}

	all := make([]string, 0, len(indonesian)+len(english))
	all = append(all, indonesian...)
	all = append(all, english...)
	return all
}
// contains reports whether slice holds an element equal to str, ignoring case
// (comparison is done with strings.EqualFold). A nil or empty slice never
// contains anything.
func contains(slice []string, str string) bool {
	for i := 0; i < len(slice); i++ {
		if strings.EqualFold(slice[i], str) {
			return true
		}
	}
	return false
}
// min3 returns the smallest of the three integers a, b, and c.
func min3(a, b, c int) int {
	smallest := a
	if b < smallest {
		smallest = b
	}
	if c < smallest {
		smallest = c
	}
	return smallest
}