You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

confusables.go 2.0KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182
  1. //go:generate go run maketables.go > tables.go
  2. package confusables
  3. import (
  4. "bytes"
  5. "golang.org/x/text/unicode/norm"
  6. )
  // TODO: document case-folding approaches
  // (suggest that callers case-fold their strings first; explain how to catch paypal vs. pAypal)
  // TODO: document that callers may want to store the Skeleton and check against it later
  // TODO: implement the restricted characters from xidmodifications.txt
  // lookupFunc maps a rune to its replacement string. skeletonBase treats an
  // empty result as "no replacement" and keeps the rune unchanged.
  type lookupFunc func(rune) string
  12. func lookupReplacement(r rune) string {
  13. return confusablesMap[r]
  14. }
  15. func lookupReplacementTweaked(r rune) string {
  16. if replacement, ok := tweaksMap[r]; ok {
  17. return replacement
  18. }
  19. return confusablesMap[r]
  20. }
  21. func skeletonBase(s string, lookup lookupFunc) string {
  22. // 1. Converting X to NFD format
  23. s = norm.NFD.String(s)
  24. // 2. Successively mapping each source character in X to the target string
  25. // according to the specified data table
  26. var buf bytes.Buffer
  27. changed := false // fast path: if this remains false, keep s intact
  28. prevPos := 0
  29. var replacement string
  30. for i, r := range s {
  31. if changed && replacement == "" {
  32. buf.WriteString(s[prevPos:i])
  33. }
  34. prevPos = i
  35. replacement = lookup(r)
  36. if replacement != "" {
  37. if !changed {
  38. changed = true
  39. // first replacement: copy over the previously unmodified text
  40. buf.WriteString(s[:i])
  41. }
  42. buf.WriteString(replacement)
  43. }
  44. }
  45. if changed && replacement == "" {
  46. buf.WriteString(s[prevPos:]) // loop-and-a-half
  47. }
  48. if changed {
  49. s = buf.String()
  50. }
  51. // 3. Reapplying NFD
  52. s = norm.NFD.String(s)
  53. return s
  54. }
  55. // Skeleton converts a string to its "skeleton" form
  56. // as described in http://www.unicode.org/reports/tr39/#Confusable_Detection
  57. func Skeleton(s string) string {
  58. return skeletonBase(s, lookupReplacement)
  59. }
  60. // SkeletonTweaked is like Skeleton, but it implements some custom overrides
  61. // to the confusables table (currently it removes the m -> rn mapping):
  62. func SkeletonTweaked(s string) string {
  63. return skeletonBase(s, lookupReplacementTweaked)
  64. }
  65. func Confusable(x, y string) bool {
  66. return Skeleton(x) == Skeleton(y)
  67. }