You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

strings.go 4.7KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155
  1. // Copyright (c) 2012-2014 Jeremy Latt
  2. // Copyright (c) 2014-2015 Edmund Huber
  3. // Copyright (c) 2016-2017 Daniel Oaks <daniel@danieloaks.net>
  4. // released under the MIT license
  5. package irc
  6. import (
  7. "strings"
  8. "github.com/oragono/confusables"
  9. "golang.org/x/text/cases"
  10. "golang.org/x/text/language"
  11. "golang.org/x/text/secure/precis"
  12. "golang.org/x/text/width"
  13. )
  14. const (
  15. casemappingName = "rfc8265"
  16. )
  17. // Each pass of PRECIS casefolding is a composition of idempotent operations,
  18. // but not idempotent itself. Therefore, the spec says "do it four times and hope
  19. // it converges" (lolwtf). Golang's PRECIS implementation has a "repeat" option,
  20. // which provides this functionality, but unfortunately it's not exposed publicly.
  21. func iterateFolding(profile *precis.Profile, oldStr string) (str string, err error) {
  22. str = oldStr
  23. // follow the stabilizing rules laid out here:
  24. // https://tools.ietf.org/html/draft-ietf-precis-7564bis-10.html#section-7
  25. for i := 0; i < 4; i++ {
  26. str, err = profile.CompareKey(str)
  27. if err != nil {
  28. return "", err
  29. }
  30. if oldStr == str {
  31. break
  32. }
  33. oldStr = str
  34. }
  35. if oldStr != str {
  36. return "", errCouldNotStabilize
  37. }
  38. return str, nil
  39. }
  40. // Casefold returns a casefolded string, without doing any name or channel character checks.
  41. func Casefold(str string) (string, error) {
  42. return iterateFolding(precis.UsernameCaseMapped, str)
  43. }
  44. // CasefoldChannel returns a casefolded version of a channel name.
  45. func CasefoldChannel(name string) (string, error) {
  46. if len(name) == 0 {
  47. return "", errStringIsEmpty
  48. }
  49. // don't casefold the preceding #'s
  50. var start int
  51. for start = 0; start < len(name) && name[start] == '#'; start += 1 {
  52. }
  53. if start == 0 {
  54. // no preceding #'s
  55. return "", errInvalidCharacter
  56. }
  57. lowered, err := Casefold(name[start:])
  58. if err != nil {
  59. return "", err
  60. }
  61. // space can't be used
  62. // , is used as a separator
  63. // * is used in mask matching
  64. // ? is used in mask matching
  65. if strings.ContainsAny(lowered, " ,*?") {
  66. return "", errInvalidCharacter
  67. }
  68. return name[:start] + lowered, err
  69. }
  70. // CasefoldName returns a casefolded version of a nick/user name.
  71. func CasefoldName(name string) (string, error) {
  72. lowered, err := Casefold(name)
  73. if err != nil {
  74. return "", err
  75. } else if len(lowered) == 0 {
  76. return "", errStringIsEmpty
  77. }
  78. // space can't be used
  79. // , is used as a separator
  80. // * is used in mask matching
  81. // ? is used in mask matching
  82. // . denotes a server name
  83. // ! separates nickname from username
  84. // @ separates username from hostname
  85. // : means trailing
  86. // # is a channel prefix
  87. // ~&@%+ are channel membership prefixes
  88. // - I feel like disallowing
  89. if strings.ContainsAny(lowered, " ,*?.!@:") || strings.ContainsAny(string(lowered[0]), "#~&@%+-") {
  90. return "", errInvalidCharacter
  91. }
  92. return lowered, err
  93. }
  94. // "boring" names are exempt from skeletonization.
  95. // this is because confusables.txt considers various pure ASCII alphanumeric
  96. // strings confusable: 0 and O, 1 and l, m and rn. IMO this causes more problems
  97. // than it solves.
  98. func isBoring(name string) bool {
  99. for i := 0; i < len(name); i += 1 {
  100. chr := name[i]
  101. if (chr >= 'a' && chr <= 'z') || (chr >= 'A' && chr <= 'Z') || (chr >= '0' && chr <= '9') {
  102. continue // alphanumerics
  103. }
  104. switch chr {
  105. case '$', '%', '^', '&', '(', ')', '{', '}', '[', ']', '<', '>', '=':
  106. continue // benign printable ascii characters
  107. default:
  108. return false // potentially confusable ascii like | ' `, non-ascii
  109. }
  110. }
  111. return true
  112. }
  113. // Skeleton produces a canonicalized identifier that tries to catch
  114. // homoglyphic / confusable identifiers. It's a tweaked version of the TR39
  115. // skeleton algorithm. We apply the skeleton algorithm first and only then casefold,
  116. // because casefolding first would lose some information about visual confusability.
  117. // This has the weird consequence that the skeleton is not a function of the
  118. // casefolded identifier --- therefore it must always be computed
  119. // from the original (unfolded) identifier and stored/tracked separately from the
  120. // casefolded identifier.
  121. func Skeleton(name string) (string, error) {
  122. if !isBoring(name) {
  123. name = confusables.Skeleton(name)
  124. }
  125. // XXX the confusables table includes some, but not all, fullwidth->standard
  126. // mappings for latin characters. do a pass of explicit width folding,
  127. // same as PRECIS:
  128. name = width.Fold.String(name)
  129. // internationalized lowercasing for skeletons; this is much more lenient than
  130. // Casefold. In particular, skeletons are expected to mix scripts (which may
  131. // violate the bidi rule). We also don't care if they contain runes
  132. // that are disallowed by PRECIS, because every identifier must independently
  133. // pass PRECIS --- we are just further canonicalizing the skeleton.
  134. return cases.Lower(language.Und).String(name), nil
  135. }