You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

735 lines
25 KiB

  1. // Copyright 2013 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package language
  5. import (
  6. "errors"
  7. "strings"
  8. "golang.org/x/text/internal/language"
  9. )
  10. // A MatchOption configures a Matcher.
  11. type MatchOption func(*matcher)
  12. // PreferSameScript will, in the absence of a match, result in the first
  13. // preferred tag with the same script as a supported tag to match this supported
  14. // tag. The default is currently true, but this may change in the future.
  15. func PreferSameScript(preferSame bool) MatchOption {
  16. return func(m *matcher) { m.preferSameScript = preferSame }
  17. }
  18. // TODO(v1.0.0): consider making Matcher a concrete type, instead of interface.
  19. // There doesn't seem to be too much need for multiple types.
  20. // Making it a concrete type allows MatchStrings to be a method, which will
  21. // improve its discoverability.
  22. // MatchStrings parses and matches the given strings until one of them matches
  23. // the language in the Matcher. A string may be an Accept-Language header as
  24. // handled by ParseAcceptLanguage. The default language is returned if no
  25. // other language matched.
  26. func MatchStrings(m Matcher, lang ...string) (tag Tag, index int) {
  27. for _, accept := range lang {
  28. desired, _, err := ParseAcceptLanguage(accept)
  29. if err != nil {
  30. continue
  31. }
  32. if tag, index, conf := m.Match(desired...); conf != No {
  33. return tag, index
  34. }
  35. }
  36. tag, index, _ = m.Match()
  37. return
  38. }
  39. // Matcher is the interface that wraps the Match method.
  40. //
  41. // Match returns the best match for any of the given tags, along with
  42. // a unique index associated with the returned tag and a confidence
  43. // score.
  44. type Matcher interface {
  45. Match(t ...Tag) (tag Tag, index int, c Confidence)
  46. }
  47. // Comprehends reports the confidence score for a speaker of a given language
  48. // to being able to comprehend the written form of an alternative language.
  49. func Comprehends(speaker, alternative Tag) Confidence {
  50. _, _, c := NewMatcher([]Tag{alternative}).Match(speaker)
  51. return c
  52. }
  53. // NewMatcher returns a Matcher that matches an ordered list of preferred tags
  54. // against a list of supported tags based on written intelligibility, closeness
  55. // of dialect, equivalence of subtags and various other rules. It is initialized
  56. // with the list of supported tags. The first element is used as the default
  57. // value in case no match is found.
  58. //
  59. // Its Match method matches the first of the given Tags to reach a certain
  60. // confidence threshold. The tags passed to Match should therefore be specified
  61. // in order of preference. Extensions are ignored for matching.
  62. //
  63. // The index returned by the Match method corresponds to the index of the
  64. // matched tag in t, but is augmented with the Unicode extension ('u')of the
  65. // corresponding preferred tag. This allows user locale options to be passed
  66. // transparently.
  67. func NewMatcher(t []Tag, options ...MatchOption) Matcher {
  68. return newMatcher(t, options)
  69. }
  70. func (m *matcher) Match(want ...Tag) (t Tag, index int, c Confidence) {
  71. var tt language.Tag
  72. match, w, c := m.getBest(want...)
  73. if match != nil {
  74. tt, index = match.tag, match.index
  75. } else {
  76. // TODO: this should be an option
  77. tt = m.default_.tag
  78. if m.preferSameScript {
  79. outer:
  80. for _, w := range want {
  81. script, _ := w.Script()
  82. if script.scriptID == 0 {
  83. // Don't do anything if there is no script, such as with
  84. // private subtags.
  85. continue
  86. }
  87. for i, h := range m.supported {
  88. if script.scriptID == h.maxScript {
  89. tt, index = h.tag, i
  90. break outer
  91. }
  92. }
  93. }
  94. }
  95. // TODO: select first language tag based on script.
  96. }
  97. if w.RegionID != tt.RegionID && w.RegionID != 0 {
  98. if w.RegionID != 0 && tt.RegionID != 0 && tt.RegionID.Contains(w.RegionID) {
  99. tt.RegionID = w.RegionID
  100. tt.RemakeString()
  101. } else if r := w.RegionID.String(); len(r) == 2 {
  102. // TODO: also filter macro and deprecated.
  103. tt, _ = tt.SetTypeForKey("rg", strings.ToLower(r)+"zzzz")
  104. }
  105. }
  106. // Copy options from the user-provided tag into the result tag. This is hard
  107. // to do after the fact, so we do it here.
  108. // TODO: add in alternative variants to -u-va-.
  109. // TODO: add preferred region to -u-rg-.
  110. if e := w.Extensions(); len(e) > 0 {
  111. b := language.Builder{}
  112. b.SetTag(tt)
  113. for _, e := range e {
  114. b.AddExt(e)
  115. }
  116. tt = b.Make()
  117. }
  118. return makeTag(tt), index, c
  119. }
  120. // ErrMissingLikelyTagsData indicates no information was available
  121. // to compute likely values of missing tags.
  122. var ErrMissingLikelyTagsData = errors.New("missing likely tags data")
  123. // func (t *Tag) setTagsFrom(id Tag) {
  124. // t.LangID = id.LangID
  125. // t.ScriptID = id.ScriptID
  126. // t.RegionID = id.RegionID
  127. // }
  128. // Tag Matching
  129. // CLDR defines an algorithm for finding the best match between two sets of language
  130. // tags. The basic algorithm defines how to score a possible match and then find
  131. // the match with the best score
  132. // (see https://www.unicode.org/reports/tr35/#LanguageMatching).
  133. // Using scoring has several disadvantages. The scoring obfuscates the importance of
  134. // the various factors considered, making the algorithm harder to understand. Using
  135. // scoring also requires the full score to be computed for each pair of tags.
  136. //
  137. // We will use a different algorithm which aims to have the following properties:
  138. // - clarity on the precedence of the various selection factors, and
  139. // - improved performance by allowing early termination of a comparison.
  140. //
  141. // Matching algorithm (overview)
  142. // Input:
  143. // - supported: a set of supported tags
  144. // - default: the default tag to return in case there is no match
  145. // - desired: list of desired tags, ordered by preference, starting with
  146. // the most-preferred.
  147. //
  148. // Algorithm:
  149. // 1) Set the best match to the lowest confidence level
  150. // 2) For each tag in "desired":
  151. // a) For each tag in "supported":
  152. // 1) compute the match between the two tags.
  153. // 2) if the match is better than the previous best match, replace it
  154. // with the new match. (see next section)
//   b) if the current best match is Exact and pin is true the result will be
//      frozen to the language found thus far, although better matches may
//      still be found for the same language.
  158. // 3) If the best match so far is below a certain threshold, return "default".
  159. //
  160. // Ranking:
  161. // We use two phases to determine whether one pair of tags are a better match
  162. // than another pair of tags. First, we determine a rough confidence level. If the
  163. // levels are different, the one with the highest confidence wins.
  164. // Second, if the rough confidence levels are identical, we use a set of tie-breaker
  165. // rules.
  166. //
  167. // The confidence level of matching a pair of tags is determined by finding the
  168. // lowest confidence level of any matches of the corresponding subtags (the
  169. // result is deemed as good as its weakest link).
  170. // We define the following levels:
  171. // Exact - An exact match of a subtag, before adding likely subtags.
  172. // MaxExact - An exact match of a subtag, after adding likely subtags.
  173. // [See Note 2].
  174. // High - High level of mutual intelligibility between different subtag
  175. // variants.
  176. // Low - Low level of mutual intelligibility between different subtag
  177. // variants.
  178. // No - No mutual intelligibility.
  179. //
  180. // The following levels can occur for each type of subtag:
  181. // Base: Exact, MaxExact, High, Low, No
  182. // Script: Exact, MaxExact [see Note 3], Low, No
  183. // Region: Exact, MaxExact, High
  184. // Variant: Exact, High
  185. // Private: Exact, No
  186. //
  187. // Any result with a confidence level of Low or higher is deemed a possible match.
  188. // Once a desired tag matches any of the supported tags with a level of MaxExact
  189. // or higher, the next desired tag is not considered (see Step 2.b).
  190. // Note that CLDR provides languageMatching data that defines close equivalence
  191. // classes for base languages, scripts and regions.
  192. //
  193. // Tie-breaking
  194. // If we get the same confidence level for two matches, we apply a sequence of
  195. // tie-breaking rules. The first that succeeds defines the result. The rules are
  196. // applied in the following order.
  197. // 1) Original language was defined and was identical.
  198. // 2) Original region was defined and was identical.
  199. // 3) Distance between two maximized regions was the smallest.
  200. // 4) Original script was defined and was identical.
  201. // 5) Distance from want tag to have tag using the parent relation [see Note 5.]
  202. // If there is still no winner after these rules are applied, the first match
  203. // found wins.
  204. //
  205. // Notes:
  206. // [2] In practice, as matching of Exact is done in a separate phase from
  207. // matching the other levels, we reuse the Exact level to mean MaxExact in
  208. // the second phase. As a consequence, we only need the levels defined by
  209. // the Confidence type. The MaxExact confidence level is mapped to High in
  210. // the public API.
  211. // [3] We do not differentiate between maximized script values that were derived
  212. // from suppressScript versus most likely tag data. We determined that in
  213. // ranking the two, one ranks just after the other. Moreover, the two cannot
  214. // occur concurrently. As a consequence, they are identical for practical
  215. // purposes.
  216. // [4] In case of deprecated, macro-equivalents and legacy mappings, we assign
  217. // the MaxExact level to allow iw vs he to still be a closer match than
  218. // en-AU vs en-US, for example.
  219. // [5] In CLDR a locale inherits fields that are unspecified for this locale
  220. // from its parent. Therefore, if a locale is a parent of another locale,
  221. // it is a strong measure for closeness, especially when no other tie
  222. // breaker rule applies. One could also argue it is inconsistent, for
  223. // example, when pt-AO matches pt (which CLDR equates with pt-BR), even
  224. // though its parent is pt-PT according to the inheritance rules.
  225. //
  226. // Implementation Details:
  227. // There are several performance considerations worth pointing out. Most notably,
  228. // we preprocess as much as possible (within reason) at the time of creation of a
  229. // matcher. This includes:
  230. // - creating a per-language map, which includes data for the raw base language
  231. // and its canonicalized variant (if applicable),
  232. // - expanding entries for the equivalence classes defined in CLDR's
  233. // languageMatch data.
  234. // The per-language map ensures that typically only a very small number of tags
  235. // need to be considered. The pre-expansion of canonicalized subtags and
  236. // equivalence classes reduces the amount of map lookups that need to be done at
  237. // runtime.
  238. // matcher keeps a set of supported language tags, indexed by language.
  239. type matcher struct {
  240. default_ *haveTag
  241. supported []*haveTag
  242. index map[language.Language]*matchHeader
  243. passSettings bool
  244. preferSameScript bool
  245. }
  246. // matchHeader has the lists of tags for exact matches and matches based on
  247. // maximized and canonicalized tags for a given language.
  248. type matchHeader struct {
  249. haveTags []*haveTag
  250. original bool
  251. }
  252. // haveTag holds a supported Tag and its maximized script and region. The maximized
  253. // or canonicalized language is not stored as it is not needed during matching.
  254. type haveTag struct {
  255. tag language.Tag
  256. // index of this tag in the original list of supported tags.
  257. index int
  258. // conf is the maximum confidence that can result from matching this haveTag.
  259. // When conf < Exact this means it was inserted after applying a CLDR equivalence rule.
  260. conf Confidence
  261. // Maximized region and script.
  262. maxRegion language.Region
  263. maxScript language.Script
  264. // altScript may be checked as an alternative match to maxScript. If altScript
  265. // matches, the confidence level for this match is Low. Theoretically there
  266. // could be multiple alternative scripts. This does not occur in practice.
  267. altScript language.Script
  268. // nextMax is the index of the next haveTag with the same maximized tags.
  269. nextMax uint16
  270. }
  271. func makeHaveTag(tag language.Tag, index int) (haveTag, language.Language) {
  272. max := tag
  273. if tag.LangID != 0 || tag.RegionID != 0 || tag.ScriptID != 0 {
  274. max, _ = canonicalize(All, max)
  275. max, _ = max.Maximize()
  276. max.RemakeString()
  277. }
  278. return haveTag{tag, index, Exact, max.RegionID, max.ScriptID, altScript(max.LangID, max.ScriptID), 0}, max.LangID
  279. }
  280. // altScript returns an alternative script that may match the given script with
  281. // a low confidence. At the moment, the langMatch data allows for at most one
  282. // script to map to another and we rely on this to keep the code simple.
  283. func altScript(l language.Language, s language.Script) language.Script {
  284. for _, alt := range matchScript {
  285. // TODO: also match cases where language is not the same.
  286. if (language.Language(alt.wantLang) == l || language.Language(alt.haveLang) == l) &&
  287. language.Script(alt.haveScript) == s {
  288. return language.Script(alt.wantScript)
  289. }
  290. }
  291. return 0
  292. }
  293. // addIfNew adds a haveTag to the list of tags only if it is a unique tag.
  294. // Tags that have the same maximized values are linked by index.
  295. func (h *matchHeader) addIfNew(n haveTag, exact bool) {
  296. h.original = h.original || exact
  297. // Don't add new exact matches.
  298. for _, v := range h.haveTags {
  299. if equalsRest(v.tag, n.tag) {
  300. return
  301. }
  302. }
  303. // Allow duplicate maximized tags, but create a linked list to allow quickly
  304. // comparing the equivalents and bail out.
  305. for i, v := range h.haveTags {
  306. if v.maxScript == n.maxScript &&
  307. v.maxRegion == n.maxRegion &&
  308. v.tag.VariantOrPrivateUseTags() == n.tag.VariantOrPrivateUseTags() {
  309. for h.haveTags[i].nextMax != 0 {
  310. i = int(h.haveTags[i].nextMax)
  311. }
  312. h.haveTags[i].nextMax = uint16(len(h.haveTags))
  313. break
  314. }
  315. }
  316. h.haveTags = append(h.haveTags, &n)
  317. }
  318. // header returns the matchHeader for the given language. It creates one if
  319. // it doesn't already exist.
  320. func (m *matcher) header(l language.Language) *matchHeader {
  321. if h := m.index[l]; h != nil {
  322. return h
  323. }
  324. h := &matchHeader{}
  325. m.index[l] = h
  326. return h
  327. }
  328. func toConf(d uint8) Confidence {
  329. if d <= 10 {
  330. return High
  331. }
  332. if d < 30 {
  333. return Low
  334. }
  335. return No
  336. }
  337. // newMatcher builds an index for the given supported tags and returns it as
  338. // a matcher. It also expands the index by considering various equivalence classes
  339. // for a given tag.
  340. func newMatcher(supported []Tag, options []MatchOption) *matcher {
  341. m := &matcher{
  342. index: make(map[language.Language]*matchHeader),
  343. preferSameScript: true,
  344. }
  345. for _, o := range options {
  346. o(m)
  347. }
  348. if len(supported) == 0 {
  349. m.default_ = &haveTag{}
  350. return m
  351. }
  352. // Add supported languages to the index. Add exact matches first to give
  353. // them precedence.
  354. for i, tag := range supported {
  355. tt := tag.tag()
  356. pair, _ := makeHaveTag(tt, i)
  357. m.header(tt.LangID).addIfNew(pair, true)
  358. m.supported = append(m.supported, &pair)
  359. }
  360. m.default_ = m.header(supported[0].lang()).haveTags[0]
  361. // Keep these in two different loops to support the case that two equivalent
  362. // languages are distinguished, such as iw and he.
  363. for i, tag := range supported {
  364. tt := tag.tag()
  365. pair, max := makeHaveTag(tt, i)
  366. if max != tt.LangID {
  367. m.header(max).addIfNew(pair, true)
  368. }
  369. }
  370. // update is used to add indexes in the map for equivalent languages.
  371. // update will only add entries to original indexes, thus not computing any
  372. // transitive relations.
  373. update := func(want, have uint16, conf Confidence) {
  374. if hh := m.index[language.Language(have)]; hh != nil {
  375. if !hh.original {
  376. return
  377. }
  378. hw := m.header(language.Language(want))
  379. for _, ht := range hh.haveTags {
  380. v := *ht
  381. if conf < v.conf {
  382. v.conf = conf
  383. }
  384. v.nextMax = 0 // this value needs to be recomputed
  385. if v.altScript != 0 {
  386. v.altScript = altScript(language.Language(want), v.maxScript)
  387. }
  388. hw.addIfNew(v, conf == Exact && hh.original)
  389. }
  390. }
  391. }
  392. // Add entries for languages with mutual intelligibility as defined by CLDR's
  393. // languageMatch data.
  394. for _, ml := range matchLang {
  395. update(ml.want, ml.have, toConf(ml.distance))
  396. if !ml.oneway {
  397. update(ml.have, ml.want, toConf(ml.distance))
  398. }
  399. }
  400. // Add entries for possible canonicalizations. This is an optimization to
  401. // ensure that only one map lookup needs to be done at runtime per desired tag.
  402. // First we match deprecated equivalents. If they are perfect equivalents
  403. // (their canonicalization simply substitutes a different language code, but
  404. // nothing else), the match confidence is Exact, otherwise it is High.
  405. for i, lm := range language.AliasMap {
  406. // If deprecated codes match and there is no fiddling with the script or
  407. // or region, we consider it an exact match.
  408. conf := Exact
  409. if language.AliasTypes[i] != language.Macro {
  410. if !isExactEquivalent(language.Language(lm.From)) {
  411. conf = High
  412. }
  413. update(lm.To, lm.From, conf)
  414. }
  415. update(lm.From, lm.To, conf)
  416. }
  417. return m
  418. }
  419. // getBest gets the best matching tag in m for any of the given tags, taking into
  420. // account the order of preference of the given tags.
  421. func (m *matcher) getBest(want ...Tag) (got *haveTag, orig language.Tag, c Confidence) {
  422. best := bestMatch{}
  423. for i, ww := range want {
  424. w := ww.tag()
  425. var max language.Tag
  426. // Check for exact match first.
  427. h := m.index[w.LangID]
  428. if w.LangID != 0 {
  429. if h == nil {
  430. continue
  431. }
  432. // Base language is defined.
  433. max, _ = canonicalize(Legacy|Deprecated|Macro, w)
  434. // A region that is added through canonicalization is stronger than
  435. // a maximized region: set it in the original (e.g. mo -> ro-MD).
  436. if w.RegionID != max.RegionID {
  437. w.RegionID = max.RegionID
  438. }
  439. // TODO: should we do the same for scripts?
  440. // See test case: en, sr, nl ; sh ; sr
  441. max, _ = max.Maximize()
  442. } else {
  443. // Base language is not defined.
  444. if h != nil {
  445. for i := range h.haveTags {
  446. have := h.haveTags[i]
  447. if equalsRest(have.tag, w) {
  448. return have, w, Exact
  449. }
  450. }
  451. }
  452. if w.ScriptID == 0 && w.RegionID == 0 {
  453. // We skip all tags matching und for approximate matching, including
  454. // private tags.
  455. continue
  456. }
  457. max, _ = w.Maximize()
  458. if h = m.index[max.LangID]; h == nil {
  459. continue
  460. }
  461. }
  462. pin := true
  463. for _, t := range want[i+1:] {
  464. if w.LangID == t.lang() {
  465. pin = false
  466. break
  467. }
  468. }
  469. // Check for match based on maximized tag.
  470. for i := range h.haveTags {
  471. have := h.haveTags[i]
  472. best.update(have, w, max.ScriptID, max.RegionID, pin)
  473. if best.conf == Exact {
  474. for have.nextMax != 0 {
  475. have = h.haveTags[have.nextMax]
  476. best.update(have, w, max.ScriptID, max.RegionID, pin)
  477. }
  478. return best.have, best.want, best.conf
  479. }
  480. }
  481. }
  482. if best.conf <= No {
  483. if len(want) != 0 {
  484. return nil, want[0].tag(), No
  485. }
  486. return nil, language.Tag{}, No
  487. }
  488. return best.have, best.want, best.conf
  489. }
  490. // bestMatch accumulates the best match so far.
  491. type bestMatch struct {
  492. have *haveTag
  493. want language.Tag
  494. conf Confidence
  495. pinnedRegion language.Region
  496. pinLanguage bool
  497. sameRegionGroup bool
  498. // Cached results from applying tie-breaking rules.
  499. origLang bool
  500. origReg bool
  501. paradigmReg bool
  502. regGroupDist uint8
  503. origScript bool
  504. }
  505. // update updates the existing best match if the new pair is considered to be a
  506. // better match. To determine if the given pair is a better match, it first
  507. // computes the rough confidence level. If this surpasses the current match, it
  508. // will replace it and update the tie-breaker rule cache. If there is a tie, it
  509. // proceeds with applying a series of tie-breaker rules. If there is no
  510. // conclusive winner after applying the tie-breaker rules, it leaves the current
  511. // match as the preferred match.
  512. //
  513. // If pin is true and have and tag are a strong match, it will henceforth only
  514. // consider matches for this language. This corresponds to the nothing that most
  515. // users have a strong preference for the first defined language. A user can
  516. // still prefer a second language over a dialect of the preferred language by
  517. // explicitly specifying dialects, e.g. "en, nl, en-GB". In this case pin should
  518. // be false.
  519. func (m *bestMatch) update(have *haveTag, tag language.Tag, maxScript language.Script, maxRegion language.Region, pin bool) {
  520. // Bail if the maximum attainable confidence is below that of the current best match.
  521. c := have.conf
  522. if c < m.conf {
  523. return
  524. }
  525. // Don't change the language once we already have found an exact match.
  526. if m.pinLanguage && tag.LangID != m.want.LangID {
  527. return
  528. }
  529. // Pin the region group if we are comparing tags for the same language.
  530. if tag.LangID == m.want.LangID && m.sameRegionGroup {
  531. _, sameGroup := regionGroupDist(m.pinnedRegion, have.maxRegion, have.maxScript, m.want.LangID)
  532. if !sameGroup {
  533. return
  534. }
  535. }
  536. if c == Exact && have.maxScript == maxScript {
  537. // If there is another language and then another entry of this language,
  538. // don't pin anything, otherwise pin the language.
  539. m.pinLanguage = pin
  540. }
  541. if equalsRest(have.tag, tag) {
  542. } else if have.maxScript != maxScript {
  543. // There is usually very little comprehension between different scripts.
  544. // In a few cases there may still be Low comprehension. This possibility
  545. // is pre-computed and stored in have.altScript.
  546. if Low < m.conf || have.altScript != maxScript {
  547. return
  548. }
  549. c = Low
  550. } else if have.maxRegion != maxRegion {
  551. if High < c {
  552. // There is usually a small difference between languages across regions.
  553. c = High
  554. }
  555. }
  556. // We store the results of the computations of the tie-breaker rules along
  557. // with the best match. There is no need to do the checks once we determine
  558. // we have a winner, but we do still need to do the tie-breaker computations.
  559. // We use "beaten" to keep track if we still need to do the checks.
  560. beaten := false // true if the new pair defeats the current one.
  561. if c != m.conf {
  562. if c < m.conf {
  563. return
  564. }
  565. beaten = true
  566. }
  567. // Tie-breaker rules:
  568. // We prefer if the pre-maximized language was specified and identical.
  569. origLang := have.tag.LangID == tag.LangID && tag.LangID != 0
  570. if !beaten && m.origLang != origLang {
  571. if m.origLang {
  572. return
  573. }
  574. beaten = true
  575. }
  576. // We prefer if the pre-maximized region was specified and identical.
  577. origReg := have.tag.RegionID == tag.RegionID && tag.RegionID != 0
  578. if !beaten && m.origReg != origReg {
  579. if m.origReg {
  580. return
  581. }
  582. beaten = true
  583. }
  584. regGroupDist, sameGroup := regionGroupDist(have.maxRegion, maxRegion, maxScript, tag.LangID)
  585. if !beaten && m.regGroupDist != regGroupDist {
  586. if regGroupDist > m.regGroupDist {
  587. return
  588. }
  589. beaten = true
  590. }
  591. paradigmReg := isParadigmLocale(tag.LangID, have.maxRegion)
  592. if !beaten && m.paradigmReg != paradigmReg {
  593. if !paradigmReg {
  594. return
  595. }
  596. beaten = true
  597. }
  598. // Next we prefer if the pre-maximized script was specified and identical.
  599. origScript := have.tag.ScriptID == tag.ScriptID && tag.ScriptID != 0
  600. if !beaten && m.origScript != origScript {
  601. if m.origScript {
  602. return
  603. }
  604. beaten = true
  605. }
  606. // Update m to the newly found best match.
  607. if beaten {
  608. m.have = have
  609. m.want = tag
  610. m.conf = c
  611. m.pinnedRegion = maxRegion
  612. m.sameRegionGroup = sameGroup
  613. m.origLang = origLang
  614. m.origReg = origReg
  615. m.paradigmReg = paradigmReg
  616. m.origScript = origScript
  617. m.regGroupDist = regGroupDist
  618. }
  619. }
  620. func isParadigmLocale(lang language.Language, r language.Region) bool {
  621. for _, e := range paradigmLocales {
  622. if language.Language(e[0]) == lang && (r == language.Region(e[1]) || r == language.Region(e[2])) {
  623. return true
  624. }
  625. }
  626. return false
  627. }
  628. // regionGroupDist computes the distance between two regions based on their
  629. // CLDR grouping.
  630. func regionGroupDist(a, b language.Region, script language.Script, lang language.Language) (dist uint8, same bool) {
  631. const defaultDistance = 4
  632. aGroup := uint(regionToGroups[a]) << 1
  633. bGroup := uint(regionToGroups[b]) << 1
  634. for _, ri := range matchRegion {
  635. if language.Language(ri.lang) == lang && (ri.script == 0 || language.Script(ri.script) == script) {
  636. group := uint(1 << (ri.group &^ 0x80))
  637. if 0x80&ri.group == 0 {
  638. if aGroup&bGroup&group != 0 { // Both regions are in the group.
  639. return ri.distance, ri.distance == defaultDistance
  640. }
  641. } else {
  642. if (aGroup|bGroup)&group == 0 { // Both regions are not in the group.
  643. return ri.distance, ri.distance == defaultDistance
  644. }
  645. }
  646. }
  647. }
  648. return defaultDistance, true
  649. }
  650. // equalsRest compares everything except the language.
  651. func equalsRest(a, b language.Tag) bool {
  652. // TODO: don't include extensions in this comparison. To do this efficiently,
  653. // though, we should handle private tags separately.
  654. return a.ScriptID == b.ScriptID && a.RegionID == b.RegionID && a.VariantOrPrivateUseTags() == b.VariantOrPrivateUseTags()
  655. }
  656. // isExactEquivalent returns true if canonicalizing the language will not alter
  657. // the script or region of a tag.
  658. func isExactEquivalent(l language.Language) bool {
  659. for _, o := range notEquivalent {
  660. if o == l {
  661. return false
  662. }
  663. }
  664. return true
  665. }
  666. var notEquivalent []language.Language
  667. func init() {
  668. // Create a list of all languages for which canonicalization may alter the
  669. // script or region.
  670. for _, lm := range language.AliasMap {
  671. tag := language.Tag{LangID: language.Language(lm.From)}
  672. if tag, _ = canonicalize(All, tag); tag.ScriptID != 0 || tag.RegionID != 0 {
  673. notEquivalent = append(notEquivalent, language.Language(lm.From))
  674. }
  675. }
  676. // Maximize undefined regions of paradigm locales.
  677. for i, v := range paradigmLocales {
  678. t := language.Tag{LangID: language.Language(v[0])}
  679. max, _ := t.Maximize()
  680. if v[1] == 0 {
  681. paradigmLocales[i][1] = uint16(max.RegionID)
  682. }
  683. if v[2] == 0 {
  684. paradigmLocales[i][2] = uint16(max.RegionID)
  685. }
  686. }
  687. }