You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

605 lines
19 KiB

  1. // Copyright 2013 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. //go:generate go run gen.go -output tables.go
  5. package language
  6. // TODO: Remove above NOTE after:
  7. // - verifying that tables are dropped correctly (most notably matcher tables).
import (
	"strconv"
	"strings"

	"golang.org/x/text/internal/language"
	"golang.org/x/text/internal/language/compact"
)
  13. // Tag represents a BCP 47 language tag. It is used to specify an instance of a
  14. // specific language or locale. All language tag values are guaranteed to be
  15. // well-formed.
  16. type Tag compact.Tag
  17. func makeTag(t language.Tag) (tag Tag) {
  18. return Tag(compact.Make(t))
  19. }
  20. func (t *Tag) tag() language.Tag {
  21. return (*compact.Tag)(t).Tag()
  22. }
  23. func (t *Tag) isCompact() bool {
  24. return (*compact.Tag)(t).IsCompact()
  25. }
  26. // TODO: improve performance.
  27. func (t *Tag) lang() language.Language { return t.tag().LangID }
  28. func (t *Tag) region() language.Region { return t.tag().RegionID }
  29. func (t *Tag) script() language.Script { return t.tag().ScriptID }
  30. // Make is a convenience wrapper for Parse that omits the error.
  31. // In case of an error, a sensible default is returned.
  32. func Make(s string) Tag {
  33. return Default.Make(s)
  34. }
  35. // Make is a convenience wrapper for c.Parse that omits the error.
  36. // In case of an error, a sensible default is returned.
  37. func (c CanonType) Make(s string) Tag {
  38. t, _ := c.Parse(s)
  39. return t
  40. }
  41. // Raw returns the raw base language, script and region, without making an
  42. // attempt to infer their values.
  43. func (t Tag) Raw() (b Base, s Script, r Region) {
  44. tt := t.tag()
  45. return Base{tt.LangID}, Script{tt.ScriptID}, Region{tt.RegionID}
  46. }
  47. // IsRoot returns true if t is equal to language "und".
  48. func (t Tag) IsRoot() bool {
  49. return compact.Tag(t).IsRoot()
  50. }
  51. // CanonType can be used to enable or disable various types of canonicalization.
  52. type CanonType int
  53. const (
  54. // Replace deprecated base languages with their preferred replacements.
  55. DeprecatedBase CanonType = 1 << iota
  56. // Replace deprecated scripts with their preferred replacements.
  57. DeprecatedScript
  58. // Replace deprecated regions with their preferred replacements.
  59. DeprecatedRegion
  60. // Remove redundant scripts.
  61. SuppressScript
  62. // Normalize legacy encodings. This includes legacy languages defined in
  63. // CLDR as well as bibliographic codes defined in ISO-639.
  64. Legacy
  65. // Map the dominant language of a macro language group to the macro language
  66. // subtag. For example cmn -> zh.
  67. Macro
  68. // The CLDR flag should be used if full compatibility with CLDR is required.
  69. // There are a few cases where language.Tag may differ from CLDR. To follow all
  70. // of CLDR's suggestions, use All|CLDR.
  71. CLDR
  72. // Raw can be used to Compose or Parse without Canonicalization.
  73. Raw CanonType = 0
  74. // Replace all deprecated tags with their preferred replacements.
  75. Deprecated = DeprecatedBase | DeprecatedScript | DeprecatedRegion
  76. // All canonicalizations recommended by BCP 47.
  77. BCP47 = Deprecated | SuppressScript
  78. // All canonicalizations.
  79. All = BCP47 | Legacy | Macro
  80. // Default is the canonicalization used by Parse, Make and Compose. To
  81. // preserve as much information as possible, canonicalizations that remove
  82. // potentially valuable information are not included. The Matcher is
  83. // designed to recognize similar tags that would be the same if
  84. // they were canonicalized using All.
  85. Default = Deprecated | Legacy
  86. canonLang = DeprecatedBase | Legacy | Macro
  87. // TODO: LikelyScript, LikelyRegion: suppress similar to ICU.
  88. )
  89. // canonicalize returns the canonicalized equivalent of the tag and
  90. // whether there was any change.
  91. func canonicalize(c CanonType, t language.Tag) (language.Tag, bool) {
  92. if c == Raw {
  93. return t, false
  94. }
  95. changed := false
  96. if c&SuppressScript != 0 {
  97. if t.LangID.SuppressScript() == t.ScriptID {
  98. t.ScriptID = 0
  99. changed = true
  100. }
  101. }
  102. if c&canonLang != 0 {
  103. for {
  104. if l, aliasType := t.LangID.Canonicalize(); l != t.LangID {
  105. switch aliasType {
  106. case language.Legacy:
  107. if c&Legacy != 0 {
  108. if t.LangID == _sh && t.ScriptID == 0 {
  109. t.ScriptID = _Latn
  110. }
  111. t.LangID = l
  112. changed = true
  113. }
  114. case language.Macro:
  115. if c&Macro != 0 {
  116. // We deviate here from CLDR. The mapping "nb" -> "no"
  117. // qualifies as a typical Macro language mapping. However,
  118. // for legacy reasons, CLDR maps "no", the macro language
  119. // code for Norwegian, to the dominant variant "nb". This
  120. // change is currently under consideration for CLDR as well.
  121. // See https://unicode.org/cldr/trac/ticket/2698 and also
  122. // https://unicode.org/cldr/trac/ticket/1790 for some of the
  123. // practical implications. TODO: this check could be removed
  124. // if CLDR adopts this change.
  125. if c&CLDR == 0 || t.LangID != _nb {
  126. changed = true
  127. t.LangID = l
  128. }
  129. }
  130. case language.Deprecated:
  131. if c&DeprecatedBase != 0 {
  132. if t.LangID == _mo && t.RegionID == 0 {
  133. t.RegionID = _MD
  134. }
  135. t.LangID = l
  136. changed = true
  137. // Other canonicalization types may still apply.
  138. continue
  139. }
  140. }
  141. } else if c&Legacy != 0 && t.LangID == _no && c&CLDR != 0 {
  142. t.LangID = _nb
  143. changed = true
  144. }
  145. break
  146. }
  147. }
  148. if c&DeprecatedScript != 0 {
  149. if t.ScriptID == _Qaai {
  150. changed = true
  151. t.ScriptID = _Zinh
  152. }
  153. }
  154. if c&DeprecatedRegion != 0 {
  155. if r := t.RegionID.Canonicalize(); r != t.RegionID {
  156. changed = true
  157. t.RegionID = r
  158. }
  159. }
  160. return t, changed
  161. }
  162. // Canonicalize returns the canonicalized equivalent of the tag.
  163. func (c CanonType) Canonicalize(t Tag) (Tag, error) {
  164. // First try fast path.
  165. if t.isCompact() {
  166. if _, changed := canonicalize(c, compact.Tag(t).Tag()); !changed {
  167. return t, nil
  168. }
  169. }
  170. // It is unlikely that one will canonicalize a tag after matching. So do
  171. // a slow but simple approach here.
  172. if tag, changed := canonicalize(c, t.tag()); changed {
  173. tag.RemakeString()
  174. return makeTag(tag), nil
  175. }
  176. return t, nil
  177. }
  178. // Confidence indicates the level of certainty for a given return value.
  179. // For example, Serbian may be written in Cyrillic or Latin script.
  180. // The confidence level indicates whether a value was explicitly specified,
  181. // whether it is typically the only possible value, or whether there is
  182. // an ambiguity.
  183. type Confidence int
  184. const (
  185. No Confidence = iota // full confidence that there was no match
  186. Low // most likely value picked out of a set of alternatives
  187. High // value is generally assumed to be the correct match
  188. Exact // exact match or explicitly specified value
  189. )
  190. var confName = []string{"No", "Low", "High", "Exact"}
  191. func (c Confidence) String() string {
  192. return confName[c]
  193. }
  194. // String returns the canonical string representation of the language tag.
  195. func (t Tag) String() string {
  196. return t.tag().String()
  197. }
  198. // MarshalText implements encoding.TextMarshaler.
  199. func (t Tag) MarshalText() (text []byte, err error) {
  200. return t.tag().MarshalText()
  201. }
  202. // UnmarshalText implements encoding.TextUnmarshaler.
  203. func (t *Tag) UnmarshalText(text []byte) error {
  204. var tag language.Tag
  205. err := tag.UnmarshalText(text)
  206. *t = makeTag(tag)
  207. return err
  208. }
  209. // Base returns the base language of the language tag. If the base language is
  210. // unspecified, an attempt will be made to infer it from the context.
  211. // It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
  212. func (t Tag) Base() (Base, Confidence) {
  213. if b := t.lang(); b != 0 {
  214. return Base{b}, Exact
  215. }
  216. tt := t.tag()
  217. c := High
  218. if tt.ScriptID == 0 && !tt.RegionID.IsCountry() {
  219. c = Low
  220. }
  221. if tag, err := tt.Maximize(); err == nil && tag.LangID != 0 {
  222. return Base{tag.LangID}, c
  223. }
  224. return Base{0}, No
  225. }
  226. // Script infers the script for the language tag. If it was not explicitly given, it will infer
  227. // a most likely candidate.
  228. // If more than one script is commonly used for a language, the most likely one
  229. // is returned with a low confidence indication. For example, it returns (Cyrl, Low)
  230. // for Serbian.
  231. // If a script cannot be inferred (Zzzz, No) is returned. We do not use Zyyy (undetermined)
  232. // as one would suspect from the IANA registry for BCP 47. In a Unicode context Zyyy marks
  233. // common characters (like 1, 2, 3, '.', etc.) and is therefore more like multiple scripts.
  234. // See https://www.unicode.org/reports/tr24/#Values for more details. Zzzz is also used for
  235. // unknown value in CLDR. (Zzzz, Exact) is returned if Zzzz was explicitly specified.
  236. // Note that an inferred script is never guaranteed to be the correct one. Latin is
  237. // almost exclusively used for Afrikaans, but Arabic has been used for some texts
  238. // in the past. Also, the script that is commonly used may change over time.
  239. // It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
  240. func (t Tag) Script() (Script, Confidence) {
  241. if scr := t.script(); scr != 0 {
  242. return Script{scr}, Exact
  243. }
  244. tt := t.tag()
  245. sc, c := language.Script(_Zzzz), No
  246. if scr := tt.LangID.SuppressScript(); scr != 0 {
  247. // Note: it is not always the case that a language with a suppress
  248. // script value is only written in one script (e.g. kk, ms, pa).
  249. if tt.RegionID == 0 {
  250. return Script{scr}, High
  251. }
  252. sc, c = scr, High
  253. }
  254. if tag, err := tt.Maximize(); err == nil {
  255. if tag.ScriptID != sc {
  256. sc, c = tag.ScriptID, Low
  257. }
  258. } else {
  259. tt, _ = canonicalize(Deprecated|Macro, tt)
  260. if tag, err := tt.Maximize(); err == nil && tag.ScriptID != sc {
  261. sc, c = tag.ScriptID, Low
  262. }
  263. }
  264. return Script{sc}, c
  265. }
  266. // Region returns the region for the language tag. If it was not explicitly given, it will
  267. // infer a most likely candidate from the context.
  268. // It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
  269. func (t Tag) Region() (Region, Confidence) {
  270. if r := t.region(); r != 0 {
  271. return Region{r}, Exact
  272. }
  273. tt := t.tag()
  274. if tt, err := tt.Maximize(); err == nil {
  275. return Region{tt.RegionID}, Low // TODO: differentiate between high and low.
  276. }
  277. tt, _ = canonicalize(Deprecated|Macro, tt)
  278. if tag, err := tt.Maximize(); err == nil {
  279. return Region{tag.RegionID}, Low
  280. }
  281. return Region{_ZZ}, No // TODO: return world instead of undetermined?
  282. }
  283. // Variants returns the variants specified explicitly for this language tag.
  284. // or nil if no variant was specified.
  285. func (t Tag) Variants() []Variant {
  286. if !compact.Tag(t).MayHaveVariants() {
  287. return nil
  288. }
  289. v := []Variant{}
  290. x, str := "", t.tag().Variants()
  291. for str != "" {
  292. x, str = nextToken(str)
  293. v = append(v, Variant{x})
  294. }
  295. return v
  296. }
  297. // Parent returns the CLDR parent of t. In CLDR, missing fields in data for a
  298. // specific language are substituted with fields from the parent language.
  299. // The parent for a language may change for newer versions of CLDR.
  300. //
  301. // Parent returns a tag for a less specific language that is mutually
  302. // intelligible or Und if there is no such language. This may not be the same as
  303. // simply stripping the last BCP 47 subtag. For instance, the parent of "zh-TW"
  304. // is "zh-Hant", and the parent of "zh-Hant" is "und".
  305. func (t Tag) Parent() Tag {
  306. return Tag(compact.Tag(t).Parent())
  307. }
  308. // returns token t and the rest of the string.
  309. func nextToken(s string) (t, tail string) {
  310. p := strings.Index(s[1:], "-")
  311. if p == -1 {
  312. return s[1:], ""
  313. }
  314. p++
  315. return s[1:p], s[p:]
  316. }
  317. // Extension is a single BCP 47 extension.
  318. type Extension struct {
  319. s string
  320. }
  321. // String returns the string representation of the extension, including the
  322. // type tag.
  323. func (e Extension) String() string {
  324. return e.s
  325. }
  326. // ParseExtension parses s as an extension and returns it on success.
  327. func ParseExtension(s string) (e Extension, err error) {
  328. ext, err := language.ParseExtension(s)
  329. return Extension{ext}, err
  330. }
  331. // Type returns the one-byte extension type of e. It returns 0 for the zero
  332. // exception.
  333. func (e Extension) Type() byte {
  334. if e.s == "" {
  335. return 0
  336. }
  337. return e.s[0]
  338. }
  339. // Tokens returns the list of tokens of e.
  340. func (e Extension) Tokens() []string {
  341. return strings.Split(e.s, "-")
  342. }
  343. // Extension returns the extension of type x for tag t. It will return
  344. // false for ok if t does not have the requested extension. The returned
  345. // extension will be invalid in this case.
  346. func (t Tag) Extension(x byte) (ext Extension, ok bool) {
  347. if !compact.Tag(t).MayHaveExtensions() {
  348. return Extension{}, false
  349. }
  350. e, ok := t.tag().Extension(x)
  351. return Extension{e}, ok
  352. }
  353. // Extensions returns all extensions of t.
  354. func (t Tag) Extensions() []Extension {
  355. if !compact.Tag(t).MayHaveExtensions() {
  356. return nil
  357. }
  358. e := []Extension{}
  359. for _, ext := range t.tag().Extensions() {
  360. e = append(e, Extension{ext})
  361. }
  362. return e
  363. }
  364. // TypeForKey returns the type associated with the given key, where key and type
  365. // are of the allowed values defined for the Unicode locale extension ('u') in
  366. // https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
  367. // TypeForKey will traverse the inheritance chain to get the correct value.
  368. //
  369. // If there are multiple types associated with a key, only the first will be
  370. // returned. If there is no type associated with a key, it returns the empty
  371. // string.
  372. func (t Tag) TypeForKey(key string) string {
  373. if !compact.Tag(t).MayHaveExtensions() {
  374. if key != "rg" && key != "va" {
  375. return ""
  376. }
  377. }
  378. return t.tag().TypeForKey(key)
  379. }
  380. // SetTypeForKey returns a new Tag with the key set to type, where key and type
  381. // are of the allowed values defined for the Unicode locale extension ('u') in
  382. // https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
  383. // An empty value removes an existing pair with the same key.
  384. func (t Tag) SetTypeForKey(key, value string) (Tag, error) {
  385. tt, err := t.tag().SetTypeForKey(key, value)
  386. return makeTag(tt), err
  387. }
  388. // NumCompactTags is the number of compact tags. The maximum tag is
  389. // NumCompactTags-1.
  390. const NumCompactTags = compact.NumCompactTags
  391. // CompactIndex returns an index, where 0 <= index < NumCompactTags, for tags
  392. // for which data exists in the text repository.The index will change over time
  393. // and should not be stored in persistent storage. If t does not match a compact
  394. // index, exact will be false and the compact index will be returned for the
  395. // first match after repeatedly taking the Parent of t.
  396. func CompactIndex(t Tag) (index int, exact bool) {
  397. id, exact := compact.LanguageID(compact.Tag(t))
  398. return int(id), exact
  399. }
  400. var root = language.Tag{}
  401. // Base is an ISO 639 language code, used for encoding the base language
  402. // of a language tag.
  403. type Base struct {
  404. langID language.Language
  405. }
  406. // ParseBase parses a 2- or 3-letter ISO 639 code.
  407. // It returns a ValueError if s is a well-formed but unknown language identifier
  408. // or another error if another error occurred.
  409. func ParseBase(s string) (Base, error) {
  410. l, err := language.ParseBase(s)
  411. return Base{l}, err
  412. }
  413. // String returns the BCP 47 representation of the base language.
  414. func (b Base) String() string {
  415. return b.langID.String()
  416. }
  417. // ISO3 returns the ISO 639-3 language code.
  418. func (b Base) ISO3() string {
  419. return b.langID.ISO3()
  420. }
  421. // IsPrivateUse reports whether this language code is reserved for private use.
  422. func (b Base) IsPrivateUse() bool {
  423. return b.langID.IsPrivateUse()
  424. }
  425. // Script is a 4-letter ISO 15924 code for representing scripts.
  426. // It is idiomatically represented in title case.
  427. type Script struct {
  428. scriptID language.Script
  429. }
  430. // ParseScript parses a 4-letter ISO 15924 code.
  431. // It returns a ValueError if s is a well-formed but unknown script identifier
  432. // or another error if another error occurred.
  433. func ParseScript(s string) (Script, error) {
  434. sc, err := language.ParseScript(s)
  435. return Script{sc}, err
  436. }
  437. // String returns the script code in title case.
  438. // It returns "Zzzz" for an unspecified script.
  439. func (s Script) String() string {
  440. return s.scriptID.String()
  441. }
  442. // IsPrivateUse reports whether this script code is reserved for private use.
  443. func (s Script) IsPrivateUse() bool {
  444. return s.scriptID.IsPrivateUse()
  445. }
  446. // Region is an ISO 3166-1 or UN M.49 code for representing countries and regions.
  447. type Region struct {
  448. regionID language.Region
  449. }
  450. // EncodeM49 returns the Region for the given UN M.49 code.
  451. // It returns an error if r is not a valid code.
  452. func EncodeM49(r int) (Region, error) {
  453. rid, err := language.EncodeM49(r)
  454. return Region{rid}, err
  455. }
  456. // ParseRegion parses a 2- or 3-letter ISO 3166-1 or a UN M.49 code.
  457. // It returns a ValueError if s is a well-formed but unknown region identifier
  458. // or another error if another error occurred.
  459. func ParseRegion(s string) (Region, error) {
  460. r, err := language.ParseRegion(s)
  461. return Region{r}, err
  462. }
  463. // String returns the BCP 47 representation for the region.
  464. // It returns "ZZ" for an unspecified region.
  465. func (r Region) String() string {
  466. return r.regionID.String()
  467. }
  468. // ISO3 returns the 3-letter ISO code of r.
  469. // Note that not all regions have a 3-letter ISO code.
  470. // In such cases this method returns "ZZZ".
  471. func (r Region) ISO3() string {
  472. return r.regionID.ISO3()
  473. }
  474. // M49 returns the UN M.49 encoding of r, or 0 if this encoding
  475. // is not defined for r.
  476. func (r Region) M49() int {
  477. return r.regionID.M49()
  478. }
  479. // IsPrivateUse reports whether r has the ISO 3166 User-assigned status. This
  480. // may include private-use tags that are assigned by CLDR and used in this
  481. // implementation. So IsPrivateUse and IsCountry can be simultaneously true.
  482. func (r Region) IsPrivateUse() bool {
  483. return r.regionID.IsPrivateUse()
  484. }
  485. // IsCountry returns whether this region is a country or autonomous area. This
  486. // includes non-standard definitions from CLDR.
  487. func (r Region) IsCountry() bool {
  488. return r.regionID.IsCountry()
  489. }
  490. // IsGroup returns whether this region defines a collection of regions. This
  491. // includes non-standard definitions from CLDR.
  492. func (r Region) IsGroup() bool {
  493. return r.regionID.IsGroup()
  494. }
  495. // Contains returns whether Region c is contained by Region r. It returns true
  496. // if c == r.
  497. func (r Region) Contains(c Region) bool {
  498. return r.regionID.Contains(c.regionID)
  499. }
  500. // TLD returns the country code top-level domain (ccTLD). UK is returned for GB.
  501. // In all other cases it returns either the region itself or an error.
  502. //
  503. // This method may return an error for a region for which there exists a
  504. // canonical form with a ccTLD. To get that ccTLD canonicalize r first. The
  505. // region will already be canonicalized it was obtained from a Tag that was
  506. // obtained using any of the default methods.
  507. func (r Region) TLD() (Region, error) {
  508. tld, err := r.regionID.TLD()
  509. return Region{tld}, err
  510. }
  511. // Canonicalize returns the region or a possible replacement if the region is
  512. // deprecated. It will not return a replacement for deprecated regions that
  513. // are split into multiple regions.
  514. func (r Region) Canonicalize() Region {
  515. return Region{r.regionID.Canonicalize()}
  516. }
  517. // Variant represents a registered variant of a language as defined by BCP 47.
  518. type Variant struct {
  519. variant string
  520. }
  521. // ParseVariant parses and returns a Variant. An error is returned if s is not
  522. // a valid variant.
  523. func ParseVariant(s string) (Variant, error) {
  524. v, err := language.ParseVariant(s)
  525. return Variant{v.String()}, err
  526. }
  527. // String returns the string representation of the variant.
  528. func (v Variant) String() string {
  529. return v.variant
  530. }