From f230a41a16e942ebf4866167c1f306246ca847ae Mon Sep 17 00:00:00 2001 From: Pieter Viljoen Date: Fri, 26 Jun 2026 09:10:48 -0700 Subject: [PATCH 1/3] Add UN M.49 region containment support Resolve #175 - match a UN M.49 region group against a contained region, e.g. es-419 (Latin America) matches es-MX (Mexico). - Add UnM49Data sourced from the CLDR territoryContainment data, parsed with XmlReader to keep the library AOT compatible, generated and embedded following the existing dataset pattern. - Add LanguageLookup.IsMatch(prefix, tag, regionContainment) as an opt-in overload; the existing two argument IsMatch is unchanged. - Add LanguageLookup.ExpandRegion to expand a region into its containing UN M.49 groups. - Fix ValidateExtendedLanguage to require 3 alpha so a numeric region following the language parses as a region, e.g. es-419. - Restore previously trimmed control flow comments in the parser and lookup. - Update README and HISTORY, add UN M.49 references, bump version to 1.4. Co-Authored-By: Claude Opus 4.8 (1M context) --- HISTORY.md | 7 + LanguageData/unm49 | 5574 ++++++++++++++++++++++ LanguageData/unm49.json | 734 +++ LanguageTags/LanguageLookup.cs | 105 +- LanguageTags/LanguageSchema.cs | 1 + LanguageTags/LanguageTagParser.cs | 46 +- LanguageTags/UnM49Data.cs | 417 ++ LanguageTags/UnM49DataGen.cs | 569 +++ LanguageTagsCreate/CreateTagData.cs | 21 + LanguageTagsTests/LanguageLookupTests.cs | 47 + LanguageTagsTests/UnM49Tests.cs | 131 + README.md | 117 +- version.json | 2 +- 13 files changed, 7718 insertions(+), 53 deletions(-) create mode 100644 LanguageData/unm49 create mode 100644 LanguageData/unm49.json create mode 100644 LanguageTags/UnM49Data.cs create mode 100644 LanguageTags/UnM49DataGen.cs create mode 100644 LanguageTagsTests/UnM49Tests.cs diff --git a/HISTORY.md b/HISTORY.md index 1081db8..be3dc7c 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,6 +4,13 @@ C# .NET library for ISO 639-2, ISO 639-3, RFC 5646 / BCP 47 language tags. 
## Release History +- Version 1.4: + - Added UN M.49 region containment support sourced from Unicode CLDR. + - Added `LanguageLookup.IsMatch(prefix, tag, regionContainment)` so a UN M.49 region group matches a contained region, e.g. `es-419` matches `es-MX`. + - Added `LanguageLookup.ExpandRegion()` to expand a region into its containing UN M.49 groups. + - Fixed parsing of a numeric region following the language, e.g. `es-419` now parses `419` as a region not an extended language. +- Version 1.3: + - Dependency, codegen, CI, and project template maintenance. - Version 1.2: - Refactored the project to follow standard patterns used across other projects. - Added logging support configured through `LogOptions.SetFactory(ILoggerFactory)`. diff --git a/LanguageData/unm49 b/LanguageData/unm49 new file mode 100644 index 0000000..e26ef2f --- /dev/null +++ b/LanguageData/unm49 @@ -0,0 +1,5574 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + und + hu ja km ko mn si ta te vi yue zh + + + + Dutch official + At most 6% are not fluent in English + Precise data not available, added so Balinese script defaults to Balinese + While Cyrillic is customary, the vast majority of the population can read both.For languages not customarily written, the writing populiation is artificially set to 5% in the absence of better information. + The figure includes 'Vlaams' population from Ethnologue + It is estimated that Walloon is used actively by 10-20% of the total population of Wallonia or between 300,000 and 600,000 people. For languages not customarily written, the writing population is artificially set to 5% in the absence of better information. 
+ Precise data not available + Arabic official, the figure is derived from literacy * lang pop + Spanish is the official language, only about 60-70% of the population speaks it at all ; + English official, 81% literacy; the figure is derived from literacy * lang pop + [missing] + Ethnologue: 350k in CAF + 1.6 million 2nd lang speakers + Corsican has been recognized as a language by the French government. Speakers also use French but many are not fluent in it. For languages not customarily written, the writing population is artificially set to 5% + English 1/5 of pop, used 1/5 of pop * literacy rate + Spanish official + Languedocien = Occitan 'Everyone speaks French as first or second language.' For languages not customarily written, the writing population is artificially set to 5% + 100k+ native, plus 1.5 mil 2nd lang speakers. For languages not customarily written, the writing population is artificially set to 5% in the absence of better information. + For languages not customarily written, the writing population is artificially set to 5% in the absence of better information. + English official; the figure is derived from literacy * lang pop + Regelmässig verwendete Sprachen - Percent of people that regularly use the language; literacy is mostly in standard German. For languages not customarily written, the writing population is artificially set to 5% in the absence of better information. + [missing] + Actually literacy in Nko writing unknown but historically they used the Latin script + English official, the figure is derived from literacy * lang pop + Actually literacy in Nko writing unknown + Some 99% of users are literate in French or German. For languages not customarily written, the writing population is artificially set to 5% in the absence of better information. + 2nd lang literacy 15-25% + Nearly all speakers are literate in a 2nd language. 
For languages not customarily written, the writing population is artificially set to 5% + Many minor langs; Portuguese official + In this and other sources, such as Ethnologue, there is no estimate for number of users. http://en.wikipedia.org/wiki/Filipino_language http://www.ethnologue.com/show_language.asp?code=fil + Most of the population uses Creole; see also http://www.country-studies.com/haiti/creole,-literacy,-and-education.html http://en.wikipedia.org/wiki/French_language#Haiti + [missing] + Official language, 37-77% literacy + Official language, used in some schools. + http://www.censusindia.net/cendat/datatable26.html + 25% of pop + - Icelandic official + says: All Jordanians, regardless of ethnicity or religion, speak Arabic, the official language of Jordan + English official; Kiribati widespread + [missing] + German official + 2020 Russian Census + 2022 Census + Used CIA literacy figure times population, added 'Vlaams' population + [missing] + 70,000 in 1991, 100,000 who understand it, but do not speak it ; ethnic pop 530,000 in 2002 + Melanesian pidgin in much of the country is lingua franca; English (official; but spoken by only 1%-2% of the population); 120 indigenous languages + English 20% + Lesotho English-using pop estimated at 5%, no figs available. Probably too low. + [missing] + Official language. Probably 2% of the population from East Timor worldwide can function in it + Ethnologue says 80k users of French. No other figures found yet, but this seems too low. + Russian 5.8%. + The figure is from Wikipedia article on English-speaking populations + Albanian 25.1% + English is an official language, not widely spoken + 42.6% of population + [missing] + 4mil 2nd lang speakers, 120k 1st lang, 20k monolinguals. English creole; 40-45% literacy. + A pidginizatino of Motu; 120k 2nd lang speakers, very few 1st lang. 
+ English official on some islands, total 9.4% + http://astro.uchicago.edu/cara/vtour/mcmurdo/ http://www.usap.gov/videoclipsandmaps/mcmwebcam.cfm Winter population is listed. + 1.2mil 1st lang + 240k 2nd lang users, low literacy + Has rotating Norwegian population at Norvegia Station + http://www.mavicanet.com/directory/eng/2436.html + Uninhabited, barren, sub-Antarctic islands + [missing] + Figure for Hindi includes 2nd language users, India Census data. + [missing] + CIA Factbook entry on Kazakhstan + 50k Europeans, mostly French. The figure for writing population is derived from literacy * population, and may be too high. + The figure is from Wikipedia article on http://en.wikipedia.org/wiki/List_of_countries_by_English-speaking_population The figure is from Wikipedia article on English-speaking populations + [missing] + The figure is from Wikipedia article on English-speaking populations + Precise data not available -- listed with 2 speakers as a tie-breaker + CIA Factbook + [missing] + CIA Factbook. See also http://www.jsmp.minihub.org/Reports/jsmpreports/Language%20Report/LanguageReport(english).pdf + CIA Factbook. + The Tonga Chronicle is a government-owned newspaper... It publishes two editions, one in Tongan with a circulation of 5,000, and one in English with a circulation of 1,500; Writing pop figure shown for English is set to 30% of that for Tonga. + 96% bilingual in Turkish. + The Tuvaluan language is spoken by virtually everyone, while Gilbertese is spoken by some people on Nui. English is also an official language, but is not spoken in daily use. Writing pop set to 10% of Tuvalu. + English (official, primary language of commerce, administration, and higher education) + Ethnologue lists 1 million 2nd lang users of English; no other good figures found. 
+ also: http://en.wikipedia.org/wiki/Bosnian_language + [missing] + 2021 Census, counting people who are fluent in the language + 5% writing pop estimated in absence of other data + [missing] + Crude estimate based on import partner data. + [missing] + More than 80 % of the total Thai population speaks the native Thai language. + [missing] + [missing] + [missing] + (= Tai Lu, Xishuangbanna Dai; New Tai Lue script) + [missing] + [missing] + Estimates Indian ethnic 44% ; see also http://en.wikipedia.org/wiki/Non-resident_Indian_and_Person_of_Indian_Origin and http://www.vanuatu.usp.ac.fj/paclangunit/English_South_Pacific.htm + [missing] + [missing] + [missing] + [missing] + Deva is the official script for sd in India; set to 55%. Arab, Guru, Khoj also used. + The lingua franca of 80% of the population + 2016 Census + and https://en.wikipedia.org/wiki/Mru_language + - More than 95% of Pakistanis can speak or understand Urdu as their second or third language + [missing] + http://www.stoletie.ru/vzglyad/derusifikacija_nabirajet_oboroty_934.htm + [missing] + [missing] + [missing] + [missing] + US 2005 census + [missing] + [missing] + CIA Factbook lists spoken language, the entry for Bokmål only on Svalbard and Jan Mayan is an assumption. + http://www.bfs.admin.ch/bfs/portal/de/index/infothek/lexikon/bienvenue___login/blank/zugang_lexikon.Document.62669.xls + No literacy figure available for English in Madagascar; newly adopted official language; 5% is an estimate. + - the script is an assumption, needs a reference + Latin script official, used 98.8% of pop * 10% for the usage figure + Latin script official, used 98.8% of pop * 90% for the usage figure + - five eastern provinces of the DRC are Swahili speaking. Nearly half the 66 million Congolese speak it. 
+ [missing] + 2022 Census number of people in Ethnic group + [missing] + [missing] + - Most educated Kenyans are able to communicate fluently in Swahili, since it is a compulsory subject in school + [missing] + [missing] + 2019 Belarus Census + 2024 idioma hablado la mayor parte del tiempo + Organisation internationale de la Francophonie Meta-study. Data from 2012 and 2016 Eurostat studies on first and second language usage across Europe + - 90 percent of approximately 39 million Tanzanians speak Swahili + - Baganda generally don't speak Swahili, but it is in common use among the 25 million people elsewhere in the country, and is currently being implemented in schools nationwide (use 75% of Cpop for this figure) + [missing] + [missing] + Salminen, T. (2007). Europe and North Asia. In Encyclopedia of the world’s endangered languages (pp. 211-280). Routledge. + http://www.ofis-bzh.org/fr/langue_bretonne/chiffres_cles/index.php France blocks other languages in state schools; 1.4% attended Breton schools and 3% is estimated as family transmission rate + 15.8% of population + The 2008 estimate is ~2000 speakers due to revival efforts + [missing] + The great majority of Equatorial Guineans speak Spanish, especially those living in the capital, Malabo. Spanish has been an official language since 1844. + Hans literacy is unknown; set to 5% artificially pending better or official figures. + http://www.statemaster.com/encyclopedia/Balinese-language widely used; taught in school as a main lang + widely used in its cultural areas, often in Latin script + http://www.indianetzone.com/7/haryanvi.htm little literature mostly folksongs; writers use std Hindi; claim of 55% literacy + 2nd lang literacy 25-50%, taught formally + 5% writing pop estimated in absence of other data; literacy rate reported at 12% + 5% writing pop estimated in absence of other data; literacy rate reported at ~8% + No estimate available. 
+ 5% writing pop estimated in absence of other data; Japanese is lingua franca here + [missing] + Data completely unknown for Hausa in Arabic in Nigeria + almost all speakers bilingual in English + Pop decline to ~1398 in 2009 + Low literacy, high linguistic diversity; English official (govt) but not widely spoken + [missing] + Spoken by 70% of population, assumed to use Arabic script in Pakistan + Reported to be (regional) official in Chuvashia, central Russia: taught at schools. However: http://cv.wikipedia.org/ Chuvash Wikipedia on-line. + [missing] + 2022 Belize Census + 'A lingua franca and a first language for 10% of the population but understood by 95%' http://en.wikipedia.org/wiki/Krio_language + Dutch is spoken as a mother tongue by about 60% of the Surinamese, while most others speak it as a second or third language. + main language of trade and comm. in Isan region, except ... media where it gives way to Thai; now largely an unwritten language. 10% writing pop estimated in absence of other data + - primarily written using an Arabic-derived alphabet + and https://islandstudies.com/files/2016/11/Guernsey-Herm-Sark.pdf - extrapolated GDP from per capita x population + understood by 10 million, perhaps. Figure is questionable writing pop artificially set to 5% see also: http://en.wikipedia.org/wiki/Low_German (understood by 10 million people, and native to about 3 million people all around northern Germany) + 2018 Census, counting both maternal and secondary language usage + See the 2006 language survey data for 2nd langs = Shimaore + 2018 Census, counting both maternal and secondary language usage. Co-official in Sacatepéquez + Common lingua franca, widely used. High literacy. + but subtracting 270,000 per https://en.wikipedia.org/wiki/Swiss_Italian + [missing] + [missing] + [missing] + 98.8% speak Spanish. 
Also, https://www.cia.gov/library/publications/the-world-factbook/geos/sp.html + [missing] + Colony of France but uninhabited + No indigenous inhabitants. http://en.wikipedia.org/wiki/British_Indian_Ocean_Territory + Many also use Swahili + Latin is not shown as being used, rather Arabic + Used in schools up to University. + Afrobarometer (R10 2024/2025) + No indigenous inhabitants. http://en.wikipedia.org/wiki/French_Southern_Territories + 2022 Census language spoken at home + Shows 50% literacy + Most also use Swahili with 50% literacy. Only 5% monolingual. + Most also use Swahili + [missing] + basically unihabited, officially ; http://www.census.gov/prod/cen2000/phc3-us-pt1.pdf + http://lanic.utexas.edu/project/tilan/reports/rtf359/bolivia1.html Spanish is the official language, only about 60-70% of the population speaks it at all ; + Spanish ""universal"", set to 98% + https://www.cia.gov/library/publications/the-world-factbook/geos/cs.html + [missing] + [missing] + Sirmauri (srx) Mahasui = Himachali, Pahari, Sirmouri, Sirmuri + - 14k reported as native. Taught as elective subject in grades 5-8; not widely spoken as primary communication. 
+ [missing] + [missing] + [missing] + native speaker pop is low, ~6200; but is most widely spoken 2nd language + [missing] + [missing] + This number is the literacy rate (the number of speakers is actually smaller) + Europeans and their languages survey, page 7 + http://ec.europa.eu/public_opinion/archives/ebs/ebs_243_en.pdf Europeans and their languages survey, page 7 + [missing] + 1st lang literacy 8% + low literacy + percentage calculated from http://www.spanishcourses.info/Mains/SpanishSpoken_EN.htm , see also http://www.spanishseo.org/resources/worldwide-spanish-speaking-population + [missing] + www.amar.org.ir + - regional lang community status, taught in some schools + [missing] + http://en.wikipedia.org/wiki/Interlingua#Community Has a regular conf in Sweden, also Brazil; an auxiliary language with tiny population worldwide + This is base pop for """"""""""""""""""""""""""""""""fub"""""""""""""""""""""""""""""""" lang code; ff shows as a macrolanguage + [missing] + (could be higher if 2nd lang included; no data yet) + [missing] + [missing] + [missing] + pop 7k. Figure is questionable writing pop artificially set to 5% see also http://en.wikipedia.org/wiki/Lower_Sorbian + Tigrinya ethnic pop is about 60% + [missing] + English official in education, 36.1% 2000 census + no other info available for now + from 2018, LECLERC, Jacques. «Danemark» dans L’aménagement linguistique dans le monde, Québec, CEFAN, Université Laval + language also called Kamta in India + Modern use of Arabic (Jawi) seems to be minimal, but is co-official with ms; set to 5% for now. 
+ [missing] + [missing] + [missing] + [missing] + - source for GDP + - source for GDP Level of English usage unclear, but official for govt and education + - estimate 90%of literate pop can use Arabic; Lpop = 99% + http://en.wikipedia.org/wiki/South_Sudan + [missing] + low literacy and >120 langs in country + (used lower figure) + 25-50% literacy + literacy 15-25% + 30% literacy + 2nd lang literacy 30% + 2nd lang literacy 25-50% + [missing] + protected minority, southern Jutland + etimate only based on literacy; no population data currently available + population figure from CLDR-17483 ticket + No Data Available at present. + co-official in South Tyrol + 2018 Census, counting both maternal and secondary language usage. Co-official in Quiché and Totonicapán + in Trieste and Gorizia + [missing] + Information on the Latin/Cyrillic script percentages for Montenegro not currently found. + most of population use Afrikaans commonly, about 89% literacy + CIA Factbook entry on Kazakhstan http://windowoneurasia2.blogspot.com/2013/12/window-on-eurasia-de-russianization.html http://www.stoletie.ru/vzglyad/derusifikacija_nabirajet_oboroty_934.htm http://www.stoletie.ru/vzglyad/derusifikacija_nabirajet_oboroty_934.htm + , Podlaskie Voivodeship + official in Vojvodina only + official in Vojvodina only; no pop data yet found + 2023 Census, Mothertongue + [missing] + https://www.cia.gov/library/publications/the-world-factbook/geos/uz.html Latin/Cyrillic balance is estimated, based on literacy; younger education now in Latin + Information on the Latin/Cyrillic script percentages for Kosovo not currently found. 
+ Estimate based on 90% of literate pop > 15 years (71% of Cpop) can use English, for lack of official number of users + low litreracy ~5% + 2nd lang literacy 30% + http://en.wikipedia.org/wiki/Akademio_Internacia_de_la_Sciencoj_San_Marino - estimate 100% of the academy can use Esperanto; the language is used as 1st language of instruction; academy has 300 """"""""""""""""""""""""""""""""members"""""""""""""""""""""""""""""""". + recognized in West Java + Mainly unwritten + 2018 Census, counting both maternal and secondary language usage. Co-official in Quiché + Latin listed as being used (Scriptsource) but no pop figures available. + 2011 Census -- the language is not distinguished in the 2021 census + but no literacy data + Including 1st and 2nd lang speakers + [missing] + regional-official in part of Opole Voivodeship; in Poland 325 schools with primary instr in German, estimate 37000 students. Real figure probably higher. + Census figures cited there seem to put Armenian using pop between 50-75%. Using 50%. + 2020 Russian Census Spoken language, as first language or later acquired + unknown literacy + only 10% monolingual + near zero literacy; pop ~80000 (2009) see David Lawrence, Tanzania and its People, page 121, Google books + (baseline) + No population figure yet on use of Latin in Vatican. Estimate 100% of Vatican residents can use Latin. + 2010 Census: Widely Spoken Language of Communication + No figures available for this language. Estimating at 5%. + [missing] + [missing] + [missing] + - near-zero Azeri population in last census http://en.wikipedia.org/wiki/Azerbaijanis_in_Armenia#Current_situation + No figures available for breakdown of Latin vs. N'Ko for Bambara. The 2% figure is an estimate. + pop 13k. 
Figure is questionable writing pop artificially set to 5% see also http://en.wikipedia.org/wiki/Upper_Sorbian + French mostly used in commerce + Indonesia high literacy; low written use of local languages + - est 50% pop of Veneto area + 5% mainly spoken + [missing] + ​http://www.interlingua.com/statutos leading Interlingua assoc (Union Mundial pro Interlingua) registered French non-profit - real user pop figure is unknown but low + [missing] + Moribund language + [missing] + Estimated. See http://en.wikipedia.org/wiki/Emilian_language + Estimate not available. + [missing] + [missing] + [missing] + Also called Moré + Newly designated official, not so widely used + [missing] + [missing] + syr is a macrolang containing cld and aii) + [missing] + [missing] + [missing] + [missing] + [missing] + No hard figures for this yet, so this is a placeholder figure. + Widely spoken less written, and most speakers know standard German as well + [missing] + and https://www.ethnologue.com/language/yue + [missing] + [missing] + [missing] + [missing] + [missing] + [missing] + [missing] + Mainly in Guangdong Prov, ~70-80 million. Script unspecified so both listed + 2018 Census, counting both maternal and secondary language usage. Co-official in Chiquimula + Analyzed from 2011 UK census and other sources + 2018 Census, counting both maternal and secondary language usage. Co-official in Suchitepéquez + 2014 Maldives: 98% literacy in Divehi, 75% in English + [missing] + [missing] + Greek population in Russia -- most ancestrally used Pontic Greek -- modern usage almost certainly has dropped off but we don't have clear statistics on current usage. + [missing] + Lower estimate of Coptic population, actual language literacy unknown + [missing] + Organisation internationale de la Francophonie Meta-study. Data from 2013 Census + Organisation internationale de la Francophonie Meta-study. 
Data from IVQ survey in 2009 + 2021 Census Knowledge of Language + Organisation internationale de la Francophonie Meta-study. Data from 2014 study + Organisation internationale de la Francophonie Meta-study. Data from 1994 study + Organisation internationale de la Francophonie Meta-study. Data from 2009 and 2012 studies + Regelmässig verwendete Sprachen - Percent of people that regularly use the language + Latin alphabet usage for Kurdish also present but actual amount unknown + Organisation internationale de la Francophonie Meta-study. Data from 2014 census + Organisation internationale de la Francophonie Meta-study. Data from 2010 questionnaire + Organisation internationale de la Francophonie Meta-study. Data from 2008 Census + Organisation internationale de la Francophonie Meta-study. Data from 2017 survey from Gabon authorities + Organisation internationale de la Francophonie Meta-study. Data from 2011 IVQ survey + Organisation internationale de la Francophonie Meta-study. Data from 2005 Study + Organisation internationale de la Francophonie Meta-study. Data from 2012 Census + Organisation internationale de la Francophonie Meta-study. Data from 2014 questionnaire + Organisation internationale de la Francophonie Meta-study. Data from 2003 census + Organisation internationale de la Francophonie Meta-study. Data from 2012, mixed methods + Organisation internationale de la Francophonie Meta-study. Data from 2014 Census + Organisation internationale de la Francophonie Meta-study. Data from 2017 questionnaire + Organisation internationale de la Francophonie Meta-study. Data from 2018 census + Organisation internationale de la Francophonie Meta-study. Data from 2009 census + Organisation internationale de la Francophonie Meta-study. Data from IVQ survey in 2014 + Organisation internationale de la Francophonie Meta-study. Data from 2009 Census + Organisation internationale de la Francophonie Meta-study. 
Data from 2012 census + Organisation internationale de la Francophonie Meta-study. Data from 2017 Census + Organisation internationale de la Francophonie Meta-study. Data from 2018 Census + Organisation internationale de la Francophonie Meta-study. Data from IVQ survey in 2007 + Organisation internationale de la Francophonie Meta-study. Data from 2013 census. Literacy is based on the language of instruction + Organisation internationale de la Francophonie Meta-study. Data from 2014 + Organisation internationale de la Francophonie Meta-study. Data from 2010 census + Organisation internationale de la Francophonie Meta-study. Data from 2007 Census + 1998 SIL study, cited in Ethnologue + from Instituto Cervantes 2021 + from 2013 Honduras census + Canada 2021 Census language 'Knowledge of Language'; official status from Wikipedia Languages_of_Canada + Regis, Riccardo. 'Su pianificazione, standardizzazione, polinomia: due esempi' Zeitschrift für romanische Philologie, vol. 128, no. 1, 2012, pp. 88-133. + Number & script usage hard to pin down because of many speakers in contested Nagorno Karabakh region. + Latin alphabet usage also present but exact breakdown unknown + Cyrillic usage for Kurdish may no longer be as dominant but it used to be + citation from 2016 + 2026 citation + Citation from 2016 + [missing] + [missing] + [missing] + [missing] + [missing] + [missing] + Leclerc (2014) + [missing] + [missing] + 2011 estimate, people that speak it as their first, second or third language. 
http://www.censusindia.gov.in/2011census/C-17.html + 1994 estimate in https://www.ethnologue.com/country/mm/languages + [missing] + + diff --git a/LanguageData/unm49.json b/LanguageData/unm49.json new file mode 100644 index 0000000..f579990 --- /dev/null +++ b/LanguageData/unm49.json @@ -0,0 +1,734 @@ +{ + "RecordList": [ + { + "Code": "001", + "Contains": [ + "019", + "002", + "150", + "142", + "009" + ] + }, + { + "Code": "001", + "Contains": [ + "EU", + "EZ", + "UN" + ] + }, + { + "Code": "011", + "Contains": [ + "BF", + "BJ", + "CI", + "CV", + "GH", + "GM", + "GN", + "GW", + "LR", + "ML", + "MR", + "NE", + "NG", + "SH", + "SL", + "SN", + "TG" + ] + }, + { + "Code": "013", + "Contains": [ + "BZ", + "CR", + "GT", + "HN", + "MX", + "NI", + "PA", + "SV" + ] + }, + { + "Code": "014", + "Contains": [ + "BI", + "DJ", + "ER", + "ET", + "IO", + "KE", + "KM", + "MG", + "MU", + "MW", + "MZ", + "RE", + "RW", + "SC", + "SO", + "SS", + "TF", + "TZ", + "UG", + "YT", + "ZM", + "ZW" + ] + }, + { + "Code": "142", + "Contains": [ + "145", + "143", + "030", + "034", + "035" + ] + }, + { + "Code": "143", + "Contains": [ + "TM", + "TJ", + "KG", + "KZ", + "UZ" + ] + }, + { + "Code": "145", + "Contains": [ + "AE", + "AM", + "AZ", + "BH", + "CY", + "GE", + "IL", + "IQ", + "JO", + "KW", + "LB", + "OM", + "PS", + "QA", + "SA", + "SY", + "TR", + "YE" + ] + }, + { + "Code": "015", + "Contains": [ + "DZ", + "EG", + "EH", + "LY", + "MA", + "SD", + "TN", + "EA", + "IC" + ] + }, + { + "Code": "150", + "Contains": [ + "154", + "155", + "151", + "039" + ] + }, + { + "Code": "151", + "Contains": [ + "BG", + "BY", + "CZ", + "HU", + "MD", + "PL", + "RO", + "RU", + "SK", + "UA" + ] + }, + { + "Code": "154", + "Contains": [ + "GG", + "IM", + "JE", + "AX", + "DK", + "EE", + "FI", + "FO", + "GB", + "IE", + "IS", + "LT", + "LV", + "NO", + "SE", + "SJ", + "CQ" + ] + }, + { + "Code": "155", + "Contains": [ + "AT", + "BE", + "CH", + "DE", + "FR", + "LI", + "LU", + "MC", + "NL" + ] + }, + { + "Code": "017", 
+ "Contains": [ + "AO", + "CD", + "CF", + "CG", + "CM", + "GA", + "GQ", + "ST", + "TD" + ] + }, + { + "Code": "018", + "Contains": [ + "BW", + "LS", + "NA", + "SZ", + "ZA" + ] + }, + { + "Code": "019", + "Contains": [ + "021", + "013", + "029", + "005" + ] + }, + { + "Code": "019", + "Contains": [ + "003", + "419" + ] + }, + { + "Code": "002", + "Contains": [ + "015", + "011", + "017", + "014", + "018" + ] + }, + { + "Code": "002", + "Contains": [ + "202" + ] + }, + { + "Code": "202", + "Contains": [ + "011", + "017", + "014", + "018" + ] + }, + { + "Code": "021", + "Contains": [ + "BM", + "CA", + "GL", + "PM", + "US" + ] + }, + { + "Code": "029", + "Contains": [ + "AG", + "AI", + "AW", + "BB", + "BL", + "BQ", + "BS", + "CU", + "CW", + "DM", + "DO", + "GD", + "GP", + "HT", + "JM", + "KN", + "KY", + "LC", + "MF", + "MQ", + "MS", + "PR", + "SX", + "TC", + "TT", + "VC", + "VG", + "VI" + ] + }, + { + "Code": "003", + "Contains": [ + "021", + "013", + "029" + ] + }, + { + "Code": "030", + "Contains": [ + "CN", + "HK", + "JP", + "KP", + "KR", + "MN", + "MO", + "TW" + ] + }, + { + "Code": "035", + "Contains": [ + "BN", + "ID", + "KH", + "LA", + "MM", + "MY", + "PH", + "SG", + "TH", + "TL", + "VN" + ] + }, + { + "Code": "039", + "Contains": [ + "AD", + "AL", + "BA", + "ES", + "GI", + "GR", + "HR", + "IT", + "ME", + "MK", + "MT", + "RS", + "PT", + "SI", + "SM", + "VA", + "XK" + ] + }, + { + "Code": "419", + "Contains": [ + "013", + "029", + "005" + ] + }, + { + "Code": "005", + "Contains": [ + "AR", + "BO", + "BR", + "BV", + "CL", + "CO", + "EC", + "FK", + "GF", + "GS", + "GY", + "PE", + "PY", + "SR", + "UY", + "VE" + ] + }, + { + "Code": "053", + "Contains": [ + "AU", + "CC", + "CX", + "HM", + "NF", + "NZ" + ] + }, + { + "Code": "054", + "Contains": [ + "FJ", + "NC", + "PG", + "SB", + "VU" + ] + }, + { + "Code": "057", + "Contains": [ + "FM", + "GU", + "KI", + "MH", + "MP", + "NR", + "PW", + "UM" + ] + }, + { + "Code": "061", + "Contains": [ + "AS", + "CK", + "NU", + "PF", 
+ "PN", + "TK", + "TO", + "TV", + "WF", + "WS" + ] + }, + { + "Code": "034", + "Contains": [ + "AF", + "BD", + "BT", + "IN", + "IR", + "LK", + "MV", + "NP", + "PK" + ] + }, + { + "Code": "009", + "Contains": [ + "053", + "054", + "057", + "061", + "QO" + ] + }, + { + "Code": "QO", + "Contains": [ + "AQ", + "AC", + "CP", + "DG", + "TA" + ] + }, + { + "Code": "EU", + "Contains": [ + "AT", + "BE", + "CY", + "CZ", + "DE", + "DK", + "EE", + "ES", + "FI", + "FR", + "GR", + "HR", + "HU", + "IE", + "IT", + "LT", + "LU", + "LV", + "MT", + "NL", + "PL", + "PT", + "SE", + "SI", + "SK", + "BG", + "RO" + ] + }, + { + "Code": "EZ", + "Contains": [ + "AT", + "BE", + "CY", + "DE", + "EE", + "ES", + "FI", + "FR", + "GR", + "IE", + "IT", + "LT", + "LU", + "LV", + "MT", + "NL", + "PT", + "SI", + "SK" + ] + }, + { + "Code": "UN", + "Contains": [ + "AD", + "AE", + "AF", + "AG", + "AL", + "AM", + "AO", + "AR", + "AT", + "AU", + "AZ", + "BA", + "BB", + "BD", + "BE", + "BF", + "BG", + "BH", + "BI", + "BJ", + "BN", + "BO", + "BR", + "BS", + "BT", + "BW", + "BY", + "BZ", + "CA", + "CD", + "CF", + "CG", + "CH", + "CI", + "CL", + "CM", + "CN", + "CO", + "CR", + "CU", + "CV", + "CY", + "CZ", + "DE", + "DJ", + "DK", + "DM", + "DO", + "DZ", + "EC", + "EE", + "EG", + "ER", + "ES", + "ET", + "FI", + "FJ", + "FM", + "FR", + "GA", + "GB", + "GD", + "GE", + "GH", + "GM", + "GN", + "GQ", + "GR", + "GT", + "GW", + "GY", + "HN", + "HR", + "HT", + "HU", + "ID", + "IE", + "IL", + "IN", + "IQ", + "IR", + "IS", + "IT", + "JM", + "JO", + "JP", + "KE", + "KG", + "KH", + "KI", + "KM", + "KN", + "KP", + "KR", + "KW", + "KZ", + "LA", + "LB", + "LC", + "LI", + "LK", + "LR", + "LS", + "LT", + "LU", + "LV", + "LY", + "MA", + "MC", + "MD", + "ME", + "MG", + "MH", + "MK", + "ML", + "MM", + "MN", + "MR", + "MT", + "MU", + "MV", + "MX", + "MW", + "MY", + "MZ", + "NA", + "NE", + "NG", + "NI", + "NL", + "NO", + "NR", + "NP", + "NZ", + "OM", + "PA", + "PE", + "PG", + "PH", + "PK", + "PL", + "PT", + "PW", + "PY", + "QA", + 
"RO", + "RS", + "RU", + "RW", + "SA", + "SB", + "SC", + "SD", + "SE", + "SG", + "SI", + "SK", + "SL", + "SM", + "SN", + "SO", + "SR", + "SS", + "ST", + "SV", + "SY", + "SZ", + "TD", + "TG", + "TH", + "TJ", + "TL", + "TM", + "TN", + "TO", + "TR", + "TT", + "TV", + "TZ", + "UA", + "UG", + "US", + "UY", + "UZ", + "VC", + "VE", + "VN", + "VU", + "WS", + "YE", + "ZA", + "ZM", + "ZW" + ] + } + ] +} \ No newline at end of file diff --git a/LanguageTags/LanguageLookup.cs b/LanguageTags/LanguageLookup.cs index 3600cc1..289af5f 100644 --- a/LanguageTags/LanguageLookup.cs +++ b/LanguageTags/LanguageLookup.cs @@ -15,6 +15,7 @@ public sealed class LanguageLookup private readonly Iso6392Data _iso6392 = Iso6392Data.Create(); private readonly Iso6393Data _iso6393 = Iso6393Data.Create(); private readonly Rfc5646Data _rfc5646 = Rfc5646Data.Create(); + private readonly UnM49Data _unM49 = UnM49Data.Create(); private readonly List<(string ietf, string iso)> _overrides = []; private static CultureInfo? CreateCultureInfo(string languageTag) @@ -27,6 +28,7 @@ public sealed class LanguageLookup try { + // Get a CultureInfo representation CultureInfo cultureInfo = CultureInfo.GetCultureInfo(languageTag, true); // Make sure the culture was not custom created @@ -172,6 +174,7 @@ public string GetIsoFromIetf(string languageTag) Iso6393Record? iso6393 = _iso6393.Find(languageTag, false); if (iso6393 != null) { + // Return the Part 2B code return iso6393.Part2B!; } @@ -179,6 +182,7 @@ public string GetIsoFromIetf(string languageTag) Iso6392Record? iso6392 = _iso6392.Find(languageTag, false); if (iso6392 != null) { + // Return the Part 2B code return iso6392.Part2B!; } @@ -194,6 +198,7 @@ public string GetIsoFromIetf(string languageTag) iso6393 = _iso6393.Find(cultureInfo.ThreeLetterISOLanguageName, false); if (iso6393 != null) { + // Return the Part 2B code return iso6393.Part2B!; } @@ -208,7 +213,25 @@ public string GetIsoFromIetf(string languageTag) /// The language tag to test. 
/// true if the language tag matches or starts with the prefix; otherwise, false. /// Thrown when or is null. - public bool IsMatch(string prefix, string languageTag) + public bool IsMatch(string prefix, string languageTag) => IsMatch(prefix, languageTag, false); + + /// + /// Determines whether a language tag matches or starts with the specified prefix, optionally + /// treating a UN M.49 region group in the prefix as matching any contained region. + /// + /// + /// When is true and plain prefix matching fails, a prefix + /// with a UN M.49 region group (e.g. "es-419") matches a tag whose region is contained within + /// that group (e.g. "es-MX"). Matching is directional, the broad group in the prefix matches the + /// specific region in the tag, not the reverse. Note that "001" (World) contains every region, so + /// a prefix such as "es-001" matches any "es" tag with a region. + /// + /// The prefix to match against. + /// The language tag to test. + /// true to also match UN M.49 region containment; otherwise, false. + /// true if the language tag matches the prefix; otherwise, false. + /// Thrown when or is null. + public bool IsMatch(string prefix, string languageTag, bool regionContainment) { ArgumentNullException.ThrowIfNull(prefix); ArgumentNullException.ThrowIfNull(languageTag); @@ -228,6 +251,7 @@ public bool IsMatch(string prefix, string languageTag) // The tag matches the prefix exactly if (languageTag.Equals(prefix, StringComparison.OrdinalIgnoreCase)) { + // Exact match return true; } @@ -237,6 +261,7 @@ public bool IsMatch(string prefix, string languageTag) && languageTag[prefix.Length..].StartsWith('-') ) { + // Prefix match return true; } @@ -252,17 +277,95 @@ public bool IsMatch(string prefix, string languageTag) !string.Equals(languageTag, subtag.TagValue, StringComparison.OrdinalIgnoreCase) ) { + // Rematch languageTag = subtag.TagValue; continue; } } + // Fall back to UN M.49 region containment, e.g. 
es-419 matches es-MX + if (regionContainment && IsRegionContainmentMatch(originalPrefix, originalTag)) + { + return true; + } + // No match Log.LogPrefixMatchFailed(originalPrefix, originalTag); return false; } } + private bool IsRegionContainmentMatch(string prefix, string languageTag) + { + // Both tags must parse + LanguageTag? prefixTag = LanguageTag.Parse(prefix); + LanguageTag? candidateTag = LanguageTag.Parse(languageTag); + if (prefixTag == null || candidateTag == null) + { + return false; + } + + // The prefix region must be a UN M.49 group (3 digits) and the candidate must have a region + if ( + prefixTag.Region.Length != 3 + || !prefixTag.Region.All(char.IsAsciiDigit) + || string.IsNullOrEmpty(candidateTag.Region) + ) + { + return false; + } + + // The language portion must be the same, only the region differs + if ( + !prefixTag.Language.Equals(candidateTag.Language, StringComparison.OrdinalIgnoreCase) + || !prefixTag.ExtendedLanguage.Equals( + candidateTag.ExtendedLanguage, + StringComparison.OrdinalIgnoreCase + ) + || !prefixTag.Script.Equals(candidateTag.Script, StringComparison.OrdinalIgnoreCase) + ) + { + return false; + } + + // The candidate region must be contained within the prefix region group + return _unM49.Contains(prefixTag.Region, candidateTag.Region); + } + + /// + /// Expands the region of a language tag into the tag plus a variant for each containing UN M.49 group. + /// + /// + /// For example "es-MX" expands to "es-MX" followed by one variant per containing group: "es-013", "es-419", "es-019", and "es-001", plus overlay groups present in the data such as "es-003" and "es-UN". A tag with + /// no region, or one that cannot be parsed, yields only the original tag. The expanded tags can be + /// matched with plain string comparison without enabling region containment in . + /// + /// The language tag to expand. + /// The original tag followed by a region substituted variant for each containing group. + /// Thrown when is null. 
+ public IEnumerable ExpandRegion(string languageTag) + { + ArgumentNullException.ThrowIfNull(languageTag); + + // Always include the original tag + List expanded = [languageTag]; + + // Parse and expand the region into its containing UN M.49 groups + LanguageTag? parsed = LanguageTag.Parse(languageTag); + if (parsed == null || string.IsNullOrEmpty(parsed.Region)) + { + return expanded; + } + + // Substitute each ancestor region, e.g. es-MX -> es-013, es-419, es-019, es-001 + foreach (string ancestor in _unM49.GetAncestors(parsed.Region)) + { + LanguageTag variant = new(parsed) { Region = ancestor }; + expanded.Add(variant.ToString()); + } + return expanded; + } + /// /// Determines if two language tags are equivalent (case-insensitive). /// diff --git a/LanguageTags/LanguageSchema.cs b/LanguageTags/LanguageSchema.cs index 995d171..b507611 100644 --- a/LanguageTags/LanguageSchema.cs +++ b/LanguageTags/LanguageSchema.cs @@ -35,4 +35,5 @@ internal static string GetCodeGenString(IEnumerable list) => [JsonSerializable(typeof(Iso6392Data))] [JsonSerializable(typeof(Iso6393Data))] [JsonSerializable(typeof(Rfc5646Data))] +[JsonSerializable(typeof(UnM49Data))] internal partial class LanguageJsonContext : JsonSerializerContext; diff --git a/LanguageTags/LanguageTagParser.cs b/LanguageTags/LanguageTagParser.cs index 7b0ddbb..40c4ce4 100644 --- a/LanguageTags/LanguageTagParser.cs +++ b/LanguageTags/LanguageTagParser.cs @@ -30,6 +30,7 @@ private string ParseGrandfathered(string languageTag) // Grandfathered and Redundant Registrations // https://www.rfc-editor.org/rfc/rfc5646#section-2.2.8 + // Search tag registry // Type = Grandfathered, Tag = i-navajo, PreferredValue = nv List recordList = [ @@ -46,21 +47,25 @@ .. 
_rfc5646.RecordList.Where(record => return recordList[0].PreferredValue!; } + // No match return languageTag; } private static void SetCase(LanguageTag languageTag) { + // Language lowercase if (!string.IsNullOrEmpty(languageTag.Language)) { languageTag.Language = languageTag.Language.ToLowerInvariant(); } + // Extended language lowercase if (!string.IsNullOrEmpty(languageTag.ExtendedLanguage)) { languageTag.ExtendedLanguage = languageTag.ExtendedLanguage.ToLowerInvariant(); } + // Script title case if (!string.IsNullOrEmpty(languageTag.Script)) { languageTag.Script = CultureInfo.InvariantCulture.TextInfo.ToTitleCase( @@ -68,28 +73,34 @@ private static void SetCase(LanguageTag languageTag) ); } + // Region uppercase if (!string.IsNullOrEmpty(languageTag.Region)) { languageTag.Region = languageTag.Region.ToUpperInvariant(); } + // Variants lowercase for (int i = 0; i < languageTag._variants.Count; i++) { languageTag._variants[i] = languageTag._variants[i].ToLowerInvariant(); } + // Extensions lowercase and normalize for (int i = 0; i < languageTag._extensions.Count; i++) { languageTag._extensions[i] = languageTag._extensions[i].Normalize(); } + // Private use lowercase and normalize languageTag.PrivateUse = languageTag.PrivateUse.Normalize(); } private static void Sort(LanguageTag languageTag) { + // Sort variants languageTag._variants.Sort(); + // Sort extensions by prefix languageTag._extensions.Sort((x, y) => x.Prefix.CompareTo(y.Prefix)); // Note: Extension tags and private use tags are already sorted by Normalize() @@ -125,12 +136,14 @@ private bool ParseLanguage() _languageTag.Language = _tagList[0]; _tagList.RemoveAt(0); + // Done return true; } private static bool ValidateExtendedLanguage(string tag) => - // 3 chars - !string.IsNullOrEmpty(tag) && tag.Length == 3; + // 3 alpha + // A 3 digit value is a UN M.49 region, not an extended language + !string.IsNullOrEmpty(tag) && tag.Length == 3 && tag.All(char.IsAsciiLetter); private bool ParseExtendedLanguage() 
{ @@ -147,12 +160,14 @@ private bool ParseExtendedLanguage() // Language is 2 or 3 chars if (!ValidateExtendedLanguage(_tagList[0]) || _languageTag.Language.Length is < 2 or > 3) { + // Done return true; } _languageTag.ExtendedLanguage = _tagList[0]; _tagList.RemoveAt(0); + // Done return true; } @@ -174,12 +189,14 @@ private bool ParseScript() // Qaaa - Qabx reserved private use if (!ValidateScript(_tagList[0])) { + // Done return true; } _languageTag.Script = _tagList[0]; _tagList.RemoveAt(0); + // Done return true; } @@ -209,12 +226,14 @@ private bool ParseRegion() // 3 digit UN M.49 if (!ValidateRegion(_tagList[0])) { + // Done return true; } _languageTag.Region = _tagList[0]; _tagList.RemoveAt(0); + // Done return true; } @@ -245,6 +264,7 @@ private bool ParseVariant() // begin with digit 4 = 8 chars if (!ValidateVariant(_tagList[0])) { + // Done return true; } @@ -254,10 +274,12 @@ private bool ParseVariant() return false; } + // Add variant tag _languageTag._variants.Add(_tagList[0]); _tagList.RemoveAt(0); } + // Done return true; } @@ -289,6 +311,7 @@ private bool ParseExtension() // 1 char (not x) if (!ValidateExtensionPrefix(_tagList[0])) { + // Done return true; } @@ -320,6 +343,7 @@ private bool ParseExtension() return false; } + // Add extension tag extensionTags.Add(_tagList[0]); _tagList.RemoveAt(0); } @@ -330,9 +354,11 @@ private bool ParseExtension() return false; } + // Add extension tag _languageTag._extensions.Add(new ExtensionTag(prefix, extensionTags)); } + // Done return true; } @@ -356,6 +382,7 @@ private bool ParsePrivateUse() // x-[private]-[private] if (!ValidatePrivateUsePrefix(_tagList[0])) { + // Done return true; } @@ -377,6 +404,7 @@ private bool ParsePrivateUse() // Collect all private use tags List privateTags = []; + // Read all tags while (_tagList.Count > 0) { // 1 to 8 chars @@ -392,6 +420,7 @@ private bool ParsePrivateUse() return false; } + // Add private use tag privateTags.Add(_tagList[0]); _tagList.RemoveAt(0); } @@ -402,8 
+431,10 @@ private bool ParsePrivateUse() return false; } + // Create private use tag _languageTag.PrivateUse = new PrivateUseTag(privateTags); + // Done return true; } @@ -426,6 +457,7 @@ private bool ParsePrivateUse() ["-" privateuse] */ + // Init _languageTag = new(); _tagList.Clear(); string originalTag = languageTag; @@ -447,6 +479,7 @@ private bool ParsePrivateUse() // Grandfathered languageTag = ParseGrandfathered(languageTag); + // Split by - _tagList.AddRange([.. languageTag.Split('-')]); if (_tagList.Count == 0) { @@ -712,6 +745,7 @@ .. _rfc5646.RecordList.Where(record => Log.LogNormalizedTag(originalTag, normalizedTag); } + // Done return normalizeTag; } @@ -794,6 +828,12 @@ internal static bool Validate(LanguageTag languageTag) } // No empty tags - return !string.IsNullOrEmpty(languageTag.ToString()); + if (string.IsNullOrEmpty(languageTag.ToString())) + { + return false; + } + + // Done + return true; } } diff --git a/LanguageTags/UnM49Data.cs b/LanguageTags/UnM49Data.cs new file mode 100644 index 0000000..552b177 --- /dev/null +++ b/LanguageTags/UnM49Data.cs @@ -0,0 +1,417 @@ +using System.Runtime.CompilerServices; +using System.Text.Json.Serialization; +using System.Xml; + +namespace ptr727.LanguageTags; + +/// +/// Provides access to UN M.49 region containment data. +/// +public sealed partial class UnM49Data +{ + internal const string DataUri = + "https://raw.githubusercontent.com/unicode-org/cldr/main/common/supplemental/supplementalData.xml"; + internal const string DataFileName = "unm49"; + + private readonly Lazy _logger = new(LogOptions.CreateLogger); + internal ILogger Log => _logger.Value; + + // Lazily built transitive ancestor index, code -> all containing group codes + private readonly Lazy>> _ancestorIndex; + + [JsonConstructor] + internal UnM49Data() => _ancestorIndex = new(BuildAncestorIndex); + + /// + /// Gets the collection of UN M.49 containment records. 
+ /// + [JsonInclude] + public ImmutableArray RecordList { get; internal set; } = []; + + /// + /// Creates a instance from a data file asynchronously. + /// + /// The path to the data file. + /// The loaded . + /// Thrown when the file cannot be read. + /// Thrown when the file contains invalid data. + public static async Task FromDataAsync(string fileName) + { + UnM49Data unM49Data = new(); + await unM49Data.LoadDataAsync(fileName).ConfigureAwait(false); + return unM49Data; + } + + [System.Diagnostics.CodeAnalysis.SuppressMessage( + "Reliability", + "CA2007:Consider calling ConfigureAwait on the awaited task", + Justification = "https://github.com/dotnet/roslyn-analyzers/issues/7185" + )] + private async Task LoadDataAsync(string fileName) + { + // UN M.49 territory containment from the CLDR supplemental data + // https://github.com/unicode-org/cldr/blob/main/common/supplemental/supplementalData.xml + // + // + // + // Numeric types are UN M.49 region codes, 2 letter values are ISO 3166-1 country codes + // The 419 Latin America macro region we care about is a grouping="true" overlay, so + // grouping overlays must be kept, only status="deprecated" containment is skipped + + try + { + // Read group elements within the territoryContainment section + List recordList = []; + await using FileStream fileStream = new( + fileName, + FileMode.Open, + FileAccess.Read, + FileShare.Read, + 4096, + FileOptions.Asynchronous | FileOptions.SequentialScan + ); + + // XmlReader is AOT safe, do not use the reflection based XmlSerializer + // CLDR data references an external DTD, ignore it instead of resolving it + XmlReaderSettings settings = new() + { + Async = true, + DtdProcessing = DtdProcessing.Ignore, + XmlResolver = null, + IgnoreComments = true, + IgnoreWhitespace = true, + }; + using XmlReader reader = XmlReader.Create(fileStream, settings); + + // Only process group elements inside the territoryContainment section + bool inContainment = false; + while (await 
reader.ReadAsync().ConfigureAwait(false)) + { + // Enter the territoryContainment section + if ( + reader.NodeType == XmlNodeType.Element + && reader.Name.Equals("territoryContainment", StringComparison.Ordinal) + ) + { + inContainment = true; + continue; + } + + // The section is unique, stop reading the rest of the large file + if ( + reader.NodeType == XmlNodeType.EndElement + && reader.Name.Equals("territoryContainment", StringComparison.Ordinal) + ) + { + break; + } + + // Group element + if ( + !inContainment + || reader.NodeType != XmlNodeType.Element + || !reader.Name.Equals("group", StringComparison.Ordinal) + ) + { + continue; + } + + // Skip deprecated containment, keep canonical and grouping overlays + string? status = reader.GetAttribute("status"); + if (string.Equals(status, "deprecated", StringComparison.Ordinal)) + { + continue; + } + + // type is the parent code, contains is the space separated child codes + string? type = reader.GetAttribute("type"); + string? contains = reader.GetAttribute("contains"); + if (string.IsNullOrEmpty(type) || string.IsNullOrEmpty(contains)) + { + continue; + } + + // Populate record + recordList.Add( + new UnM49Record + { + Code = type, + Contains = + [ + .. contains.Split( + ' ', + StringSplitOptions.RemoveEmptyEntries + | StringSplitOptions.TrimEntries + ), + ], + } + ); + } + + if (recordList.Count == 0) + { + Log.LogDataLoadEmpty(nameof(UnM49Data), fileName); + throw new InvalidDataException($"No data found in UN M.49 file: {fileName}"); + } + + RecordList = [.. recordList]; + Log.LogDataLoaded(nameof(UnM49Data), fileName, RecordList.Length); + } + catch (Exception exception) + { + Log.LogDataLoadFailed(nameof(UnM49Data), fileName, exception); + throw; + } + } + + /// + /// Creates a instance from a JSON file asynchronously. + /// + /// The path to the JSON file. + /// The loaded . + /// Thrown when the file contains invalid data. + /// Thrown when the file cannot be read. + /// Thrown when the JSON is invalid. 
+ [System.Diagnostics.CodeAnalysis.SuppressMessage( + "Reliability", + "CA2007:Consider calling ConfigureAwait on the awaited task", + Justification = "https://github.com/dotnet/roslyn-analyzers/issues/7185" + )] + public static async Task FromJsonAsync(string fileName) + { + ILogger logger = LogOptions.CreateLogger(); + try + { + await using FileStream fileStream = new( + fileName, + FileMode.Open, + FileAccess.Read, + FileShare.Read, + 4096, + FileOptions.Asynchronous | FileOptions.SequentialScan + ); + UnM49Data? data = await JsonSerializer + .DeserializeAsync(fileStream, LanguageJsonContext.Default.UnM49Data) + .ConfigureAwait(false); + if (data == null) + { + logger.LogDataLoadEmpty(nameof(UnM49Data), fileName); + throw new InvalidDataException($"No data found in UN M.49 file: {fileName}"); + } + + logger.LogDataLoaded(nameof(UnM49Data), fileName, data.RecordList.Length); + return data; + } + catch (Exception exception) + { + logger.LogDataLoadFailed(nameof(UnM49Data), fileName, exception); + throw; + } + } + + [System.Diagnostics.CodeAnalysis.SuppressMessage( + "Reliability", + "CA2007:Consider calling ConfigureAwait on the awaited task", + Justification = "https://github.com/dotnet/roslyn-analyzers/issues/7185" + )] + internal async Task SaveJsonAsync(string fileName) + { + await using FileStream fileStream = new( + fileName, + FileMode.Create, + FileAccess.Write, + FileShare.None, + 4096, + FileOptions.Asynchronous | FileOptions.SequentialScan + ); + await JsonSerializer + .SerializeAsync(fileStream, this, LanguageJsonContext.Default.UnM49Data) + .ConfigureAwait(false); + } + + internal async Task SaveCodeAsync(string fileName) + { + using StreamWriter writer = new( + new FileStream( + fileName, + FileMode.Create, + FileAccess.Write, + FileShare.None, + 4096, + FileOptions.Asynchronous | FileOptions.SequentialScan + ), + new UTF8Encoding(false) + ) + { + NewLine = "\r\n", + }; + + await WriteLineAsync("namespace ptr727.LanguageTags;"); + await 
WriteLineAsync(string.Empty); + await WriteLineAsync("/// "); + await WriteLineAsync("/// Provides access to UN M.49 region containment data."); + await WriteLineAsync("/// "); + await WriteLineAsync( + $"[System.CodeDom.Compiler.GeneratedCode(\"{typeof(UnM49Data).FullName}\", \"1.0\")]" + ); + await WriteLineAsync("public sealed partial class UnM49Data"); + await WriteLineAsync("{"); + await WriteLineAsync(" /// "); + await WriteLineAsync( + " /// Creates an instance loaded from the embedded UN M.49 dataset." + ); + await WriteLineAsync(" /// "); + await WriteLineAsync( + " /// The populated instance." + ); + await WriteLineAsync(" public static UnM49Data Create() =>"); + await WriteLineAsync(" new()"); + await WriteLineAsync(" {"); + await WriteLineAsync(" RecordList ="); + await WriteLineAsync(" ["); + + foreach (UnM49Record record in RecordList) + { + await WriteLineAsync(" new()"); + await WriteLineAsync(" {"); + await WriteLineAsync( + $" Code = {LanguageSchema.GetCodeGenString(record.Code)}," + ); + await WriteLineAsync( + $" Contains = {LanguageSchema.GetCodeGenString(record.Contains)}," + ); + await WriteLineAsync(" },"); + } + + await WriteLineAsync(" ],"); + await WriteLineAsync(" };"); + await WriteLineAsync("}"); + return; + + ConfiguredTaskAwaitable WriteLineAsync(string value) => + writer.WriteLineAsync(value).ConfigureAwait(false); + } + + /// + /// Finds a UN M.49 containment record by region code. + /// + /// The region code to search for (e.g. "419" or "013"). + /// The first matching , or null when no match is found. + public UnM49Record? Find(string code) + { + if (string.IsNullOrEmpty(code)) + { + Log.LogFindRecordNotFound(nameof(UnM49Data), code, false); + return null; + } + + // Find the matching containment group + UnM49Record? 
record = RecordList.FirstOrDefault(item => + !string.IsNullOrEmpty(item.Code) + && item.Code.Equals(code, StringComparison.OrdinalIgnoreCase) + ); + if (record != null) + { + Log.LogFindRecordFound(nameof(UnM49Data), code, false); + return record; + } + + // Not found + Log.LogFindRecordNotFound(nameof(UnM49Data), code, false); + return null; + } + + /// + /// Gets the transitive set of UN M.49 group codes that contain the specified code. + /// + /// + /// For a country code the result is every group that contains it, directly or transitively, e.g. "MX" yields + /// "013" (Central America), "419" (Latin America and the Caribbean), "019" (Americas), and "001" (World), + /// plus overlay groups in the data such as "003" (North America) and "UN". The codes are returned in breadth-first order, directly containing groups first. + /// + /// The region or country code to resolve. + /// The ancestor group codes, or an empty list when the code is unknown. + public IReadOnlyList GetAncestors(string code) => + string.IsNullOrEmpty(code) ? [] + : _ancestorIndex.Value.TryGetValue(code, out ImmutableArray ancestors) ? ancestors + : []; + + /// + /// Determines whether a region or country code is contained within a UN M.49 group. + /// + /// The containing group code (e.g. "419"). + /// The region or country code to test (e.g. "MX"). + /// true when is transitively contained in ; otherwise, false. + public bool Contains(string groupCode, string code) => + !string.IsNullOrEmpty(groupCode) + && !string.IsNullOrEmpty(code) + && GetAncestors(code).Contains(groupCode, StringComparer.OrdinalIgnoreCase); + + private Dictionary> BuildAncestorIndex() + { + // Map each child code to its direct parent group codes + Dictionary> parents = new(StringComparer.OrdinalIgnoreCase); + HashSet allCodes = new(StringComparer.OrdinalIgnoreCase); + foreach (UnM49Record record in RecordList) + { + _ = allCodes.Add(record.Code); + foreach (string child in record.Contains) + { + _ = allCodes.Add(child); + if (!parents.TryGetValue(child, out List? 
list)) + { + list = []; + parents[child] = list; + } + if (!list.Contains(record.Code, StringComparer.OrdinalIgnoreCase)) + { + list.Add(record.Code); + } + } + } + + // Walk up the parent map to collect transitive ancestors, guarding against cycles + Dictionary> index = new(StringComparer.OrdinalIgnoreCase); + foreach (string code in allCodes) + { + List ordered = []; + HashSet visited = new(StringComparer.OrdinalIgnoreCase) { code }; + Queue queue = new(); + queue.Enqueue(code); + while (queue.Count > 0) + { + string current = queue.Dequeue(); + if (!parents.TryGetValue(current, out List? currentParents)) + { + continue; + } + foreach (string parent in currentParents) + { + if (visited.Add(parent)) + { + ordered.Add(parent); + queue.Enqueue(parent); + } + } + } + index[code] = [.. ordered]; + } + return index; + } +} + +/// +/// Represents a UN M.49 region containment record. +/// +public sealed record UnM49Record +{ + /// + /// Gets the region code (UN M.49 numeric code, e.g. "419" or "013"). + /// + public string Code { get; init; } = string.Empty; + + /// + /// Gets the codes directly contained by this region (region or ISO 3166-1 country codes). + /// + public ImmutableArray Contains { get; init; } = []; +} diff --git a/LanguageTags/UnM49DataGen.cs b/LanguageTags/UnM49DataGen.cs new file mode 100644 index 0000000..f87a8ec --- /dev/null +++ b/LanguageTags/UnM49DataGen.cs @@ -0,0 +1,569 @@ +namespace ptr727.LanguageTags; + +/// +/// Provides access to UN M.49 region containment data. +/// +[System.CodeDom.Compiler.GeneratedCode("ptr727.LanguageTags.UnM49Data", "1.0")] +public sealed partial class UnM49Data +{ + /// + /// Creates an instance loaded from the embedded UN M.49 dataset. + /// + /// The populated instance. 
+ public static UnM49Data Create() => + new() + { + RecordList = + [ + new() { Code = "001", Contains = [@"019", @"002", @"150", @"142", @"009"] }, + new() { Code = "001", Contains = [@"EU", @"EZ", @"UN"] }, + new() + { + Code = "011", + Contains = + [ + @"BF", + @"BJ", + @"CI", + @"CV", + @"GH", + @"GM", + @"GN", + @"GW", + @"LR", + @"ML", + @"MR", + @"NE", + @"NG", + @"SH", + @"SL", + @"SN", + @"TG", + ], + }, + new() + { + Code = "013", + Contains = [@"BZ", @"CR", @"GT", @"HN", @"MX", @"NI", @"PA", @"SV"], + }, + new() + { + Code = "014", + Contains = + [ + @"BI", + @"DJ", + @"ER", + @"ET", + @"IO", + @"KE", + @"KM", + @"MG", + @"MU", + @"MW", + @"MZ", + @"RE", + @"RW", + @"SC", + @"SO", + @"SS", + @"TF", + @"TZ", + @"UG", + @"YT", + @"ZM", + @"ZW", + ], + }, + new() { Code = "142", Contains = [@"145", @"143", @"030", @"034", @"035"] }, + new() { Code = "143", Contains = [@"TM", @"TJ", @"KG", @"KZ", @"UZ"] }, + new() + { + Code = "145", + Contains = + [ + @"AE", + @"AM", + @"AZ", + @"BH", + @"CY", + @"GE", + @"IL", + @"IQ", + @"JO", + @"KW", + @"LB", + @"OM", + @"PS", + @"QA", + @"SA", + @"SY", + @"TR", + @"YE", + ], + }, + new() + { + Code = "015", + Contains = [@"DZ", @"EG", @"EH", @"LY", @"MA", @"SD", @"TN", @"EA", @"IC"], + }, + new() { Code = "150", Contains = [@"154", @"155", @"151", @"039"] }, + new() + { + Code = "151", + Contains = + [ + @"BG", + @"BY", + @"CZ", + @"HU", + @"MD", + @"PL", + @"RO", + @"RU", + @"SK", + @"UA", + ], + }, + new() + { + Code = "154", + Contains = + [ + @"GG", + @"IM", + @"JE", + @"AX", + @"DK", + @"EE", + @"FI", + @"FO", + @"GB", + @"IE", + @"IS", + @"LT", + @"LV", + @"NO", + @"SE", + @"SJ", + @"CQ", + ], + }, + new() + { + Code = "155", + Contains = [@"AT", @"BE", @"CH", @"DE", @"FR", @"LI", @"LU", @"MC", @"NL"], + }, + new() + { + Code = "017", + Contains = [@"AO", @"CD", @"CF", @"CG", @"CM", @"GA", @"GQ", @"ST", @"TD"], + }, + new() { Code = "018", Contains = [@"BW", @"LS", @"NA", @"SZ", @"ZA"] }, + new() { Code = "019", 
Contains = [@"021", @"013", @"029", @"005"] }, + new() { Code = "019", Contains = [@"003", @"419"] }, + new() { Code = "002", Contains = [@"015", @"011", @"017", @"014", @"018"] }, + new() { Code = "002", Contains = [@"202"] }, + new() { Code = "202", Contains = [@"011", @"017", @"014", @"018"] }, + new() { Code = "021", Contains = [@"BM", @"CA", @"GL", @"PM", @"US"] }, + new() + { + Code = "029", + Contains = + [ + @"AG", + @"AI", + @"AW", + @"BB", + @"BL", + @"BQ", + @"BS", + @"CU", + @"CW", + @"DM", + @"DO", + @"GD", + @"GP", + @"HT", + @"JM", + @"KN", + @"KY", + @"LC", + @"MF", + @"MQ", + @"MS", + @"PR", + @"SX", + @"TC", + @"TT", + @"VC", + @"VG", + @"VI", + ], + }, + new() { Code = "003", Contains = [@"021", @"013", @"029"] }, + new() + { + Code = "030", + Contains = [@"CN", @"HK", @"JP", @"KP", @"KR", @"MN", @"MO", @"TW"], + }, + new() + { + Code = "035", + Contains = + [ + @"BN", + @"ID", + @"KH", + @"LA", + @"MM", + @"MY", + @"PH", + @"SG", + @"TH", + @"TL", + @"VN", + ], + }, + new() + { + Code = "039", + Contains = + [ + @"AD", + @"AL", + @"BA", + @"ES", + @"GI", + @"GR", + @"HR", + @"IT", + @"ME", + @"MK", + @"MT", + @"RS", + @"PT", + @"SI", + @"SM", + @"VA", + @"XK", + ], + }, + new() { Code = "419", Contains = [@"013", @"029", @"005"] }, + new() + { + Code = "005", + Contains = + [ + @"AR", + @"BO", + @"BR", + @"BV", + @"CL", + @"CO", + @"EC", + @"FK", + @"GF", + @"GS", + @"GY", + @"PE", + @"PY", + @"SR", + @"UY", + @"VE", + ], + }, + new() { Code = "053", Contains = [@"AU", @"CC", @"CX", @"HM", @"NF", @"NZ"] }, + new() { Code = "054", Contains = [@"FJ", @"NC", @"PG", @"SB", @"VU"] }, + new() + { + Code = "057", + Contains = [@"FM", @"GU", @"KI", @"MH", @"MP", @"NR", @"PW", @"UM"], + }, + new() + { + Code = "061", + Contains = + [ + @"AS", + @"CK", + @"NU", + @"PF", + @"PN", + @"TK", + @"TO", + @"TV", + @"WF", + @"WS", + ], + }, + new() + { + Code = "034", + Contains = [@"AF", @"BD", @"BT", @"IN", @"IR", @"LK", @"MV", @"NP", @"PK"], + }, + new() { 
Code = "009", Contains = [@"053", @"054", @"057", @"061", @"QO"] }, + new() { Code = "QO", Contains = [@"AQ", @"AC", @"CP", @"DG", @"TA"] }, + new() + { + Code = "EU", + Contains = + [ + @"AT", + @"BE", + @"CY", + @"CZ", + @"DE", + @"DK", + @"EE", + @"ES", + @"FI", + @"FR", + @"GR", + @"HR", + @"HU", + @"IE", + @"IT", + @"LT", + @"LU", + @"LV", + @"MT", + @"NL", + @"PL", + @"PT", + @"SE", + @"SI", + @"SK", + @"BG", + @"RO", + ], + }, + new() + { + Code = "EZ", + Contains = + [ + @"AT", + @"BE", + @"CY", + @"DE", + @"EE", + @"ES", + @"FI", + @"FR", + @"GR", + @"IE", + @"IT", + @"LT", + @"LU", + @"LV", + @"MT", + @"NL", + @"PT", + @"SI", + @"SK", + ], + }, + new() + { + Code = "UN", + Contains = + [ + @"AD", + @"AE", + @"AF", + @"AG", + @"AL", + @"AM", + @"AO", + @"AR", + @"AT", + @"AU", + @"AZ", + @"BA", + @"BB", + @"BD", + @"BE", + @"BF", + @"BG", + @"BH", + @"BI", + @"BJ", + @"BN", + @"BO", + @"BR", + @"BS", + @"BT", + @"BW", + @"BY", + @"BZ", + @"CA", + @"CD", + @"CF", + @"CG", + @"CH", + @"CI", + @"CL", + @"CM", + @"CN", + @"CO", + @"CR", + @"CU", + @"CV", + @"CY", + @"CZ", + @"DE", + @"DJ", + @"DK", + @"DM", + @"DO", + @"DZ", + @"EC", + @"EE", + @"EG", + @"ER", + @"ES", + @"ET", + @"FI", + @"FJ", + @"FM", + @"FR", + @"GA", + @"GB", + @"GD", + @"GE", + @"GH", + @"GM", + @"GN", + @"GQ", + @"GR", + @"GT", + @"GW", + @"GY", + @"HN", + @"HR", + @"HT", + @"HU", + @"ID", + @"IE", + @"IL", + @"IN", + @"IQ", + @"IR", + @"IS", + @"IT", + @"JM", + @"JO", + @"JP", + @"KE", + @"KG", + @"KH", + @"KI", + @"KM", + @"KN", + @"KP", + @"KR", + @"KW", + @"KZ", + @"LA", + @"LB", + @"LC", + @"LI", + @"LK", + @"LR", + @"LS", + @"LT", + @"LU", + @"LV", + @"LY", + @"MA", + @"MC", + @"MD", + @"ME", + @"MG", + @"MH", + @"MK", + @"ML", + @"MM", + @"MN", + @"MR", + @"MT", + @"MU", + @"MV", + @"MX", + @"MW", + @"MY", + @"MZ", + @"NA", + @"NE", + @"NG", + @"NI", + @"NL", + @"NO", + @"NR", + @"NP", + @"NZ", + @"OM", + @"PA", + @"PE", + @"PG", + @"PH", + @"PK", + @"PL", + @"PT", + @"PW", + 
@"PY", + @"QA", + @"RO", + @"RS", + @"RU", + @"RW", + @"SA", + @"SB", + @"SC", + @"SD", + @"SE", + @"SG", + @"SI", + @"SK", + @"SL", + @"SM", + @"SN", + @"SO", + @"SR", + @"SS", + @"ST", + @"SV", + @"SY", + @"SZ", + @"TD", + @"TG", + @"TH", + @"TJ", + @"TL", + @"TM", + @"TN", + @"TO", + @"TR", + @"TT", + @"TV", + @"TZ", + @"UA", + @"UG", + @"US", + @"UY", + @"UZ", + @"VC", + @"VE", + @"VN", + @"VU", + @"WS", + @"YE", + @"ZA", + @"ZM", + @"ZW", + ], + }, + ], + }; +} diff --git a/LanguageTagsCreate/CreateTagData.cs b/LanguageTagsCreate/CreateTagData.cs index c0ab7a2..78fd33e 100644 --- a/LanguageTagsCreate/CreateTagData.cs +++ b/LanguageTagsCreate/CreateTagData.cs @@ -18,6 +18,10 @@ CancellationToken cancellationToken private string? _rfc5646DataFile; private string? _rfc5646JsonFile; private string? _rfc5646CodeFile; + private UnM49Data? _unM49; + private string? _unM49DataFile; + private string? _unM49JsonFile; + private string? _unM49CodeFile; internal async Task DownloadDataAsync() { @@ -39,6 +43,10 @@ await DownloadFileAsync(new Uri(Iso6393Data.DataUri), _iso6393DataFile) await DownloadFileAsync(new Uri(Rfc5646Data.DataUri), _rfc5646DataFile) .ConfigureAwait(false); + Log.Information("Downloading UN M.49 data ..."); + _unM49DataFile = Path.Combine(dataDirectory, UnM49Data.DataFileName); + await DownloadFileAsync(new Uri(UnM49Data.DataUri), _unM49DataFile).ConfigureAwait(false); + Log.Information("Language tag data files downloaded successfully."); } @@ -47,6 +55,7 @@ internal async Task CreateJsonDataAsync() ArgumentNullException.ThrowIfNull(_iso6392DataFile); ArgumentNullException.ThrowIfNull(_iso6393DataFile); ArgumentNullException.ThrowIfNull(_rfc5646DataFile); + ArgumentNullException.ThrowIfNull(_unM49DataFile); // Convert data files to JSON Log.Information("Converting data files to JSON ..."); @@ -69,6 +78,12 @@ internal async Task CreateJsonDataAsync() Log.Information("Writing RFC 5646 data to {JsonPath}", _rfc5646JsonFile); await 
_rfc5646.SaveJsonAsync(_rfc5646JsonFile).ConfigureAwait(false); + Log.Information("Converting UN M.49 data to JSON ..."); + _unM49 = await UnM49Data.FromDataAsync(_unM49DataFile).ConfigureAwait(false); + _unM49JsonFile = Path.Combine(dataDirectory, UnM49Data.DataFileName + ".json"); + Log.Information("Writing UN M.49 data to {JsonPath}", _unM49JsonFile); + await _unM49.SaveJsonAsync(_unM49JsonFile).ConfigureAwait(false); + Log.Information("Data files converted to JSON successfully."); } @@ -77,6 +92,7 @@ internal async Task GenerateCodeAsync() ArgumentNullException.ThrowIfNull(_iso6392); ArgumentNullException.ThrowIfNull(_iso6393); ArgumentNullException.ThrowIfNull(_rfc5646); + ArgumentNullException.ThrowIfNull(_unM49); // Generate code files Log.Information("Generating code files ..."); @@ -96,6 +112,11 @@ internal async Task GenerateCodeAsync() Log.Information("Writing RFC 5646 code to {CodePath}", _rfc5646CodeFile); await _rfc5646.SaveCodeAsync(_rfc5646CodeFile).ConfigureAwait(false); + Log.Information("Generating UN M.49 code ..."); + _unM49CodeFile = Path.Combine(codeDirectory, nameof(UnM49Data) + "Gen.cs"); + Log.Information("Writing UN M.49 code to {CodePath}", _unM49CodeFile); + await _unM49.SaveCodeAsync(_unM49CodeFile).ConfigureAwait(false); + Log.Information("Code files generated successfully."); } diff --git a/LanguageTagsTests/LanguageLookupTests.cs b/LanguageTagsTests/LanguageLookupTests.cs index 55c4d57..42f30f4 100644 --- a/LanguageTagsTests/LanguageLookupTests.cs +++ b/LanguageTagsTests/LanguageLookupTests.cs @@ -62,6 +62,53 @@ public void IsMatch(string prefix, string tag, bool match) _ = languageLookup.IsMatch(prefix, tag).Should().Be(match); } + [Theory] + [InlineData("es-419", "es-MX", true)] // Mexico is in Latin America + [InlineData("es-MX", "es-419", false)] // Directional, not the reverse + [InlineData("es-419", "es-ES", false)] // Spain is not in Latin America + [InlineData("es-005", "es-AR", true)] // Argentina is in South America + 
[InlineData("es-013", "es-MX", true)] // Mexico is in Central America + [InlineData("es-419", "es-013", true)] // Central America is within Latin America + [InlineData("es-419", "es-419", true)] // Identity still matches + [InlineData("es-419", "fr-MX", false)] // Language must match + [InlineData("es-001", "es-MX", true)] // World contains every region + [InlineData("en", "en-US", true)] // Plain matching still works + public void IsMatch_RegionContainment(string prefix, string tag, bool match) + { + LanguageLookup languageLookup = new(); + _ = languageLookup.IsMatch(prefix, tag, true).Should().Be(match); + } + + [Theory] + [InlineData("es-419", "es-MX")] // Containment is opt-in, plain matching does not expand regions + [InlineData("es-005", "es-AR")] + public void IsMatch_RegionContainment_Disabled_DoesNotMatch(string prefix, string tag) + { + LanguageLookup languageLookup = new(); + _ = languageLookup.IsMatch(prefix, tag).Should().BeFalse(); + _ = languageLookup.IsMatch(prefix, tag, false).Should().BeFalse(); + } + + [Fact] + public void ExpandRegion_Country_ReturnsContainmentChain() + { + LanguageLookup languageLookup = new(); + List expanded = [.. 
languageLookup.ExpandRegion("es-MX")]; + + // The original tag is always first, followed by the containing UN M.49 groups + _ = expanded[0].Should().Be("es-MX"); + _ = expanded.Should().Contain(["es-MX", "es-013", "es-419", "es-019", "es-001"]); + } + + [Theory] + [InlineData("es")] // No region to expand + [InlineData("es-MX-x-foo")] // Region present, private use preserved + public void ExpandRegion_AlwaysIncludesOriginal(string tag) + { + LanguageLookup languageLookup = new(); + _ = languageLookup.ExpandRegion(tag).Should().Contain(tag); + } + [Theory] [InlineData("en-US", "en-us", true)] [InlineData("en-US", "EN-US", true)] diff --git a/LanguageTagsTests/UnM49Tests.cs b/LanguageTagsTests/UnM49Tests.cs new file mode 100644 index 0000000..63c4929 --- /dev/null +++ b/LanguageTagsTests/UnM49Tests.cs @@ -0,0 +1,131 @@ +namespace ptr727.LanguageTags.Tests; + +public sealed class UnM49Tests : SingleInstanceFixture +{ + [Fact] + public void Create() + { + // Create full list of containment records + UnM49Data unM49 = UnM49Data.Create(); + _ = unM49.RecordList.Length.Should().BeGreaterThan(0); + } + + [Fact] + public async Task FromData() + { + UnM49Data unM49 = await UnM49Data.FromDataAsync(GetDataFilePath(UnM49Data.DataFileName)); + _ = unM49.Should().NotBeNull(); + _ = unM49.RecordList.Length.Should().BeGreaterThan(0); + } + + [Fact] + public async Task FromJson() + { + UnM49Data unM49 = await UnM49Data.FromJsonAsync( + GetDataFilePath(UnM49Data.DataFileName + ".json") + ); + _ = unM49.Should().NotBeNull(); + _ = unM49.RecordList.Length.Should().BeGreaterThan(0); + } + + [Fact] + public async Task Create_FromData_FromJson_RecordsMatch() + { + UnM49Data created = UnM49Data.Create(); + UnM49Data fromData = await UnM49Data.FromDataAsync(GetDataFilePath(UnM49Data.DataFileName)); + UnM49Data fromJson = await UnM49Data.FromJsonAsync( + GetDataFilePath(UnM49Data.DataFileName + ".json") + ); + + _ = created.RecordList.Length.Should().BeGreaterThan(0); + _ = 
fromData.RecordList.Length.Should().BeGreaterThan(0); + _ = fromJson.RecordList.Length.Should().BeGreaterThan(0); + + _ = fromData.RecordList.Should().BeEquivalentTo(created.RecordList); + _ = fromJson.RecordList.Should().BeEquivalentTo(created.RecordList); + } + + [Fact] + public async Task SaveJsonAsync_RoundTrip() + { + UnM49Data unM49 = UnM49Data.Create(); + _ = unM49.RecordList.Length.Should().BeGreaterThan(0); + + string tempFile = Path.Combine(Path.GetTempPath(), $"{Guid.NewGuid()}.json"); + try + { + await unM49.SaveJsonAsync(tempFile); + UnM49Data roundTrip = await UnM49Data.FromJsonAsync(tempFile); + _ = roundTrip.Should().NotBeNull(); + _ = roundTrip.RecordList.Length.Should().Be(unM49.RecordList.Length); + } + finally + { + if (File.Exists(tempFile)) + { + File.Delete(tempFile); + } + } + } + + [Theory] + [InlineData("419")] // Latin America and the Caribbean + [InlineData("013")] // Central America + [InlineData("001")] // World + public void Find_Pass(string code) + { + UnM49Data unM49 = UnM49Data.Create(); + UnM49Record? record = unM49.Find(code); + _ = record.Should().NotBeNull(); + _ = record.Code.Should().BeEquivalentTo(code); + } + + [Theory] + [InlineData("XX")] + [InlineData("999")] + public void Find_Fail(string code) + { + UnM49Data unM49 = UnM49Data.Create(); + UnM49Record? 
record = unM49.Find(code); + _ = record.Should().BeNull(); + } + + [Fact] + public void Find_Empty_ReturnsNull() + { + UnM49Data unM49 = UnM49Data.Create(); + _ = unM49.Find(string.Empty).Should().BeNull(); + } + + [Theory] + [InlineData("419", "MX", true)] // Mexico is in Latin America + [InlineData("013", "MX", true)] // Mexico is in Central America + [InlineData("005", "AR", true)] // Argentina is in South America + [InlineData("419", "013", true)] // Central America is within Latin America + [InlineData("001", "ZA", true)] // World contains every country + [InlineData("419", "ES", false)] // Spain is not in Latin America + [InlineData("419", "US", false)] // United States is not in Latin America + [InlineData("419", "419", false)] // A group does not contain itself + public void Contains(string groupCode, string code, bool contained) + { + UnM49Data unM49 = UnM49Data.Create(); + _ = unM49.Contains(groupCode, code).Should().Be(contained); + } + + [Fact] + public void GetAncestors_Country_ReturnsContainingGroups() + { + UnM49Data unM49 = UnM49Data.Create(); + IReadOnlyList ancestors = unM49.GetAncestors("MX"); + + // Mexico is nested under Central America, Latin America, the Americas, and the World + _ = ancestors.Should().Contain(["013", "419", "019", "001"]); + } + + [Fact] + public void GetAncestors_Unknown_ReturnsEmpty() + { + UnM49Data unM49 = UnM49Data.Create(); + _ = unM49.GetAncestors("ZZ").Should().BeEmpty(); + } +} diff --git a/README.md b/README.md index 4572b4a..9bcb7ea 100644 --- a/README.md +++ b/README.md @@ -22,17 +22,12 @@ C# .NET library for ISO 639-2, ISO 639-3, RFC 5646 / BCP 47 language tags. ### Release Notes -**Version: 1.2**: +**Version: 1.4**: **Summary**: -- Refactored the project to follow standard patterns used across other projects. -- Added logging support configured through `LogOptions.SetFactory(ILoggerFactory)`. - -> **⚠️ Breaking Changes**: -> -> - IO API's are async only, e.g. `LoadJson()` -> `async FromJsonAsync()`. 
-> - Collection instantiation follows the `From` pattern, e.g. `LoadData()` -> `FromDataAsync()`. +- Added UN M.49 region containment support sourced from Unicode CLDR. +- Added opt-in region containment matching, e.g. `es-419` matches `es-MX`, see [Tag Matching](#tag-matching). See [Release History](./HISTORY.md) for complete release notes and older versions. @@ -185,7 +180,7 @@ Tag matching can be used to select content based on preferred vs. available lang > **ℹ️ Examples**: > -> - HTTP [`Accept-Language`][acceptlanguage-link] and [`Content-Language`](https://www.rfc-editor.org/rfc/rfc9110.html#name-content-language). +> - HTTP [`Accept-Language`][acceptlanguage-link] and [`Content-Language`][contentlanguage-link]. > - Matroska media stream [`LanguageIETF Element`][matroskalanguage-link]. IETF language tags are in the form of: @@ -210,6 +205,26 @@ match = languageLookup.IsMatch("zha", "zh-Hans"); // false match = languageLookup.IsMatch("zh-Hant", "zh-Hans"); // false ``` +A [UN M.49][unm49-link] region in a tag is a numeric code for a group of countries, e.g. `419` is Latin America and the Caribbean which contains `MX` Mexico.\ +Set the optional `regionContainment` argument to `true` to match a region group prefix against any contained region.\ +Matching is directional, the broad group in the prefix matches the specific region in the tag, not the reverse. + +```csharp +LanguageLookup languageLookup = new(); +bool match = languageLookup.IsMatch("es-419", "es-MX", true); // true, Mexico is in Latin America +match = languageLookup.IsMatch("es-419", "es-ES", true); // false, Spain is not in Latin America +match = languageLookup.IsMatch("es-MX", "es-419", true); // false, not the reverse +match = languageLookup.IsMatch("es-419", "es-MX"); // false, containment is opt-in +``` + +Use `ExpandRegion()` to expand a tag region into the tag plus a variant for each containing UN M.49 group. 
+ +```csharp +LanguageLookup languageLookup = new(); +IEnumerable expanded = languageLookup.ExpandRegion("es-MX"); +// "es-MX", and its containing groups e.g. "es-013", "es-419", "es-019", "es-001" +``` + ### Tag Builder The `LanguageTagBuilder` class supports fluent builder style tag construction, and will return a constructed `LanguageTag` class through the final `Build()` or `Normalize()` methods. @@ -412,9 +427,10 @@ LogOptions.SetFactory(loggerFactory); - Converts the tag data into JSON files. - Generates C# records of the tags. - **[`LanguageData`](./LanguageData/) directory**: - - ISO 639-2: [Source](https://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt), [Data](./LanguageData/iso6392), [JSON](./LanguageData/iso6392.json), [Code](./LanguageTags/Iso6392DataGen.cs) - - ISO 639-3: [Source](https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3.tab), [Data](./LanguageData/iso6393), [JSON](./LanguageData/iso6393.json), [Code](./LanguageTags/Iso6393DataGen.cs) - - RFC 5646 : [Source](https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry), [Data](./LanguageData/rfc5646), [JSON](./LanguageData/rfc5646.json), [Code](./LanguageTags/Rfc5646DataGen.cs) + - ISO 639-2: [Source][iso6392source-link], [Data](./LanguageData/iso6392), [JSON](./LanguageData/iso6392.json), [Code](./LanguageTags/Iso6392DataGen.cs) + - ISO 639-3: [Source][iso6393source-link], [Data](./LanguageData/iso6393), [JSON](./LanguageData/iso6393.json), [Code](./LanguageTags/Iso6393DataGen.cs) + - RFC 5646 : [Source][rfc5646source-link], [Data](./LanguageData/rfc5646), [JSON](./LanguageData/rfc5646.json), [Code](./LanguageTags/Rfc5646DataGen.cs) + - UN M.49 : [Source][unm49source-link], [Data](./LanguageData/unm49), [JSON](./LanguageData/unm49.json), [Code](./LanguageTags/UnM49DataGen.cs) - A daily [GitHub Actions](./.github/workflows/run-periodic-codegen-pull-request.yml) job opens PRs to keep the data files up to date; a [weekly scheduled 
job](./.github/workflows/publish-release.yml) publishes new releases. Routine merges (Dependabot, codegen) only smoke-test — the actual build/publish is batched into the weekly run (two-phase model). ## Contributing @@ -532,6 +548,7 @@ Both rulesets require the `Check pull request workflow status` status check and - [RFC : 4647 : Matching of Language Tags][rfc4647-link] - [RFC : 5646 : Tags for Identifying Languages][rfc5646-link] - [Unicode Consortium : Unicode Common Locale Data Repository (CLDR) Project][cldr-link] +- [UN Statistics Division : Standard Country or Area Codes (M.49)][unm49-link] - [Library of Congress : ISO 639-2 Language Coding Agency][iso6392-link] - [SIL International : ISO 639-3 Language Coding Agency][iso6393-link] @@ -568,27 +585,23 @@ Both rulesets require the `Check pull request workflow status` status check and Licensed under the [MIT License][license-link]\ ![GitHub License][license-shield] - + -[github-link]: https://github.com/ptr727/LanguageTags [actions-link]: https://github.com/ptr727/LanguageTags/actions -[discussions-link]: https://github.com/ptr727/LanguageTags/discussions [commits-link]: https://github.com/ptr727/LanguageTags/commits/main +[discussions-link]: https://github.com/ptr727/LanguageTags/discussions +[github-link]: https://github.com/ptr727/LanguageTags [issues-link]: https://github.com/ptr727/LanguageTags/issues -[releases-link]: https://github.com/ptr727/LanguageTags/releases - -[license-link]: ./LICENSE -[license-shield]: https://img.shields.io/github/license/ptr727/LanguageTags?label=License - [lastbuild-shield]: https://byob.yarr.is/ptr727/LanguageTags/lastbuild [lastcommit-shield]: https://img.shields.io/github/last-commit/ptr727/LanguageTags?logo=github&label=Last%20Commit - -[releaseversion-shield]: https://img.shields.io/github/v/release/ptr727/LanguageTags?logo=github&label=GitHub%20Release -[prereleaseversion-shield]: 
https://img.shields.io/github/v/release/ptr727/LanguageTags?include_prereleases&filter=*-g*&label=GitHub%20Pre-Release&logo=github -[releasebuildstatus-shield]: https://img.shields.io/github/actions/workflow/status/ptr727/LanguageTags/publish-release.yml?logo=github&label=Releases%20Build&event=schedule - +[license-link]: ./LICENSE +[license-shield]: https://img.shields.io/github/license/ptr727/LanguageTags?label=License [nuget-link]: https://www.nuget.org/packages/ptr727.LanguageTags/ [nugetreleaseversion-shield]: https://img.shields.io/nuget/v/ptr727.LanguageTags?logo=nuget&label=NuGet%20Release +[prereleaseversion-shield]: https://img.shields.io/github/v/release/ptr727/LanguageTags?include_prereleases&filter=*-g*&label=GitHub%20Pre-Release&logo=github +[releasebuildstatus-shield]: https://img.shields.io/github/actions/workflow/status/ptr727/LanguageTags/publish-release.yml?logo=github&label=Releases%20Build&event=schedule +[releases-link]: https://github.com/ptr727/LanguageTags/releases +[releaseversion-shield]: https://img.shields.io/github/v/release/ptr727/LanguageTags?logo=github&label=GitHub%20Release @@ -605,39 +618,47 @@ Licensed under the [MIT License][license-link]\ [serilog-link]: https://serilog.net/ [xunit-link]: https://xunit.net/ + + +[iso6392source-link]: https://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt +[iso6393source-link]: https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3.tab +[rfc5646source-link]: https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry +[unm49source-link]: https://raw.githubusercontent.com/unicode-org/cldr/main/common/supplemental/supplementalData.xml + +[acceptlanguage-link]: https://www.rfc-editor.org/rfc/rfc9110.html#name-accept-language +[bcp47-link]: https://www.rfc-editor.org/info/bcp47 +[cldr-link]: https://cldr.unicode.org/ +[contentlanguage-link]: https://www.rfc-editor.org/rfc/rfc9110.html#name-content-language +[dansmithlanguagetagssharp-link]: 
https://github.com/DanSmith/languagetags-sharp +[ianatags-link]: https://www.iana.org/assignments/language-subtags-tags-extensions/language-subtags-tags-extensions.xhtml +[ietflanguagetag-link]: https://en.wikipedia.org/wiki/IETF_language_tag +[iso15924-link]: https://unicode.org/iso15924/iso15924-codes.html +[iso31661-link]: https://en.wikipedia.org/wiki/ISO_3166-1 +[iso6392-link]: https://www.loc.gov/standards/iso639-2/ +[iso6393-link]: https://iso639-3.sil.org/ +[jkporterbcp47-link]: https://github.com/jkporter/bcp47 +[matroskalanguage-link]: https://datatracker.ietf.org/doc/html/draft-ietf-cellar-matroska-07#name-language-codes +[mattcglanguagesubtagregistry-link]: https://github.com/mattcg/language-subtag-registry +[oxigraphoxilangtag-link]: https://github.com/oxigraph/oxilangtag +[pyfischrustlanguagetags-link]: https://github.com/pyfisch/rust-language-tags/ +[r12asubtags-link]: https://r12a.github.io/app-subtags/ +[rfc4647-link]: https://www.rfc-editor.org/info/rfc4647 +[rfc5646-link]: https://www.rfc-editor.org/info/rfc5646 [rfc5646section21-link]: https://www.rfc-editor.org/rfc/rfc5646#section-2.1 [rfc5646section221-link]: https://www.rfc-editor.org/rfc/rfc5646#section-2.2.1 [rfc5646section222-link]: https://www.rfc-editor.org/rfc/rfc5646#section-2.2.2 [rfc5646section223-link]: https://www.rfc-editor.org/rfc/rfc5646#section-2.2.3 -[iso15924-link]: https://unicode.org/iso15924/iso15924-codes.html [rfc5646section224-link]: https://www.rfc-editor.org/rfc/rfc5646#section-2.2.4 -[iso31661-link]: https://en.wikipedia.org/wiki/ISO_3166-1 -[unm49-link]: https://unstats.un.org/unsd/methodology/m49/ [rfc5646section225-link]: https://www.rfc-editor.org/rfc/rfc5646#section-2.2.5 [rfc5646section226-link]: https://www.rfc-editor.org/rfc/rfc5646#section-2.2.6 [rfc5646section227-link]: https://www.rfc-editor.org/rfc/rfc5646#section-2.2.7 [rfc5646section228-link]: https://www.rfc-editor.org/rfc/rfc5646#section-2.2.8 -[r12asubtags-link]: https://r12a.github.io/app-subtags/ 
-[wikipediacodes-link]: https://en.wikipedia.org/wiki/Codes_for_constructed_languages -[ietflanguagetag-link]: https://en.wikipedia.org/wiki/IETF_language_tag -[w3cchoosingtag-link]: https://www.w3.org/International/questions/qa-choosing-language-tags -[w3ctags-link]: https://www.w3.org/International/articles/language-tags/ -[ianatags-link]: https://www.iana.org/assignments/language-subtags-tags-extensions/language-subtags-tags-extensions.xhtml -[rfc4647-link]: https://www.rfc-editor.org/info/rfc4647 -[rfc5646-link]: https://www.rfc-editor.org/info/rfc5646 -[iso6392-link]: https://www.loc.gov/standards/iso639-2/ -[cldr-link]: https://cldr.unicode.org/ -[iso6393-link]: https://iso639-3.sil.org/ -[bcp47-link]: https://www.rfc-editor.org/info/bcp47 -[rspeerlangcodes-link]: https://github.com/rspeer/langcodes -[oxigraphoxilangtag-link]: https://github.com/oxigraph/oxilangtag -[pyfischrustlanguagetags-link]: https://github.com/pyfisch/rust-language-tags/ -[dansmithlanguagetagssharp-link]: https://github.com/DanSmith/languagetags-sharp -[jkporterbcp47-link]: https://github.com/jkporter/bcp47 -[mattcglanguagesubtagregistry-link]: https://github.com/mattcg/language-subtag-registry [rfc5646section229-link]: https://www.rfc-editor.org/rfc/rfc5646#section-2.2.9 -[acceptlanguage-link]: https://www.rfc-editor.org/rfc/rfc9110.html#name-accept-language -[matroskalanguage-link]: https://datatracker.ietf.org/doc/html/draft-ietf-cellar-matroska-07#name-language-codes [rfc5646section45-link]: https://www.rfc-editor.org/rfc/rfc5646#section-4.5 +[rspeerlangcodes-link]: https://github.com/rspeer/langcodes +[unm49-link]: https://unstats.un.org/unsd/methodology/m49/ +[w3cchoosingtag-link]: https://www.w3.org/International/questions/qa-choosing-language-tags +[w3ctags-link]: https://www.w3.org/International/articles/language-tags/ +[wikipediacodes-link]: https://en.wikipedia.org/wiki/Codes_for_constructed_languages diff --git a/version.json b/version.json index a086ff0..389818b 100644 --- 
a/version.json +++ b/version.json @@ -1,6 +1,6 @@ { "$schema": "https://raw.githubusercontent.com/dotnet/Nerdbank.GitVersioning/master/src/NerdBank.GitVersioning/version.schema.json", - "version": "1.3", + "version": "1.4", "publicReleaseRefSpec": [ "^refs/heads/main$" ], From e616b9d77a672566fc7fda85e52ed4cfec4136e2 Mon Sep 17 00:00:00 2001 From: Pieter Viljoen Date: Fri, 26 Jun 2026 09:26:53 -0700 Subject: [PATCH 2/3] Address Copilot review feedback - Region containment matching now substitutes the candidate region and reuses the plain matcher, preserving variant, extension, and private use semantics to avoid false positives, e.g. es-419-nedis no longer matches es-MX while es-419 still matches es-MX-nedis. - Clarify UnM49Data.Find returns the first of possibly multiple records for a code, and point to GetAncestors/Contains for full transitive containment. - Clarify UnM49Record.Code may be an alphabetic CLDR grouping code (EU, EZ, UN). - Add region containment tests for script preservation and prefix variants. Co-Authored-By: Claude Opus 4.8 (1M context) --- LanguageTags/LanguageLookup.cs | 38 ++++++++---------------- LanguageTags/UnM49Data.cs | 8 ++++- LanguageTagsTests/LanguageLookupTests.cs | 3 ++ 3 files changed, 23 insertions(+), 26 deletions(-) diff --git a/LanguageTags/LanguageLookup.cs b/LanguageTags/LanguageLookup.cs index 289af5f..56a79ca 100644 --- a/LanguageTags/LanguageLookup.cs +++ b/LanguageTags/LanguageLookup.cs @@ -297,39 +297,27 @@ public bool IsMatch(string prefix, string languageTag, bool regionContainment) private bool IsRegionContainmentMatch(string prefix, string languageTag) { - // Both tags must parse - LanguageTag? prefixTag = LanguageTag.Parse(prefix); + // The candidate must parse and have a region to expand LanguageTag? 
candidateTag = LanguageTag.Parse(languageTag); - if (prefixTag == null || candidateTag == null) + if (candidateTag == null || string.IsNullOrEmpty(candidateTag.Region)) { return false; } - // The prefix region must be a UN M.49 group (3 digits) and the candidate must have a region - if ( - prefixTag.Region.Length != 3 - || !prefixTag.Region.All(char.IsAsciiDigit) - || string.IsNullOrEmpty(candidateTag.Region) - ) + // Substitute the candidate region with each containing UN M.49 group and retry a plain match + // E.g. es-MX -> es-419, then prefix es-419 matches via the existing prefix rules + // Reusing the plain match keeps the variant, extension, and private use semantics intact + foreach (string ancestor in _unM49.GetAncestors(candidateTag.Region)) { - return false; - } - - // The language portion must be the same, only the region differs - if ( - !prefixTag.Language.Equals(candidateTag.Language, StringComparison.OrdinalIgnoreCase) - || !prefixTag.ExtendedLanguage.Equals( - candidateTag.ExtendedLanguage, - StringComparison.OrdinalIgnoreCase - ) - || !prefixTag.Script.Equals(candidateTag.Script, StringComparison.OrdinalIgnoreCase) - ) - { - return false; + LanguageTag candidateGroup = new(candidateTag) { Region = ancestor }; + if (IsMatch(prefix, candidateGroup.ToString(), false)) + { + return true; + } } - // The candidate region must be contained within the prefix region group - return _unM49.Contains(prefixTag.Region, candidateTag.Region); + // No containing group matched + return false; } /// diff --git a/LanguageTags/UnM49Data.cs b/LanguageTags/UnM49Data.cs index 552b177..25ee08a 100644 --- a/LanguageTags/UnM49Data.cs +++ b/LanguageTags/UnM49Data.cs @@ -295,6 +295,11 @@ ConfiguredTaskAwaitable WriteLineAsync(string value) => /// /// Finds a UN M.49 containment record by region code. /// + /// + /// A code can appear in more than one source record (CLDR splits canonical and grouping overlays, + /// e.g. 
there are two "001" records), and this returns the first match only. Use <see cref="GetAncestors"/> + /// or <see cref="Contains"/> for the complete transitive containment across all records. + /// + /// The region code to search for (e.g. "419" or "013"). + /// The first matching <see cref="UnM49Record"/>, or null when no match is found. public UnM49Record? Find(string code) @@ -406,7 +411,8 @@ private Dictionary> BuildAncestorIndex() public sealed record UnM49Record { /// - /// Gets the region code (UN M.49 numeric code, e.g. "419" or "013"). + /// Gets the region code, usually a UN M.49 numeric code (e.g. "419" or "013"), but the CLDR source + /// also includes alphabetic grouping codes (e.g. "EU", "EZ", "UN"). /// public string Code { get; init; } = string.Empty; diff --git a/LanguageTagsTests/LanguageLookupTests.cs b/LanguageTagsTests/LanguageLookupTests.cs index 42f30f4..2a311a9 100644 --- a/LanguageTagsTests/LanguageLookupTests.cs +++ b/LanguageTagsTests/LanguageLookupTests.cs @@ -73,6 +73,9 @@ public void IsMatch(string prefix, string tag, bool match) [InlineData("es-419", "fr-MX", false)] // Language must match [InlineData("es-001", "es-MX", true)] // World contains every region [InlineData("en", "en-US", true)] // Plain matching still works + [InlineData("es-Latn-419", "es-Latn-MX", true)] // Script is preserved + [InlineData("es-419", "es-MX-nedis", true)] // Broad group matches a more specific variant + [InlineData("es-419-nedis", "es-MX", false)] // Prefix variant must still match, no false positive public void IsMatch_RegionContainment(string prefix, string tag, bool match) { LanguageLookup languageLookup = new(); From 998c01fe316086c93b9294276bc0e4e66bc122b9 Mon Sep 17 00:00:00 2001 From: Pieter Viljoen Date: Fri, 26 Jun 2026 09:44:20 -0700 Subject: [PATCH 3/3] Reword ExpandRegion docs to avoid BCP 47 variant ambiguity "variant" has a specific RFC 5646 meaning, so describe the expanded entries as region substituted tags instead, in the XML doc and README.
Co-Authored-By: Claude Opus 4.8 (1M context) --- LanguageTags/LanguageLookup.cs | 4 ++-- README.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/LanguageTags/LanguageLookup.cs b/LanguageTags/LanguageLookup.cs index 56a79ca..3e66700 100644 --- a/LanguageTags/LanguageLookup.cs +++ b/LanguageTags/LanguageLookup.cs @@ -321,7 +321,7 @@ private bool IsRegionContainmentMatch(string prefix, string languageTag) } /// - /// Expands the region of a language tag into the tag plus a variant for each containing UN M.49 group. + /// Expands the region of a language tag into the tag plus a region substituted tag for each containing UN M.49 group. /// /// /// For example "es-MX" expands to "es-MX", "es-013", "es-419", "es-019", and "es-001". A tag with @@ -329,7 +329,7 @@ private bool IsRegionContainmentMatch(string prefix, string languageTag) /// matched with plain string comparison without enabling region containment in <see cref="IsMatch(string, string, bool)"/>. /// /// The language tag to expand. - /// The original tag followed by a region substituted variant for each containing group. + /// The original tag followed by a tag with the region replaced by each containing group. /// Thrown when <paramref name="languageTag"/> is null. public IEnumerable ExpandRegion(string languageTag) { diff --git a/README.md b/README.md index 9bcb7ea..b613338 100644 --- a/README.md +++ b/README.md @@ -217,7 +217,7 @@ match = languageLookup.IsMatch("es-MX", "es-419", true); // false, not the rever match = languageLookup.IsMatch("es-419", "es-MX"); // false, containment is opt-in ``` -Use `ExpandRegion()` to expand a tag region into the tag plus a variant for each containing UN M.49 group. +Use `ExpandRegion()` to expand a tag region into the tag plus a region substituted tag for each containing UN M.49 group. ```csharp LanguageLookup languageLookup = new();