Mirror of https://github.com/explosion/spaCy.git, synced 2025-10-31 16:07:41 +03:00
Update Catalan language data (#8308)

* Update Catalan language data

  Update Catalan language data based on contributions from the Text Mining Unit at the Barcelona Supercomputing Center: https://github.com/TeMU-BSC/spacy4release/tree/main/lang_data

* Update tokenizer settings for UD Catalan AnCora

  Update for UD Catalan AnCora v2.7 with merged multi-word tokens.

* Update test

* Move prefix pattern to more generic infix pattern

* Clean up
This commit is contained in:

    parent d9be9e6cf9
    commit b98d216205
@@ -65,7 +65,7 @@ console_scripts =

[options.extras_require]
lookups =
-    spacy_lookups_data>=1.0.0,<1.1.0
+    spacy_lookups_data>=1.0.1,<1.1.0
transformers =
    spacy_transformers>=1.0.1,<1.1.0
ray =
@@ -1,15 +1,23 @@
+from typing import Optional

+from thinc.api import Model

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
+from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language
+from .lemmatizer import CatalanLemmatizer


class CatalanDefaults(Language.Defaults):
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    infixes = TOKENIZER_INFIXES
+    suffixes = TOKENIZER_SUFFIXES
    stop_words = STOP_WORDS
    lex_attr_getters = LEX_ATTRS
+    syntax_iterators = SYNTAX_ITERATORS


class Catalan(Language):

@@ -17,4 +25,16 @@ class Catalan(Language):
    Defaults = CatalanDefaults


+@Catalan.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+):
+    return CatalanLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)


__all__ = ["Catalan"]
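
The factory above registers the new rule-based lemmatizer for Catalan pipelines. A minimal usage sketch (not part of the commit), assuming spacy-lookups-data>=1.0.1 from the setup.cfg change is installed so the Catalan lemma tables can be loaded:

import spacy

# Sketch only: enable the Catalan rule lemmatizer on a blank pipeline.
# Assumes spacy-lookups-data>=1.0.1 is installed (see the setup.cfg bump above).
nlp = spacy.blank("ca")
nlp.add_pipe("lemmatizer", config={"mode": "rule"})
nlp.initialize()  # loads the lemma_* lookup tables required by rule mode
# Without a tagger/morphologizer, token.pos_ is empty and rule mode falls back
# on the lowercased token text; a trained pipeline is needed for real lemmas.
doc = nlp("les biblioteques")
print([token.lemma_ for token in doc])
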
spacy/lang/ca/lemmatizer.py (new file, 81 lines)
@@ -0,0 +1,81 @@
from typing import List, Tuple

from ...pipeline import Lemmatizer
from ...tokens import Token


class CatalanLemmatizer(Lemmatizer):
    """
    Copied from French Lemmatizer
    Catalan language lemmatizer applies the default rule based lemmatization
    procedure with some modifications for better Catalan language support.

    The parts of speech 'ADV', 'PRON', 'DET', 'ADP' and 'AUX' are added to use
    the rule-based lemmatization. As a last resort, the lemmatizer checks in
    the lookup table.
    """

    @classmethod
    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
        if mode == "rule":
            required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
            return (required, [])
        else:
            return super().get_lookups_config(mode)

    def rule_lemmatize(self, token: Token) -> List[str]:
        cache_key = (token.orth, token.pos)
        if cache_key in self.cache:
            return self.cache[cache_key]
        string = token.text
        univ_pos = token.pos_.lower()
        if univ_pos in ("", "eol", "space"):
            return [string.lower()]
        elif "lemma_rules" not in self.lookups or univ_pos not in (
            "noun",
            "verb",
            "adj",
            "adp",
            "adv",
            "aux",
            "cconj",
            "det",
            "pron",
            "punct",
            "sconj",
        ):
            return self.lookup_lemmatize(token)
        index_table = self.lookups.get_table("lemma_index", {})
        exc_table = self.lookups.get_table("lemma_exc", {})
        rules_table = self.lookups.get_table("lemma_rules", {})
        lookup_table = self.lookups.get_table("lemma_lookup", {})
        index = index_table.get(univ_pos, {})
        exceptions = exc_table.get(univ_pos, {})
        rules = rules_table.get(univ_pos, [])
        string = string.lower()
        forms = []
        if string in index:
            forms.append(string)
            self.cache[cache_key] = forms
            return forms
        forms.extend(exceptions.get(string, []))
        oov_forms = []
        if not forms:
            for old, new in rules:
                if string.endswith(old):
                    form = string[: len(string) - len(old)] + new
                    if not form:
                        pass
                    elif form in index or not form.isalpha():
                        forms.append(form)
                    else:
                        oov_forms.append(form)
        if not forms:
            forms.extend(oov_forms)
        if not forms and string in lookup_table.keys():
            forms.append(self.lookup_lemmatize(token)[0])
        if not forms:
            forms.append(string)
        forms = list(set(forms))
        self.cache[cache_key] = forms
        return forms
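
The suffix rules used by rule_lemmatize above are (old, new) ending rewrites whose candidates are filtered against the lemma index. A standalone sketch of that step with hypothetical rules and index; the real lemma_rules and lemma_index tables ship with spacy-lookups-data:

# Sketch only: hypothetical Catalan noun rules and index, not the shipped tables.
rules = [("ques", "ca"), ("es", "a")]
index = {"biblioteca"}
string = "biblioteques"

candidates = []
for old, new in rules:
    if string.endswith(old):
        candidates.append(string[: len(string) - len(old)] + new)

# rule_lemmatize keeps a candidate that appears in the index (or is non-alphabetic);
# the rest are held back as out-of-vocabulary forms.
forms = [form for form in candidates if form in index]
print(candidates)  # ['biblioteca', 'bibliotequa']
print(forms)       # ['biblioteca']
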
@@ -1,12 +1,46 @@
-from ..punctuation import TOKENIZER_INFIXES
-from ..char_classes import ALPHA
+from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
+from ..char_classes import CURRENCY
+from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
+from ..char_classes import merge_chars, _units


ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")


-_infixes = TOKENIZER_INFIXES + [
-    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
-]
+_infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+        ),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}][{el}])(?=[{a}0-9])".format(a=ALPHA, el=ELISION),
+    ]
+)

+_units = _units.replace("% ", "")
+UNITS = merge_chars(_units)

+_suffixes = (
+    LIST_PUNCT
+    + LIST_ELLIPSES
+    + LIST_QUOTES
+    + LIST_ICONS
+    + [r"-", "—", "–"]
+    + [
+        r"(?<=[0-9])\+",
+        r"(?<=°[FfCcKk])\.",
+        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
+        r"(?<=[0-9])(?:{u})".format(u=UNITS),
+        r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
+            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
+        ),
+        r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
+    ]
+)

TOKENIZER_INFIXES = _infixes
+TOKENIZER_SUFFIXES = _suffixes
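
The elision infix above places a token boundary right after a letter followed by an apostrophe, which is how contractions such as "l'home" get split. A small regex sketch of that behaviour; ALPHA below is a simplified stand-in for spaCy's full ALPHA character class:

import re

# Sketch only: simplified character classes, not spaCy's real definitions.
ALPHA = "A-Za-zÀ-ÿ"
ELISION = "'’"
infix = rf"(?<=[{ALPHA}][{ELISION}])(?=[{ALPHA}0-9])"

# The pattern matches a zero-width position after letter + apostrophe,
# so splitting on it cuts "l'home" into "l'" and "home".
print(re.split(infix, "l'home"))  # ["l'", 'home']
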
spacy/lang/ca/syntax_iterators.py (new file, 46 lines)
@@ -0,0 +1,46 @@
from ...symbols import NOUN, PROPN
from ...errors import Errors


def noun_chunks(doclike):
    """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
    # fmt: off
    labels = ["nsubj", "nsubj:pass", "obj", "obl", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
    # fmt: on
    doc = doclike.doc  # Ensure works on both Doc and Span.
    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    np_deps = [doc.vocab.strings[label] for label in labels]
    np_label = doc.vocab.strings.add("NP")
    prev_end = -1
    for i, word in enumerate(doclike):
        if word.pos not in (NOUN, PROPN):
            continue
        # Prevent nested chunks from being produced
        if word.left_edge.i <= prev_end:
            continue
        if word.dep in np_deps:
            left = word.left_edge.i
            right = word.right_edge.i + 1
            # leave prepositions and punctuation out of the left side of the chunk
            if word.left_edge.pos_ == "ADP" or word.left_edge.pos_ == "PUNCT":
                left = word.left_edge.i + 1
            prev_end = word.right_edge.i
            # leave subordinated clauses and appositions out of the chunk
            a = word.i + 1
            while a < word.right_edge.i:
                paraula = doc[a]
                if paraula.pos_ == "VERB":
                    right = paraula.left_edge.i
                    prev_end = paraula.left_edge.i - 1
                elif paraula.dep_ == "appos":
                    right = paraula.left_edge.i + 1
                    prev_end = paraula.left_edge.i - 1
                a += 1
            # leave punctuation out of the right side of the chunk
            if word.right_edge.pos_ == "PUNCT":
                right = right - 1
            yield left, right, np_label


SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
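
noun_chunks above is registered as Catalan's syntax iterator, so Doc.noun_chunks works once a dependency parse is present. A hedged usage sketch, assuming a trained Catalan pipeline such as ca_core_news_sm is installed; without a parser the has_annotation("DEP") check above raises E029:

import spacy

# Sketch only: assumes a trained Catalan pipeline with a parser is installed.
nlp = spacy.load("ca_core_news_sm")
doc = nlp("El fotògraf i el periodista seuen en una terrassa buida.")
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.dep_)
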
@@ -24,6 +24,13 @@ for exc_data in [
    {ORTH: "núm", NORM: "número"},
    {ORTH: "St.", NORM: "sant"},
    {ORTH: "Sta.", NORM: "santa"},
+    {ORTH: "'l"},
+    {ORTH: "'ls"},
+    {ORTH: "'m"},
+    {ORTH: "'n"},
+    {ORTH: "'ns"},
+    {ORTH: "'s"},
+    {ORTH: "'t"},
]:
    _exc[exc_data[ORTH]] = [exc_data]

@@ -12,13 +12,13 @@ def test_ca_tokenizer_handles_long_text(ca_tokenizer):
    una gerra de cervesa. Ens asseiem -fotògraf i periodista- en una terrassa buida."""

    tokens = ca_tokenizer(text)
-    assert len(tokens) == 138
+    assert len(tokens) == 140


@pytest.mark.parametrize(
    "text,length",
    [
-        ("Perquè va anar-hi?", 6),
+        ("Perquè va anar-hi?", 4),
        ("“Ah no?”", 5),
        ("""Sí! "Anem", va contestar el Joan Carles""", 11),
        ("Van córrer aprox. 10km", 5),
@@ -8,7 +8,7 @@ from spacy.util import get_lang_class
# Only include languages with no external dependencies
# excluded: ru, uk
# excluded for custom tables: es, pl
-LANGUAGES = ["bn", "el", "en", "fa", "fr", "nb", "nl", "sv"]
+LANGUAGES = ["bn", "ca", "el", "en", "fa", "fr", "nb", "nl", "sv"]
# fmt: on
