From eb544903ecbda18aadafd4c4f538d7f6a6f410b6 Mon Sep 17 00:00:00 2001 From: "Patrick J. Burns" Date: Thu, 20 Apr 2023 09:34:48 -0400 Subject: [PATCH] Minor updates based on review --- spacy/lang/la/lex_attrs.py | 2 ++ spacy/lang/la/syntax_iterators.py | 4 +++- spacy/lang/la/tokenizer_exceptions.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/spacy/lang/la/lex_attrs.py b/spacy/lang/la/lex_attrs.py index 1a3d67e9e..9db1218a4 100644 --- a/spacy/lang/la/lex_attrs.py +++ b/spacy/lang/la/lex_attrs.py @@ -10,10 +10,12 @@ _num_words = """unus una unum duo duae tres tria quattuor quinque sex septem oct """.split() _num_words += [item.replace("v", "u") for item in _num_words] +_num_words = set(_num_words) _ordinal_words = """primus prima primum secundus secunda secundum tertius tertia tertium quartus quarta quartum quintus quinta quintum sextus sexta sextum septimus septima septimum octavus octava octavum nonus nona nonum decimus decima decimum undecimus undecima undecimum duodecimus duodecima duodecimum duodevicesimus duodevicesima duodevicesimum undevicesimus undevicesima undevicesimum vicesimus vicesima vicesimum tricesimus tricesima tricesimum quadragesimus quadragesima quadragesimum quinquagesimus quinquagesima quinquagesimum sexagesimus sexagesima sexagesimum septuagesimus septuagesima septuagesimum octogesimus octogesima octogesimum nonagesimus nonagesima nonagesimum centesimus centesima centesimum ducentesimus ducentesima ducentesimum trecentesimus trecentesima trecentesimum quadringentesimus quadringentesima quadringentesimum quingentesimus quingentesima quingentesimum sescentesimus sescentesima sescentesimum septingentesimus septingentesima septingentesimum octingentesimus octingentesima octingentesimum nongentesimus nongentesima nongentesimum millesimus millesima millesimum""".split() _ordinal_words += [item.replace("v", "u") for item in _ordinal_words] +_ordinal_words = set(_ordinal_words) def like_num(text): diff --git a/spacy/lang/la/syntax_iterators.py b/spacy/lang/la/syntax_iterators.py index b5ea8b276..111248cda 100644 --- a/spacy/lang/la/syntax_iterators.py +++ b/spacy/lang/la/syntax_iterators.py @@ -4,11 +4,13 @@ from ...symbols import NOUN, PROPN, PRON, VERB, AUX from ...errors import Errors # NB: Modified from da on suggestion from https://github.com/explosion/spaCy/issues/7457#issuecomment-800349751 [PJB] + + def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: def is_verb_token(tok): return tok.pos in [VERB, AUX] - def get_left_bound(doc, root): + def get_left_bound(root): left_bound = root for tok in reversed(list(root.lefts)): if tok.dep in np_left_deps: diff --git a/spacy/lang/la/tokenizer_exceptions.py b/spacy/lang/la/tokenizer_exceptions.py index e9934fcef..6d14b92c5 100644 --- a/spacy/lang/la/tokenizer_exceptions.py +++ b/spacy/lang/la/tokenizer_exceptions.py @@ -20,7 +20,7 @@ _abbrev_exc += [item.replace("v", "u").replace("V", "U") for item in _abbrev_exc _abbrev_exc += ["d.N."] -for orth in _abbrev_exc: +for orth in set(_abbrev_exc): _exc[orth] = [{ORTH: orth}] TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)