diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 97b4db285..2826cd084 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -337,7 +337,7 @@ def debug_data( show=verbose, ) else: - msg.good("Examples without ocurrences available for all labels") + msg.good("Examples without occurrences available for all labels") if "ner" in factory_names: # Get all unique NER labels present in the data diff --git a/spacy/tests/doc/test_morphanalysis.py b/spacy/tests/doc/test_morphanalysis.py index 918d4acdc..49e32b936 100644 --- a/spacy/tests/doc/test_morphanalysis.py +++ b/spacy/tests/doc/test_morphanalysis.py @@ -33,6 +33,8 @@ def test_token_morph_key(i_has): def test_morph_props(i_has): assert i_has[0].morph.get("PronType") == ["prs"] assert i_has[1].morph.get("PronType") == [] + assert i_has[1].morph.get("AsdfType", ["asdf"]) == ["asdf"] + assert i_has[1].morph.get("AsdfType", default=["asdf", "qwer"]) == ["asdf", "qwer"] def test_morph_iter(i_has): diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 0e75b5f7a..a4a68ae8e 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -834,10 +834,12 @@ cdef class Tokenizer: self.token_match = re.compile(data["token_match"]).match if "url_match" in data and isinstance(data["url_match"], str): self.url_match = re.compile(data["url_match"]).match - if "rules" in data and isinstance(data["rules"], dict): - self.rules = data["rules"] if "faster_heuristics" in data: self.faster_heuristics = data["faster_heuristics"] + # always load rules last so that all other settings are set before the + # internal tokenization for the phrase matcher + if "rules" in data and isinstance(data["rules"], dict): + self.rules = data["rules"] return self diff --git a/spacy/tokens/morphanalysis.pyi b/spacy/tokens/morphanalysis.pyi index b86203cc4..a5376e80d 100644 --- a/spacy/tokens/morphanalysis.pyi +++ b/spacy/tokens/morphanalysis.pyi @@ -1,4 +1,4 @@ -from typing import Any, Dict, Iterator, List, Union +from typing import Any, Dict, Iterator, List, Optional, Union from ..vocab import Vocab class MorphAnalysis: @@ -13,7 +13,7 @@ class MorphAnalysis: def __hash__(self) -> int: ... def __eq__(self, other: MorphAnalysis) -> bool: ... # type: ignore[override] def __ne__(self, other: MorphAnalysis) -> bool: ... # type: ignore[override] - def get(self, field: Any) -> List[str]: ... + def get(self, field: Any, default: Optional[List[str]]) -> List[str]: ... def to_json(self) -> str: ... def to_dict(self) -> Dict[str, str]: ... def __str__(self) -> str: ... diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx index a7d1f2e44..baa3800a1 100644 --- a/spacy/tokens/morphanalysis.pyx +++ b/spacy/tokens/morphanalysis.pyx @@ -58,10 +58,14 @@ cdef class MorphAnalysis: def __ne__(self, other): return self.key != other.key - def get(self, field): + def get(self, field, default=None): """Retrieve feature values by field.""" cdef attr_t field_id = self.vocab.strings.as_int(field) cdef np.ndarray results = get_by_field(&self.c, field_id) + if len(results) == 0: + if default is None: + default = [] + return default features = [self.vocab.strings[result] for result in results] return [f.split(Morphology.FIELD_SEP)[1] for f in features] diff --git a/website/docs/api/morphology.mdx b/website/docs/api/morphology.mdx index 68d80b814..5d4affafe 100644 --- a/website/docs/api/morphology.mdx +++ b/website/docs/api/morphology.mdx @@ -213,10 +213,11 @@ Retrieve values for a feature by field. > assert morph.get("Feat1") == ["Val1", "Val2"] > ``` -| Name | Description | -| ----------- | ------------------------------------------------ | -| `field` | The field to retrieve. ~~str~~ | -| **RETURNS** | A list of the individual features. ~~List[str]~~ | +| Name | Description | +| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------ | +| `field` | The field to retrieve. ~~str~~ | +| `default` 3.6 | The value to return if the field is not present. If unset or `None`, the default return value is `[]`. ~~Optional[List[str]]~~ | +| **RETURNS** | A list of the individual features. ~~List[str]~~ | ### MorphAnalysis.to_dict {id="morphanalysis-to_dict",tag="method"}