Merge branch 'explosion:master' into la-refactor

This commit is contained in:
Patrick J. Burns 2023-04-20 09:35:14 -04:00 committed by GitHub
commit cc238c83e8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 19 additions and 10 deletions

View File

@ -337,7 +337,7 @@ def debug_data(
show=verbose,
)
else:
msg.good("Examples without ocurrences available for all labels")
msg.good("Examples without occurrences available for all labels")
if "ner" in factory_names:
# Get all unique NER labels present in the data

View File

@ -33,6 +33,8 @@ def test_token_morph_key(i_has):
def test_morph_props(i_has):
assert i_has[0].morph.get("PronType") == ["prs"]
assert i_has[1].morph.get("PronType") == []
assert i_has[1].morph.get("AsdfType", ["asdf"]) == ["asdf"]
assert i_has[1].morph.get("AsdfType", default=["asdf", "qwer"]) == ["asdf", "qwer"]
def test_morph_iter(i_has):

View File

@ -834,10 +834,12 @@ cdef class Tokenizer:
self.token_match = re.compile(data["token_match"]).match
if "url_match" in data and isinstance(data["url_match"], str):
self.url_match = re.compile(data["url_match"]).match
if "rules" in data and isinstance(data["rules"], dict):
self.rules = data["rules"]
if "faster_heuristics" in data:
self.faster_heuristics = data["faster_heuristics"]
# always load rules last so that all other settings are set before the
# internal tokenization for the phrase matcher
if "rules" in data and isinstance(data["rules"], dict):
self.rules = data["rules"]
return self

View File

@ -1,4 +1,4 @@
from typing import Any, Dict, Iterator, List, Union
from typing import Any, Dict, Iterator, List, Optional, Union
from ..vocab import Vocab
class MorphAnalysis:
@ -13,7 +13,7 @@ class MorphAnalysis:
def __hash__(self) -> int: ...
def __eq__(self, other: MorphAnalysis) -> bool: ... # type: ignore[override]
def __ne__(self, other: MorphAnalysis) -> bool: ... # type: ignore[override]
def get(self, field: Any) -> List[str]: ...
def get(self, field: Any, default: Optional[List[str]]) -> List[str]: ...
def to_json(self) -> str: ...
def to_dict(self) -> Dict[str, str]: ...
def __str__(self) -> str: ...

View File

@ -58,10 +58,14 @@ cdef class MorphAnalysis:
def __ne__(self, other):
return self.key != other.key
def get(self, field):
def get(self, field, default=None):
"""Retrieve feature values by field."""
cdef attr_t field_id = self.vocab.strings.as_int(field)
cdef np.ndarray results = get_by_field(&self.c, field_id)
if len(results) == 0:
if default is None:
default = []
return default
features = [self.vocab.strings[result] for result in results]
return [f.split(Morphology.FIELD_SEP)[1] for f in features]

View File

@ -213,10 +213,11 @@ Retrieve values for a feature by field.
> assert morph.get("Feat1") == ["Val1", "Val2"]
> ```
| Name | Description |
| ----------- | ------------------------------------------------ |
| `field` | The field to retrieve. ~~str~~ |
| **RETURNS** | A list of the individual features. ~~List[str]~~ |
| Name | Description |
| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------ |
| `field` | The field to retrieve. ~~str~~ |
| `default` <Tag variant="new">3.6</Tag> | The value to return if the field is not present. If unset or `None`, the default return value is `[]`. ~~Optional[List[str]]~~ |
| **RETURNS** | A list of the individual features. ~~List[str]~~ |
### MorphAnalysis.to_dict {id="morphanalysis-to_dict",tag="method"}