spaCy/spacy/lang/nl/__init__.py
Julien Rossi e117573822
Adding noun_chunks to the DUTCH language model (nl) (#8529)
*  implement noun_chunks for dutch language

* copy/paste FR and SV syntax iterators to accomodate UD tags
* added tests with dutch text
* signed contributor agreement

* 🐛 fix noun chunks generator

* built from scratch
* define noun chunk as a single Noun-Phrase
* includes some corner cases debugging (incorrect POS tagging)
* test with provided annotated sample (POS, DEP)

*  fix failing test

* CI pipeline did not like the added sample file
* add the sample as a pytest fixture

* Update spacy/lang/nl/syntax_iterators.py

* Update spacy/lang/nl/syntax_iterators.py

Code readability

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/tests/lang/nl/test_noun_chunks.py

correct comment

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* finalize code

* change "if next_word" into "if next_word is not None"

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2021-07-14 14:01:02 +02:00

43 lines
1.1 KiB
Python

from typing import Optional
from thinc.api import Model
from .lemmatizer import DutchLemmatizer
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ...language import Language
class DutchDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
lex_attr_getters = LEX_ATTRS
syntax_iterators = SYNTAX_ITERATORS
stop_words = STOP_WORDS
class Dutch(Language):
lang = "nl"
Defaults = DutchDefaults
@Dutch.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
):
return DutchLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
__all__ = ["Dutch"]