Add lexeme norm defaults

This commit is contained in:
Ines Montani 2020-09-30 10:20:14 +02:00
parent 6467a560e3
commit 34f9c26c62
16 changed files with 140 additions and 21 deletions

View File

@@ -3,9 +3,21 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class DanishDefaults(Language.Defaults): class DanishDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES

View File

@@ -3,9 +3,21 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language from ...language import Language
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class GermanDefaults(Language.Defaults): class GermanDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES

View File

@@ -9,9 +9,21 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX
from .lemmatizer import GreekLemmatizer from .lemmatizer import GreekLemmatizer
from ...lookups import Lookups from ...lookups import Lookups
from ...language import Language from ...language import Language
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class GreekDefaults(Language.Defaults): class GreekDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES

View File

@@ -1,5 +1,4 @@
from typing import Optional from typing import Optional
from thinc.api import Model from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -10,9 +9,21 @@ from .punctuation import TOKENIZER_INFIXES
from .lemmatizer import EnglishLemmatizer from .lemmatizer import EnglishLemmatizer
from ...language import Language from ...language import Language
from ...lookups import Lookups from ...lookups import Lookups
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class EnglishDefaults(Language.Defaults): class EnglishDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
lex_attr_getters = LEX_ATTRS lex_attr_getters = LEX_ATTRS

View File

@@ -4,9 +4,21 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language from ...language import Language
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class IndonesianDefaults(Language.Defaults): class IndonesianDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES

View File

@@ -2,7 +2,6 @@ from typing import Optional, Union, Dict, Any
from pathlib import Path from pathlib import Path
import srsly import srsly
from collections import namedtuple from collections import namedtuple
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
@@ -16,7 +15,7 @@ from ...scorer import Scorer
from ...symbols import POS from ...symbols import POS
from ...tokens import Doc from ...tokens import Doc
from ...training import validate_examples from ...training import validate_examples
from ...util import DummyTokenizer, registry from ...util import DummyTokenizer, registry, load_config_from_str
from ... import util from ... import util
@@ -166,7 +165,7 @@ class JapaneseTokenizer(DummyTokenizer):
class JapaneseDefaults(Language.Defaults): class JapaneseDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG) config = load_config_from_str(DEFAULT_CONFIG)
stop_words = STOP_WORDS stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS syntax_iterators = SYNTAX_ITERATORS
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}

View File

@@ -1,5 +1,4 @@
from typing import Optional, Any, Dict from typing import Optional, Any, Dict
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .tag_map import TAG_MAP from .tag_map import TAG_MAP
@@ -10,7 +9,7 @@ from ...compat import copy_reg
from ...scorer import Scorer from ...scorer import Scorer
from ...symbols import POS from ...symbols import POS
from ...training import validate_examples from ...training import validate_examples
from ...util import DummyTokenizer, registry from ...util import DummyTokenizer, registry, load_config_from_str
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@@ -70,7 +69,7 @@ class KoreanTokenizer(DummyTokenizer):
class KoreanDefaults(Language.Defaults): class KoreanDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG) config = load_config_from_str(DEFAULT_CONFIG)
lex_attr_getters = LEX_ATTRS lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS stop_words = STOP_WORDS
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}

View File

@@ -3,9 +3,21 @@ from .punctuation import TOKENIZER_INFIXES
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class LuxembourgishDefaults(Language.Defaults): class LuxembourgishDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
lex_attr_getters = LEX_ATTRS lex_attr_getters = LEX_ATTRS

View File

@@ -3,9 +3,21 @@ from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
from ...language import Language from ...language import Language
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class PortugueseDefaults(Language.Defaults): class PortugueseDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES

View File

@@ -1,5 +1,4 @@
from typing import Optional from typing import Optional
from thinc.api import Model from thinc.api import Model
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
@@ -8,9 +7,21 @@ from .lex_attrs import LEX_ATTRS
from .lemmatizer import RussianLemmatizer from .lemmatizer import RussianLemmatizer
from ...language import Language from ...language import Language
from ...lookups import Lookups from ...lookups import Lookups
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class RussianDefaults(Language.Defaults): class RussianDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS stop_words = STOP_WORDS

View File

@@ -2,9 +2,21 @@ from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class SerbianDefaults(Language.Defaults): class SerbianDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS stop_words = STOP_WORDS

View File

@@ -1,9 +1,21 @@
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...util import load_config_from_str
DEFAULT_CONFIG = """
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
"""
class TamilDefaults(Language.Defaults): class TamilDefaults(Language.Defaults):
config = load_config_from_str(DEFAULT_CONFIG)
lex_attr_getters = LEX_ATTRS lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS stop_words = STOP_WORDS

View File

@@ -1,10 +1,8 @@
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...tokens import Doc from ...tokens import Doc
from ...util import DummyTokenizer, registry from ...util import DummyTokenizer, registry, load_config_from_str
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@ -12,6 +10,13 @@ DEFAULT_CONFIG = """
[nlp.tokenizer] [nlp.tokenizer]
@tokenizers = "spacy.th.ThaiTokenizer" @tokenizers = "spacy.th.ThaiTokenizer"
[initialize]
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
""" """
@@ -42,7 +47,7 @@ class ThaiTokenizer(DummyTokenizer):
class ThaiDefaults(Language.Defaults): class ThaiDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG) config = load_config_from_str(DEFAULT_CONFIG)
lex_attr_getters = LEX_ATTRS lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS stop_words = STOP_WORDS

View File

@@ -1,10 +1,8 @@
from thinc.api import Config from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...tokens import Doc from ...tokens import Doc
from .stop_words import STOP_WORDS from ...util import DummyTokenizer, registry, load_config_from_str
from ...util import DummyTokenizer, registry
from .lex_attrs import LEX_ATTRS
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@@ -55,7 +53,7 @@ class VietnameseTokenizer(DummyTokenizer):
class VietnameseDefaults(Language.Defaults): class VietnameseDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG) config = load_config_from_str(DEFAULT_CONFIG)
lex_attr_getters = LEX_ATTRS lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS stop_words = STOP_WORDS

View File

@@ -4,14 +4,13 @@ import tempfile
import srsly import srsly
import warnings import warnings
from pathlib import Path from pathlib import Path
from thinc.api import Config
from ...errors import Warnings, Errors from ...errors import Warnings, Errors
from ...language import Language from ...language import Language
from ...scorer import Scorer from ...scorer import Scorer
from ...tokens import Doc from ...tokens import Doc
from ...training import validate_examples, Example from ...training import validate_examples, Example
from ...util import DummyTokenizer, registry from ...util import DummyTokenizer, registry, load_config_from_str
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ... import util from ... import util
@@ -329,7 +328,7 @@ class ChineseTokenizer(DummyTokenizer):
class ChineseDefaults(Language.Defaults): class ChineseDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG) config = load_config_from_str(DEFAULT_CONFIG)
lex_attr_getters = LEX_ATTRS lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS stop_words = STOP_WORDS
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}

View File

@@ -339,6 +339,7 @@ def test_ner_warns_no_lookups(caplog):
nlp.vocab.lookups = Lookups() nlp.vocab.lookups = Lookups()
assert not len(nlp.vocab.lookups) assert not len(nlp.vocab.lookups)
nlp.add_pipe("ner") nlp.add_pipe("ner")
nlp.config["initialize"]["lookups"] = None
with caplog.at_level(logging.DEBUG): with caplog.at_level(logging.DEBUG):
nlp.initialize() nlp.initialize()
assert "W033" in caplog.text assert "W033" in caplog.text