Add lexeme norm defaults

Ines Montani 2020-09-30 10:20:14 +02:00
parent 6467a560e3
commit 34f9c26c62
16 changed files with 140 additions and 21 deletions

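Most of the language files below gain the same [initialize.lookups] block: it registers spacy.LookupsDataLoader.v1 so that nlp.initialize() loads the "lexeme_norm" table for that language by default. A minimal sketch of the runtime effect (not part of the diff; assumes spaCy v3 and the spacy-lookups-data package are installed):

    # Sketch only: what the new defaults amount to at initialization time.
    # Assumes spacy v3 and spacy-lookups-data are installed.
    import spacy

    nlp = spacy.blank("da")  # DanishDefaults now carries the [initialize.lookups] block
    nlp.initialize()         # resolves the block and populates the vocab's lookups
    assert nlp.vocab.lookups.has_table("lexeme_norm")
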
spacy/lang/da/__init__.py

@@ -3,9 +3,21 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class DanishDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES

spacy/lang/de/__init__.py

@@ -3,9 +3,21 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class GermanDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES

spacy/lang/el/__init__.py

@@ -9,9 +9,21 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX
 from .lemmatizer import GreekLemmatizer
 from ...lookups import Lookups
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class GreekDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES

spacy/lang/en/__init__.py

@@ -1,5 +1,4 @@
 from typing import Optional
-
 from thinc.api import Model
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -10,9 +9,21 @@ from .punctuation import TOKENIZER_INFIXES
 from .lemmatizer import EnglishLemmatizer
 from ...language import Language
 from ...lookups import Lookups
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class EnglishDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     lex_attr_getters = LEX_ATTRS

spacy/lang/id/__init__.py

@@ -4,9 +4,21 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class IndonesianDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES

spacy/lang/ja/__init__.py

@@ -2,7 +2,6 @@ from typing import Optional, Union, Dict, Any
 from pathlib import Path
 import srsly
 from collections import namedtuple
-from thinc.api import Config
 
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
@@ -16,7+15,7 @@ from ...scorer import Scorer
 from ...symbols import POS
 from ...tokens import Doc
 from ...training import validate_examples
-from ...util import DummyTokenizer, registry
+from ...util import DummyTokenizer, registry, load_config_from_str
 from ... import util
@@ -166,7 +165,7 @@ class JapaneseTokenizer(DummyTokenizer):
 
 class JapaneseDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}

spacy/lang/ko/__init__.py

@@ -1,5 +1,4 @@
 from typing import Optional, Any, Dict
-from thinc.api import Config
 
 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
@@ -10,7 +9,7 @@ from ...compat import copy_reg
 from ...scorer import Scorer
 from ...symbols import POS
 from ...training import validate_examples
-from ...util import DummyTokenizer, registry
+from ...util import DummyTokenizer, registry, load_config_from_str
 
 
 DEFAULT_CONFIG = """
@@ -70,7 +69,7 @@ class KoreanTokenizer(DummyTokenizer):
 
 class KoreanDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}

spacy/lang/lb/__init__.py

@@ -3,9 +3,21 @@ from .punctuation import TOKENIZER_INFIXES
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class LuxembourgishDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     lex_attr_getters = LEX_ATTRS

spacy/lang/pt/__init__.py

@@ -3,9 +3,21 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class PortugueseDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     prefixes = TOKENIZER_PREFIXES

spacy/lang/ru/__init__.py

@@ -1,5 +1,4 @@
 from typing import Optional
-
 from thinc.api import Model
 
 from .stop_words import STOP_WORDS
@@ -8,9 +7,21 @@ from .lex_attrs import LEX_ATTRS
 from .lemmatizer import RussianLemmatizer
 from ...language import Language
 from ...lookups import Lookups
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class RussianDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS

spacy/lang/sr/__init__.py

@@ -2,9 +2,21 @@ from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class SerbianDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS

spacy/lang/ta/__init__.py

@@ -1,9 +1,21 @@
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class TamilDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS

spacy/lang/th/__init__.py

@@ -1,10 +1,8 @@
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
-from ...util import DummyTokenizer, registry
+from ...util import DummyTokenizer, registry, load_config_from_str
 
 
 DEFAULT_CONFIG = """
@@ -12,6 +10,13 @@ DEFAULT_CONFIG = """
 [nlp.tokenizer]
 @tokenizers = "spacy.th.ThaiTokenizer"
+
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
 """
@@ -42,7 +47,7 @@ class ThaiTokenizer(DummyTokenizer):
 
 class ThaiDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS

spacy/lang/vi/__init__.py

@@ -1,10 +1,8 @@
-from thinc.api import Config
-
+from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
-from .stop_words import STOP_WORDS
-from ...util import DummyTokenizer, registry
-from .lex_attrs import LEX_ATTRS
+from ...util import DummyTokenizer, registry, load_config_from_str
 
 
 DEFAULT_CONFIG = """
@@ -55,7 +53,7 @@ class VietnameseTokenizer(DummyTokenizer):
 
 class VietnameseDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS

spacy/lang/zh/__init__.py

@@ -4,14 +4,13 @@ import tempfile
 import srsly
 import warnings
 from pathlib import Path
-from thinc.api import Config
 
 from ...errors import Warnings, Errors
 from ...language import Language
 from ...scorer import Scorer
 from ...tokens import Doc
 from ...training import validate_examples, Example
-from ...util import DummyTokenizer, registry
+from ...util import DummyTokenizer, registry, load_config_from_str
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from ... import util
@@ -329,7 +328,7 @@ class ChineseTokenizer(DummyTokenizer):
 
 class ChineseDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
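
In ja, ko, th, vi, and zh, the inline Config().from_str(...) calls are replaced with the shared load_config_from_str helper from spacy.util. A rough sketch of the equivalence (assumption: the helper wraps thinc's config parser; the real signature in spacy.util may take extra options):

    # Sketch of the refactored helper, assuming it wraps thinc's Config.from_str.
    from thinc.api import Config

    def load_config_from_str(text: str) -> Config:
        # Parse a config string into a thinc Config object, as the removed
        # inline Config().from_str(DEFAULT_CONFIG) calls did.
        return Config().from_str(text)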

spacy/tests/parser/test_ner.py

@@ -339,6 +339,7 @@ def test_ner_warns_no_lookups(caplog):
     nlp.vocab.lookups = Lookups()
     assert not len(nlp.vocab.lookups)
     nlp.add_pipe("ner")
+    nlp.config["initialize"]["lookups"] = None
     with caplog.at_level(logging.DEBUG):
         nlp.initialize()
         assert "W033" in caplog.text