Mirror of https://github.com/explosion/spaCy.git

Improve vocab data integration and warning

parent 38f6ea7a78
commit b9aaa4e457
@@ -1,6 +1,5 @@
 [nlp]
 lang = null
-vocab_data = {}
 pipeline = []
 
 [nlp.tokenizer]
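With vocab_data gone from the default config, the [nlp] block only declares the language and the pipeline; lexeme tables are instead pulled in when the vocab is created (see the create_vocab hunk further down). A minimal sketch of what the trimmed block amounts to, assuming thinc's Config class, which backs spaCy's config files:

    from thinc.api import Config

    # Sketch only: the default [nlp] block no longer carries a vocab_data entry.
    cfg = Config({"nlp": {"lang": None, "pipeline": [], "tokenizer": {}}})
    assert "vocab_data" not in cfg["nlp"]
    assert cfg["nlp"]["lang"] is None
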
@@ -83,7 +83,7 @@ class Warnings:
             "doesn't have a normalization table, please ignore this warning. "
             "If this is surprising, make sure you have the spacy-lookups-data "
             "package installed. The languages with lexeme normalization tables "
-            "are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.")
+            "are currently: {langs}")
 
     # TODO: fix numbering after merging develop into master
     W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
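Only the tail of the W033 message is visible in this hunk; the change swaps the hardcoded language list for a {langs} placeholder, so the list is interpolated when the warning fires. A small sketch of the rendering, using an abbreviated stand-in for the full W033 text:

    # Abbreviated template; only the str.format interpolation is the point here.
    LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]

    template = ("... make sure you have the spacy-lookups-data package installed. "
                "The languages with lexeme normalization tables are currently: {langs}")
    print(template.format(langs=", ".join(LEXEME_NORM_LANGS)))
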
@@ -184,8 +184,10 @@ class Tagger(Pipe):
         lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
         if not any(table in self.vocab.lookups for table in lemma_tables):
             warnings.warn(Warnings.W022)
-        if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0:
-            warnings.warn(Warnings.W033.format(model="part-of-speech tagger"))
+        lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
+        if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
+            langs = ", ".join(util.LEXEME_NORM_LANGS)
+            warnings.warn(Warnings.W033.format(model="part-of-speech tagger", langs=langs))
         orig_tag_map = dict(self.vocab.morphology.tag_map)
         new_tag_map = {}
         for example in get_examples():
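The tagger's begin_training now warns only when the vocab's language is one that actually ships a lexeme_norm table upstream, rather than whenever the table happens to be empty. A standalone sketch of that guard, with a plain dict standing in for Lookups and an illustrative helper name:

    import warnings

    LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]

    def check_lexeme_norms(lookups: dict, lang: str, model: str) -> None:
        # Empty table AND a language that should have one -> warn; otherwise stay silent.
        lexeme_norms = lookups.get("lexeme_norm", {})
        if len(lexeme_norms) == 0 and lang in LEXEME_NORM_LANGS:
            langs = ", ".join(LEXEME_NORM_LANGS)
            warnings.warn(f"W033 ({model}): no lexeme_norm table; available for: {langs}")

    check_lexeme_norms({}, "en", "part-of-speech tagger")  # warns: en is in the list
    check_lexeme_norms({}, "zh", "part-of-speech tagger")  # silent: zh has no table upstream
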
@@ -239,7 +239,6 @@ class ConfigSchemaNlp(BaseModel):
     pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
     tokenizer: Callable = Field(..., title="The tokenizer to use")
     lemmatizer: Callable = Field(..., title="The lemmatizer to use")
-    vocab_data: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., title="Vocabulary data, e.g. lexeme normalization tables")
     # fmt: on
 
     class Config:
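A hedged, cut-down re-creation of the slimmed schema (field set and titles abbreviated; the real ConfigSchemaNlp has more fields than this hunk shows): vocab_data simply stops being part of the [nlp] schema, so configs no longer need to supply it.

    from typing import Callable, List
    from pydantic import BaseModel, Field, StrictStr

    class NlpSchemaSketch(BaseModel):
        pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
        tokenizer: Callable = Field(..., title="The tokenizer to use")
        lemmatizer: Callable = Field(..., title="The lemmatizer to use")

    # Validates fine without any vocab_data entry.
    NlpSchemaSketch(pipeline=["tagger"], tokenizer=str.split, lemmatizer=str.lower)
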
@@ -431,8 +431,10 @@ cdef class Parser:
 
     def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
         self.cfg.update(kwargs)
-        if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0:
-            warnings.warn(Warnings.W033.format(model="parser or NER"))
+        lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
+        if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
+            langs = ", ".join(util.LEXEME_NORM_LANGS)
+            warnings.warn(Warnings.W033.format(model="parser or NER", langs=langs))
         if not hasattr(get_examples, '__call__'):
             gold_tuples = get_examples
             get_examples = lambda: gold_tuples
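The transition-based Parser (which also backs NER, hence the "parser or NER" model string) gets the same language-aware guard as the tagger. A quick way to see the behaviour, sketched with a stand-in for the guard and the standard library's warning capture:

    import warnings

    LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]

    def warn_if_missing_norms(lexeme_norms, lang, model):
        if len(lexeme_norms) == 0 and lang in LEXEME_NORM_LANGS:
            warnings.warn(f"W033 ({model}): tables exist upstream for {', '.join(LEXEME_NORM_LANGS)}")

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        warn_if_missing_norms({}, "en", "parser or NER")  # should warn
        warn_if_missing_norms({}, "ja", "parser or NER")  # should stay silent
    assert len(caught) == 1
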
@@ -342,7 +342,8 @@ def test_overfitting_IO():
 
 
 def test_ner_warns_no_lookups():
-    nlp = Language()
+    nlp = English()
+    assert nlp.lang in util.LEXEME_NORM_LANGS
     nlp.vocab.lookups = Lookups()
     assert not len(nlp.vocab.lookups)
     nlp.add_pipe("ner")
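The test switches from the bare Language base class to English because the warning is now language-dependent: en is in LEXEME_NORM_LANGS, so wiping the lookups should still trigger W033. The remainder of the test falls outside this hunk; the assertion pattern it presumably relies on is the usual pytest.warns check, sketched here with illustrative names only:

    import warnings
    import pytest

    def emit_w033():
        # Illustrative stand-in for a component warning during begin_training.
        warnings.warn("W033: no lexeme normalization table", UserWarning)

    def test_warns_when_tables_missing():
        with pytest.warns(UserWarning):
            emit_w033()
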
@@ -53,6 +53,7 @@ if TYPE_CHECKING:
 
 _PRINT_ENV = False
 OOV_RANK = numpy.iinfo(numpy.uint64).max
+LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]
 
 
 class registry(thinc.registry):
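The list of languages with lexeme normalization tables now lives in a single module-level constant, so the membership checks in the components and the readable list in W033 cannot drift apart. A tiny sketch of that single-source-of-truth idea:

    LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]

    def describe(lang: str) -> str:
        supported = lang in LEXEME_NORM_LANGS
        return f"{lang}: lexeme_norm upstream={supported} (tables: {', '.join(LEXEME_NORM_LANGS)})"

    print(describe("en"))
    print(describe("zh"))
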
@@ -17,13 +17,20 @@ from .lemmatizer import Lemmatizer
 from .attrs import intify_attrs, NORM, IS_STOP
 from .vectors import Vectors
 from .util import link_vectors_to_models, registry
-from .lookups import Lookups
+from .lookups import Lookups, load_lookups
 from . import util
 from .lang.norm_exceptions import BASE_NORMS
 from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang
 
 
-def create_vocab(lang, defaults, lemmatizer=None, vocab_data={}, vectors_name=None):
+def create_vocab(lang, defaults, lemmatizer=None, vectors_name=None, load_lookups_data=True):
+    # If the spacy-lookups-data package is installed, we pre-populate the lookups
+    # with lexeme data, if available
+    if load_lookups_data:
+        tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
+        lookups = load_lookups(lang, tables=tables, strict=False)
+    else:
+        lookups = Lookups()
     lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
     # This is messy, but it's the minimal working fix to Issue #639.
     lex_attrs[IS_STOP] = functools.partial(is_stop, stops=defaults.stop_words)
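A self-contained sketch of the new create_vocab flow: when load_lookups_data is enabled, the lookups are pre-populated from spacy-lookups-data (strict=False presumably lets missing tables be skipped rather than raising); otherwise they start empty. spaCy's Lookups and load_lookups are replaced by plain dicts and a stub so this runs anywhere; names ending in _stub/_sketch are not spaCy API.

    def load_lookups_stub(lang, tables, strict=False):
        # Stand-in for the real loader: return whichever of the requested tables
        # the lookups-data package would provide for this language.
        available = {"en": {"lexeme_norm": {"gonna": "going to"}}}
        found = available.get(lang, {})
        return {name: found[name] for name in tables if name in found}

    def create_vocab_sketch(lang, load_lookups_data=True):
        if load_lookups_data:
            tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
            lookups = load_lookups_stub(lang, tables=tables, strict=False)
        else:
            lookups = {}
        # Downstream, the NORM attribute is backed by whatever table was found.
        return lookups.get("lexeme_norm", {})

    print(create_vocab_sketch("en"))                           # {'gonna': 'going to'}
    print(create_vocab_sketch("en", load_lookups_data=False))  # {}
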
@@ -32,13 +39,8 @@ def create_vocab(lang, defaults, lemmatizer=None, vocab_data={}, vectors_name=None):
     lex_attrs[NORM] = util.add_lookups(
         lex_attrs.get(NORM, LEX_ATTRS[NORM]),
         BASE_NORMS,
-        vocab_data.get("lexeme_norm", {}),
+        lookups.get_table("lexeme_norm", {}),
     )
-    lookups = Lookups()
-    for name, data in vocab_data.items():
-        if name not in lookups:
-            data = data if data is not None else {}
-            lookups.add_table(name, data)
     return Vocab(
         lex_attr_getters=lex_attrs,
         lemmatizer=lemmatizer,
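With the lookups now populated up front, the NORM getter is built directly from lookups.get_table("lexeme_norm", {}) layered with BASE_NORMS, and the old loop copying vocab_data into a fresh Lookups object goes away. A hedged sketch of that kind of chained lookup; chain_lookups is a stand-in, not util.add_lookups itself, and the table entries are illustrative:

    def chain_lookups(default, *tables):
        # First table containing the string wins; otherwise fall back to the default getter.
        def norm(string):
            for table in tables:
                if string in table:
                    return table[string]
            return default(string)
        return norm

    BASE_NORMS = {"’": "'"}              # illustrative entry, not the full table
    lexeme_norm = {"gonna": "going to"}  # e.g. what a lexeme_norm table might contribute
    norm = chain_lookups(lambda s: s.lower(), BASE_NORMS, lexeme_norm)
    print(norm("gonna"), norm("’"), norm("Hello"))  # going to ' hello
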
@@ -49,7 +51,6 @@ def create_vocab(lang, defaults, lemmatizer=None, vocab_data={}, vectors_name=None):
     )
 
 
-
 cdef class Vocab:
     """A look-up table that allows you to access `Lexeme` objects. The `Vocab`
     instance also provides access to the `StringStore`, and owns underlying