mirror of https://github.com/explosion/spaCy.git
synced 2025-02-04 21:50:35 +03:00

commit 796f6c52d1: Merge branch 'develop' into pr/5767
@@ -12,11 +12,11 @@ from ud_train import write_conllu
 from spacy.lang.lex_attrs import word_shape
 from spacy.util import get_lang_class
 
-# All languages in spaCy - in UD format (note that Norwegian is 'no' instead of 'nb')
-ALL_LANGUAGES = ("af, ar, bg, bn, ca, cs, da, de, el, en, es, et, fa, fi, fr,"
-                 "ga, he, hi, hr, hu, id, is, it, ja, kn, ko, lt, lv, mr, no,"
+# All languages in spaCy format (note that Norwegian is 'no' in UD - gets remapped later)
+ALL_LANGUAGES = ("af, ar, bg, bn, ca, cs, da, de, el, en, es, et, eu, fa, fi, fr,"
+                 "ga, gu, he, hi, hr, hu, hy, id, is, it, ja, kn, ko, lb, lij, lt, lv, ml, mr, nb,"
                  "nl, pl, pt, ro, ru, si, sk, sl, sq, sr, sv, ta, te, th, tl,"
-                 "tr, tt, uk, ur, vi, zh")
+                 "tr, tt, uk, ur, vi, yo, zh")
 
 # Non-parsing tasks that will be evaluated (works for default models)
 EVAL_NO_PARSE = ['Tokens', 'Words', 'Lemmas', 'Sentences', 'Feats']
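
Note: the UD corpora use "no" where spaCy uses "nb", so the list now keeps the spaCy code and the script remaps it at lookup time. A minimal sketch of that remapping (the helper below is hypothetical; the diff inlines it):

    # hypothetical helper mirroring the inline `if lang == "nb": UD_lang = "no"`
    def to_ud(lang):
        return "no" if lang == "nb" else lang

    assert to_ud("nb") == "no"
    assert to_ud("en") == "en"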
@@ -251,39 +251,43 @@ def main(out_path, ud_dir, check_parse=False, langs=ALL_LANGUAGES, exclude_train
 
     # initialize all models with the multi-lang model
     for lang in languages:
-        models[lang] = [multi] if multi else []
-        # add default models if we don't want to evaluate parsing info
-        if not check_parse:
-            # Norwegian is 'nb' in spaCy but 'no' in the UD corpora
-            if lang == 'no':
-                models['no'].append(load_default_model_sentencizer('nb'))
-            else:
-                models[lang].append(load_default_model_sentencizer(lang))
+        UD_lang = lang
+        # Norwegian is 'nb' in spaCy but 'no' in the UD corpora
+        if lang == "nb":
+            UD_lang = "no"
+        try:
+            models[UD_lang] = [multi] if multi else []
+            # add default models if we don't want to evaluate parsing info
+            if not check_parse:
+                models[UD_lang].append(load_default_model_sentencizer(lang))
+        except:
+            print(f"Exception initializing lang {lang} - skipping")
 
     # language-specific trained models
     if not exclude_trained_models:
-        if 'de' in models:
-            models['de'].append(load_model('de_core_news_sm'))
-            models['de'].append(load_model('de_core_news_md'))
-        if 'el' in models:
-            models['el'].append(load_model('el_core_news_sm'))
-            models['el'].append(load_model('el_core_news_md'))
-        if 'en' in models:
-            models['en'].append(load_model('en_core_web_sm'))
-            models['en'].append(load_model('en_core_web_md'))
-            models['en'].append(load_model('en_core_web_lg'))
-        if 'es' in models:
-            models['es'].append(load_model('es_core_news_sm'))
-            models['es'].append(load_model('es_core_news_md'))
-        if 'fr' in models:
-            models['fr'].append(load_model('fr_core_news_sm'))
-            models['fr'].append(load_model('fr_core_news_md'))
-        if 'it' in models:
-            models['it'].append(load_model('it_core_news_sm'))
-        if 'nl' in models:
-            models['nl'].append(load_model('nl_core_news_sm'))
-        if 'pt' in models:
-            models['pt'].append(load_model('pt_core_news_sm'))
+        news_languages = ["da", "de", "el", "es", "fr", "it", "ja", "lt", "nb", "nl", "pl", "pt", "ro"]
+        news_languages = ["nb"]
+        web_languages = ["en", "zh"]
+        sizes = ["sm", "md", "lg"]
+        for lang in web_languages:
+            UD_lang = lang
+            for size in sizes:
+                model_name = f'{lang}_core_web_{size}'
+                try:
+                    models[UD_lang].append(load_model(model_name))
+                except Exception as e:
+                    print(f"Error loading {model_name}: {e}")
+
+        for lang in news_languages:
+            UD_lang = lang
+            if lang == "nb":
+                UD_lang = "no"
+            for size in sizes:
+                model_name = f'{lang}_core_news_{size}'
+                try:
+                    models[UD_lang].append(load_model(model_name))
+                except Exception as e:
+                    print(f"Error loading {model_name}: {e}")
 
     with out_path.open(mode='w', encoding='utf-8') as out_file:
         run_all_evals(models, treebanks, out_file, check_parse, print_freq_tasks)
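
Note: the rewritten loader derives package names from spaCy's {lang}_core_{genre}_{size} convention and tolerates missing packages with try/except. A standalone sketch of the name pattern (illustrative; not every combination is published):

    for lang, genre in [("en", "web"), ("zh", "web"), ("nb", "news")]:
        for size in ("sm", "md", "lg"):
            print(f"{lang}_core_{genre}_{size}")  # e.g. en_core_web_sm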
@@ -303,7 +303,9 @@ def get_token_conllu(token, i):
     feat_str = []
     replacements = {"one": "1", "two": "2", "three": "3"}
     for feat in features:
-        if not feat.startswith("begin") and not feat.startswith("end"):
+        if "=" in feat:
+            feat_str.append(feat)
+        elif not feat.startswith("begin") and not feat.startswith("end"):
             key, value = feat.split("_", 1)
             value = replacements.get(value, value)
             feat_str.append("%s=%s" % (key, value.title()))
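
Note: the new branch passes features that are already in UFEATS form ("Field=Value") straight through and only converts the legacy "Field_value" spelling. A self-contained sketch of the conversion (the helper is hypothetical; the diff keeps the logic inline):

    def normalize_feat(feat):
        replacements = {"one": "1", "two": "2", "three": "3"}
        if "=" in feat:  # already UFEATS, keep as-is
            return feat
        key, value = feat.split("_", 1)
        value = replacements.get(value, value)
        return "%s=%s" % (key, value.title())

    assert normalize_feat("Number=Sing") == "Number=Sing"
    assert normalize_feat("Number_sing") == "Number=Sing"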
@@ -135,9 +135,8 @@ def debug_data(
     morph_rules = {}
     if morph_rules_path is not None:
         morph_rules = srsly.read_json(morph_rules_path)
-    # Update tag map with provided mapping
-    nlp.vocab.morphology.tag_map.update(tag_map)
+    # Replace tag map with provided mapping
+    nlp.vocab.morphology.load_tag_map(tag_map)
 
     # Load morph rules
     nlp.vocab.morphology.load_morph_exceptions(morph_rules)
@@ -124,8 +124,8 @@ def train(
     )
     nlp.begin_training(lambda: train_examples)
 
-    # Update tag map with provided mapping
-    nlp.vocab.morphology.tag_map.update(tag_map)
+    # Replace tag map with provided mapping
+    nlp.vocab.morphology.load_tag_map(tag_map)
 
     # Load morph rules
     nlp.vocab.morphology.load_morph_exceptions(morph_rules)
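
Note: both CLI call sites switch from merging into the default tag map (tag_map.update(...)) to replacing it wholesale via the new load_tag_map(...). A hedged usage sketch (the TAG_MAP contents here are illustrative):

    from spacy.lang.en import English

    nlp = English()
    TAG_MAP = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}}
    # rebuilds the tag map from scratch instead of merging into the defaults
    nlp.vocab.morphology.load_tag_map(TAG_MAP)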
@@ -58,12 +58,23 @@ cdef class Morphology:
     FEATURE_SEP = "|"
     FIELD_SEP = "="
     VALUE_SEP = ","
-    EMPTY_MORPH = "_"
+    EMPTY_MORPH = "_"  # not an empty string so that the PreshMap key is not 0
 
     def __init__(self, StringStore strings, tag_map, lemmatizer, exc=None):
         self.mem = Pool()
         self.strings = strings
         self.tags = PreshMap()
+        self.load_tag_map(tag_map)
+        self.lemmatizer = lemmatizer
+
+        self._cache = PreshMapArray(self.n_tags)
+        self._exc = {}
+        if exc is not None:
+            self.load_morph_exceptions(exc)
+
+    def load_tag_map(self, tag_map):
+        self.tag_map = {}
+        self.reverse_index = {}
         # Add special space symbol. We prefix with underscore, to make sure it
         # always sorts to the end.
         if '_SP' in tag_map:
@@ -74,24 +85,14 @@ cdef class Morphology:
             self.strings.add('_SP')
             tag_map = dict(tag_map)
             tag_map['_SP'] = space_attrs
-        self.tag_names = tuple(sorted(tag_map.keys()))
-        self.tag_map = {}
-        self.lemmatizer = lemmatizer
-        self.n_tags = len(tag_map)
-        self.reverse_index = {}
-        self._load_from_tag_map(tag_map)
-
-        self._cache = PreshMapArray(self.n_tags)
-        self._exc = {}
-        if exc is not None:
-            self.load_morph_exceptions(exc)
-
-    def _load_from_tag_map(self, tag_map):
         for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
             attrs = _normalize_props(attrs)
             self.add(attrs)
             self.tag_map[tag_str] = dict(attrs)
             self.reverse_index[self.strings.add(tag_str)] = i
+        self.tag_names = tuple(sorted(self.tag_map.keys()))
+        self.n_tags = len(self.tag_map)
+        self._cache = PreshMapArray(self.n_tags)
 
     def __reduce__(self):
         return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
@@ -114,13 +115,7 @@ cdef class Morphology:
         if not isinstance(features, dict):
             warnings.warn(Warnings.W100.format(feature=features))
             features = {}
-        features = _normalize_props(features)
         string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
-        # normalized UFEATS string with sorted fields and values
-        norm_feats_string = self.FEATURE_SEP.join(sorted([
-            self.FIELD_SEP.join([field, values])
-            for field, values in string_features.items()
-        ]))
         # intified ("Field", "Field=Value") pairs
         field_feature_pairs = []
         for field in sorted(string_features):
@@ -134,6 +129,7 @@ cdef class Morphology:
         # the hash key for the tag is either the hash of the normalized UFEATS
        # string or the hash of an empty placeholder (using the empty string
         # would give a hash key of 0, which is not good for PreshMap)
+        norm_feats_string = self.normalize_features(features)
         if norm_feats_string:
             tag.key = self.strings.add(norm_feats_string)
         else:
@@ -141,6 +137,26 @@ cdef class Morphology:
         self.insert(tag)
         return tag.key
 
+    def normalize_features(self, features):
+        """Create a normalized UFEATS string from a features string or dict.
+
+        features (Union[dict, str]): Features as dict or UFEATS string.
+        RETURNS (str): Features as normalized UFEATS string.
+        """
+        if isinstance(features, str):
+            features = self.feats_to_dict(features)
+        if not isinstance(features, dict):
+            warnings.warn(Warnings.W100.format(feature=features))
+            features = {}
+        features = _normalize_props(features)
+        string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
+        # normalized UFEATS string with sorted fields and values
+        norm_feats_string = self.FEATURE_SEP.join(sorted([
+            self.FIELD_SEP.join([field, values])
+            for field, values in string_features.items()
+        ]))
+        return norm_feats_string or self.EMPTY_MORPH
+
     cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *:
         """Creates a MorphAnalysisC from a list of intified
         ("Field", "Field=Value") tuples where fields with multiple values have
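
Note: normalize_features gives every analysis one canonical string: fields sorted alphabetically, and EMPTY_MORPH ("_") for an empty analysis so the PreshMap key is never 0. A hedged sketch of the expected behavior (assumes a default English vocab):

    from spacy.lang.en import English

    morphology = English().vocab.morphology
    print(morphology.normalize_features("Number=Sing|Case=Nom"))  # Case=Nom|Number=Sing
    print(morphology.normalize_features(""))                      # "_" (EMPTY_MORPH)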
@@ -23,29 +23,45 @@ from .defaults import default_morphologizer
 @component("morphologizer", assigns=["token.morph", "token.pos"], default_model=default_morphologizer)
 class Morphologizer(Tagger):
 
+    POS_FEAT = "POS"
+
     def __init__(self, vocab, model, **cfg):
         self.vocab = vocab
         self.model = model
         self._rehearsal_model = None
         self.cfg = dict(sorted(cfg.items()))
-        self.cfg.setdefault("labels", {})
-        self.cfg.setdefault("morph_pos", {})
+        # to be able to set annotations without string operations on labels,
+        # store mappings from morph+POS labels to token-level annotations:
+        # 1) labels_morph stores a mapping from morph+POS->morph
+        self.cfg.setdefault("labels_morph", {})
+        # 2) labels_pos stores a mapping from morph+POS->POS
+        self.cfg.setdefault("labels_pos", {})
+        # add mappings for empty morph
+        self.cfg["labels_morph"][Morphology.EMPTY_MORPH] = Morphology.EMPTY_MORPH
+        self.cfg["labels_pos"][Morphology.EMPTY_MORPH] = POS_IDS[""]
 
     @property
     def labels(self):
-        return tuple(self.cfg["labels"].keys())
+        return tuple(self.cfg["labels_morph"].keys())
 
     def add_label(self, label):
         if not isinstance(label, str):
             raise ValueError(Errors.E187)
         if label in self.labels:
             return 0
-        morph = Morphology.feats_to_dict(label)
-        norm_morph_pos = self.vocab.strings[self.vocab.morphology.add(morph)]
-        pos = morph.get("POS", "")
-        if norm_morph_pos not in self.cfg["labels"]:
-            self.cfg["labels"][norm_morph_pos] = norm_morph_pos
-            self.cfg["morph_pos"][norm_morph_pos] = POS_IDS[pos]
+        # normalize label
+        norm_label = self.vocab.morphology.normalize_features(label)
+        # extract separate POS and morph tags
+        label_dict = Morphology.feats_to_dict(label)
+        pos = label_dict.get(self.POS_FEAT, "")
+        if self.POS_FEAT in label_dict:
+            label_dict.pop(self.POS_FEAT)
+        # normalize morph string and add to morphology table
+        norm_morph = self.vocab.strings[self.vocab.morphology.add(label_dict)]
+        # add label mappings
+        if norm_label not in self.cfg["labels_morph"]:
+            self.cfg["labels_morph"][norm_label] = norm_morph
+            self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
         return 1
 
     def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
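
Note: add_label now decomposes a combined label such as "Feat=N|POS=NOUN" into its morph part and its POS part, storing both under the normalized label. A plain-Python sketch of that decomposition (single-valued features assumed):

    label = "Feat=N|POS=NOUN"
    label_dict = dict(feat.split("=") for feat in label.split("|"))
    pos = label_dict.pop("POS", "")  # "NOUN"
    morph = "|".join(f"{k}={v}" for k, v in sorted(label_dict.items()))  # "Feat=N"
    print(pos, morph)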
@@ -53,14 +69,16 @@ class Morphologizer(Tagger):
         for example in get_examples():
             for i, token in enumerate(example.reference):
                 pos = token.pos_
-                morph = token.morph
-                norm_morph = self.vocab.strings[self.vocab.morphology.add(morph)]
+                morph = token.morph_
+                # create and add the combined morph+POS label
+                morph_dict = Morphology.feats_to_dict(morph)
                 if pos:
-                    morph["POS"] = pos
-                norm_morph_pos = self.vocab.strings[self.vocab.morphology.add(morph)]
-                if norm_morph_pos not in self.cfg["labels"]:
-                    self.cfg["labels"][norm_morph_pos] = norm_morph
-                    self.cfg["morph_pos"][norm_morph_pos] = POS_IDS[pos]
+                    morph_dict[self.POS_FEAT] = pos
+                norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
+                # add label->morph and label->POS mappings
+                if norm_label not in self.cfg["labels_morph"]:
+                    self.cfg["labels_morph"][norm_label] = morph
+                    self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
         self.set_output(len(self.labels))
         self.model.initialize()
         link_vectors_to_models(self.vocab)
@@ -79,8 +97,8 @@ class Morphologizer(Tagger):
             doc_tag_ids = doc_tag_ids.get()
         for j, tag_id in enumerate(doc_tag_ids):
             morph = self.labels[tag_id]
-            doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels"][morph])
-            doc.c[j].pos = self.cfg["morph_pos"][morph]
+            doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"][morph])
+            doc.c[j].pos = self.cfg["labels_pos"][morph]
 
         doc.is_morphed = True
@@ -94,14 +112,17 @@ class Morphologizer(Tagger):
             for i in range(len(morphs)):
                 pos = pos_tags[i]
                 morph = morphs[i]
-                feats = Morphology.feats_to_dict(morph)
+                # POS may align (same value for multiple tokens) when morph
+                # doesn't, so if either is None, treat both as None here so that
+                # truths doesn't end up with an unknown morph+POS combination
+                if pos is None or morph is None:
+                    pos = None
+                    morph = None
+                label_dict = Morphology.feats_to_dict(morph)
                 if pos:
-                    feats["POS"] = pos
-                if len(feats) > 0:
-                    morph = self.vocab.strings[self.vocab.morphology.add(feats)]
-                if morph == "":
-                    morph = Morphology.EMPTY_MORPH
-                eg_truths.append(morph)
+                    label_dict[self.POS_FEAT] = pos
+                label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
+                eg_truths.append(label)
             truths.append(eg_truths)
         d_scores, loss = loss_func(scores, truths)
         if self.model.ops.xp.isnan(loss):
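
Note: alignment can give a token a POS without a morph (or vice versa), so get_loss nulls both when either is missing, keeping unseen morph+POS combinations out of the truths. A tiny sketch of the pairing rule:

    def pair(pos, morph):
        # drop both annotations if either is missing for a token
        if pos is None or morph is None:
            return None, None
        return pos, morph

    assert pair("NOUN", None) == (None, None)
    assert pair(None, "Feat=N") == (None, None)
    assert pair("NOUN", "Feat=N") == ("NOUN", "Feat=N")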
@@ -5,6 +5,7 @@ from spacy.gold import Example
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.tests.util import make_tempdir
+from spacy.morphology import Morphology
 
 
 def test_label_types():
@@ -23,9 +24,10 @@ TRAIN_DATA = [
             "pos": ["NOUN", "VERB", "ADJ", "NOUN"],
         },
     ),
+    # test combinations of morph+POS
     (
         "Eat blue ham",
-        {"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]},
+        {"morphs": ["Feat=V", "", ""], "pos": ["", "ADJ", ""]},
     ),
 ]
@@ -38,7 +40,12 @@ def test_overfitting_IO():
     for inst in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
         for morph, pos in zip(inst[1]["morphs"], inst[1]["pos"]):
-            morphologizer.add_label(morph + "|POS=" + pos)
+            if morph and pos:
+                morphologizer.add_label(morph + Morphology.FEATURE_SEP + "POS" + Morphology.FIELD_SEP + pos)
+            elif pos:
+                morphologizer.add_label("POS" + Morphology.FIELD_SEP + pos)
+            elif morph:
+                morphologizer.add_label(morph)
     nlp.add_pipe(morphologizer)
     optimizer = nlp.begin_training()
@@ -48,19 +55,27 @@ def test_overfitting_IO():
     assert losses["morphologizer"] < 0.00001
 
     # test the trained model
-    test_text = "I like blue eggs"
+    test_text = "I like blue ham"
     doc = nlp(test_text)
     gold_morphs = [
-        "Feat=N|POS=NOUN",
-        "Feat=V|POS=VERB",
-        "Feat=J|POS=ADJ",
-        "Feat=N|POS=NOUN",
+        "Feat=N",
+        "Feat=V",
+        "",
+        "",
+    ]
+    gold_pos_tags = [
+        "NOUN",
+        "VERB",
+        "ADJ",
+        "",
     ]
     assert [t.morph_ for t in doc] == gold_morphs
+    assert [t.pos_ for t in doc] == gold_pos_tags
 
     # Also test the results are still the same after IO
     with make_tempdir() as tmp_dir:
         nlp.to_disk(tmp_dir)
         nlp2 = util.load_model_from_path(tmp_dir)
         doc2 = nlp2(test_text)
-        assert gold_morphs == [t.morph_ for t in doc2]
+        assert [t.morph_ for t in doc2] == gold_morphs
+        assert [t.pos_ for t in doc2] == gold_pos_tags
@@ -31,6 +31,7 @@ def test_overfitting_IO():
     nlp.vocab.morphology.load_tag_map(TAG_MAP)
     nlp.vocab.morphology.load_morph_exceptions(MORPH_RULES)
     tagger = nlp.create_pipe("tagger")
+    nlp.vocab.morphology.load_tag_map(TAG_MAP)
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
@@ -19,6 +19,7 @@ const isNightly = !!+process.env.SPACY_NIGHTLY || site.nightlyBranches.includes(
 const favicon = isNightly ? `src/images/icon_nightly.png` : `src/images/icon.png`
 const binderBranch = isNightly ? 'nightly' : site.binderBranch
 const siteUrl = isNightly ? site.siteUrlNightly : site.siteUrl
+const domain = isNightly ? site.domainNightly : site.domain
 
 module.exports = {
   siteMetadata: {
@@ -148,6 +149,10 @@ module.exports = {
         respectDNT: true,
       },
     },
+    {
+      resolve: `gatsby-plugin-plausible`,
+      options: { domain },
+    },
     {
       resolve: 'gatsby-plugin-robots-txt',
       options: {
@@ -3,7 +3,9 @@
     "description": "spaCy is a free open-source library for Natural Language Processing in Python. It features NER, POS tagging, dependency parsing, word vectors and more.",
     "slogan": "Industrial-strength Natural Language Processing in Python",
     "siteUrl": "https://spacy.io",
+    "domain": "spacy.io",
     "siteUrlNightly": "https://nightly.spacy.io",
+    "domainNightly": "nightly.spacy.io",
     "nightlyBranches": ["nightly.spacy.io"],
     "email": "contact@explosion.ai",
     "company": "Explosion",
website/package-lock.json (generated, 35 changes)
@@ -12915,6 +12915,41 @@
         }
       }
     },
+    "gatsby-plugin-plausible": {
+      "version": "0.0.6",
+      "resolved": "https://registry.npmjs.org/gatsby-plugin-plausible/-/gatsby-plugin-plausible-0.0.6.tgz",
+      "integrity": "sha512-qUdPQ3haeX2DIywGZ2boMpmFAnSbWzqS9cG9/OO0mWLigA0sDLWwGkpHIAvrfepgbB9U/roLtXflctBwOIxtcQ==",
+      "requires": {
+        "@babel/runtime": "^7.9.2",
+        "minimatch": "3.0.4",
+        "react": "^16.13.1"
+      },
+      "dependencies": {
+        "@babel/runtime": {
+          "version": "7.10.5",
+          "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.10.5.tgz",
+          "integrity": "sha512-otddXKhdNn7d0ptoFRHtMLa8LqDxLYwTjB4nYgM1yy5N6gU/MUf8zqyyLltCH3yAVitBzmwK4us+DD0l/MauAg==",
+          "requires": {
+            "regenerator-runtime": "^0.13.4"
+          }
+        },
+        "react": {
+          "version": "16.13.1",
+          "resolved": "https://registry.npmjs.org/react/-/react-16.13.1.tgz",
+          "integrity": "sha512-YMZQQq32xHLX0bz5Mnibv1/LHb3Sqzngu7xstSM+vrkE5Kzr9xE0yMByK5kMoTK30YVJE61WfbxIFFvfeDKT1w==",
+          "requires": {
+            "loose-envify": "^1.1.0",
+            "object-assign": "^4.1.1",
+            "prop-types": "^15.6.2"
+          }
+        },
+        "regenerator-runtime": {
+          "version": "0.13.5",
+          "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.5.tgz",
+          "integrity": "sha512-ZS5w8CpKFinUzOwW3c83oPeVXoNsrLsaCoLtJvAClH135j/R77RuymhiSErhm2lKcwSCIpmvIWSbDkIfAqKQlA=="
+        }
+      }
+    },
     "gatsby-plugin-react-helmet": {
       "version": "3.0.6",
       "resolved": "https://registry.npmjs.org/gatsby-plugin-react-helmet/-/gatsby-plugin-react-helmet-3.0.6.tgz",
@@ -23,6 +23,7 @@
     "gatsby-plugin-google-analytics": "^2.0.14",
     "gatsby-plugin-manifest": "^2.0.17",
     "gatsby-plugin-offline": "^2.0.24",
+    "gatsby-plugin-plausible": "0.0.6",
     "gatsby-plugin-react-helmet": "^3.0.6",
     "gatsby-plugin-react-svg": "^2.0.0",
     "gatsby-plugin-robots-txt": "^1.5.1",