From 052c45dc2ff1c73ad208ae17c6a3cd6faefbd1f5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Mon, 24 Sep 2018 15:25:20 +0200 Subject: [PATCH 001/207] Add as_int and as_string methods to StringStore --- spacy/strings.pyx | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/spacy/strings.pyx b/spacy/strings.pyx index b54e3f155..4c6b5e1bb 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -98,9 +98,7 @@ cdef class StringStore: return u'' elif string_or_id in SYMBOLS_BY_STR: return SYMBOLS_BY_STR[string_or_id] - cdef hash_t key - if isinstance(string_or_id, unicode): key = hash_string(string_or_id) return key @@ -118,6 +116,20 @@ cdef class StringStore: else: return decode_Utf8Str(utf8str) + def as_int(self, key): + """If key is an int, return it; otherwise, get the int value.""" + if not isinstance(key, basestring): + return key + else: + return self[key] + + def as_string(self, key): + """If key is a string, return it; otherwise, get the string value.""" + if isinstance(key, basestring): + return key + else: + return self[key] + def add(self, string): """Add a string to the StringStore. From b10d0cce05ee6ff90362f0571ae386ab03da01ad Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Mon, 24 Sep 2018 17:35:28 +0200 Subject: [PATCH 002/207] Add MultiSoftmax class Add a new class for the Tagger model, MultiSoftmax. This allows softmax prediction of multiple classes on the same output layer, e.g. one variable with 3 classes, another with 4 classes. This makes a layer with 7 output neurons, which we softmax into two distributions. --- spacy/_ml.py | 44 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 964b1fa7a..231f6a7a4 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -444,7 +444,46 @@ def getitem(i): return layerize(getitem_fwd) -def build_tagger_model(nr_class, **cfg): +@describe.attributes( + W=Synapses("Weights matrix", + lambda obj: (obj.nO, obj.nI), + lambda W, ops: None) +) +class MultiSoftmax(Affine): + '''Neural network layer that predicts several multi-class attributes at once. + For instance, we might predict one class with 6 variables, and another with 5. + We predict the 11 neurons required for this, and then softmax them such + that columns 0-6 make a probability distribution and coumns 6-11 make another. 
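+    For example, out_sizes=(3, 4) gives a layer of 7 output neurons, where
+    columns [0, 3) are normalised into one distribution and [3, 7) into another.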
+ ''' + name = 'multisoftmax' + + def __init__(self, out_sizes, nI=None, **kwargs): + Model.__init__(self, **kwargs) + self.out_sizes = out_sizes + self.nO = sum(out_sizes) + self.nI = nI + + def predict(self, input__BI): + output__BO = self.ops.affine(self.W, self.b, input__BI) + i = 0 + for out_size in self.out_sizes: + self.ops.softmax(output__BO[:, i : i+out_size], inplace=True) + i += out_size + return output__BO + + def begin_update(self, input__BI, drop=0.): + output__BO = self.predict(input__BI) + def finish_update(grad__BO, sgd=None): + self.d_W += self.ops.gemm(grad__BO, input__BI, trans1=True) + self.d_b += grad__BO.sum(axis=0) + grad__BI = self.ops.gemm(grad__BO, self.W) + if sgd is not None: + sgd(self._mem.weights, self._mem.gradient, key=self.id) + return grad__BI + return output__BO, finish_update + + +def build_tagger_model(class_nums, **cfg): embed_size = util.env_opt('embed_size', 7000) if 'token_vector_width' in cfg: token_vector_width = cfg['token_vector_width'] @@ -459,7 +498,8 @@ def build_tagger_model(nr_class, **cfg): tok2vec = Tok2Vec(token_vector_width, embed_size, subword_features=subword_features, pretrained_vectors=pretrained_vectors) - softmax = with_flatten(Softmax(nr_class, token_vector_width)) + softmax = with_flatten( + MultiSoftmax(class_nums, token_vector_width)) model = ( tok2vec >> softmax From ac5742223ae4c1d094dde4bd98dbbf51e7b19e5d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Mon, 24 Sep 2018 23:14:06 +0200 Subject: [PATCH 003/207] Draft class to predict morphological tags --- spacy/_morphologizer.pyx | 131 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 spacy/_morphologizer.pyx diff --git a/spacy/_morphologizer.pyx b/spacy/_morphologizer.pyx new file mode 100644 index 000000000..ca857296e --- /dev/null +++ b/spacy/_morphologizer.pyx @@ -0,0 +1,131 @@ +from __future__ import unicode_literals +from collections import OrderedDict, defaultdict +import cytoolz +import ujson + +import numpy +cimport numpy as np +from .util import msgpack +from .util import msgpack_numpy + +from thinc.api import chain +from thinc.neural.util import to_categorical, copy_array +from . 
import util +from .pipe import Pipe +from ._ml import Tok2Vec, build_tagger_model +from ._ml import link_vectors_to_models, zero_init, flatten +from ._ml import create_default_optimizer +from .errors import Errors, TempErrors +from .compat import json_dumps, basestring_ +from .tokens.doc cimport Doc +from .vocab cimport Vocab +from .morphology cimport Morphology + + +class Morphologizer(Pipe): + name = 'morphologizer' + + @classmethod + def Model(cls, attr_nums, **cfg): + if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'): + raise ValueError(TempErrors.T008) + return build_morphologizer_model(attr_nums, **cfg) + + def __init__(self, vocab, model=True, **cfg): + self.vocab = vocab + self.model = model + self.cfg = OrderedDict(sorted(cfg.items())) + self.cfg.setdefault('cnn_maxout_pieces', 2) + + @property + def labels(self): + return self.vocab.morphology.tag_names + + @property + def tok2vec(self): + if self.model in (None, True, False): + return None + else: + return chain(self.model.tok2vec, flatten) + + def __call__(self, doc): + features, tokvecs = self.predict([doc]) + self.set_annotations([doc], tags, tensors=tokvecs) + return doc + + def pipe(self, stream, batch_size=128, n_threads=-1): + for docs in cytoolz.partition_all(batch_size, stream): + docs = list(docs) + features, tokvecs = self.predict(docs) + self.set_annotations(docs, features, tensors=tokvecs) + yield from docs + + def predict(self, docs): + if not any(len(doc) for doc in docs): + # Handle case where there are no tokens in any docs. + n_labels = self.model.nO + guesses = [self.model.ops.allocate((0, n_labels)) for doc in docs] + tokvecs = self.model.ops.allocate((0, self.model.tok2vec.nO)) + return guesses, tokvecs + tokvecs = self.model.tok2vec(docs) + scores = self.model.softmax(tokvecs) + guesses = [] + # Resolve multisoftmax into guesses + for doc_scores in scores: + guesses.append(scores_to_guesses(doc_scores, self.model.softmax.out_sizes)) + return guesses, tokvecs + + def set_annotations(self, docs, batch_feature_ids, tensors=None): + if isinstance(docs, Doc): + docs = [docs] + cdef Doc doc + cdef Vocab vocab = self.vocab + for i, doc in enumerate(docs): + doc_feat_ids = batch_feat_ids[i] + if hasattr(doc_feat_ids, 'get'): + doc_feat_ids = doc_feat_ids.get() + # Convert the neuron indices into feature IDs. + offset = self.vocab.morphology.first_feature + for j, nr_feat in enumerate(self.model.softmax.out_sizes): + doc_feat_ids[:, j] += offset + offset += nr_feat + # Now add the analysis, and set the hash. + for j in range(doc_feat_ids.shape[0]): + doc.c[j].morph = self.vocab.morphology.add(doc_feat_ids[j]) + + def update(self, docs, golds, drop=0., sgd=None, losses=None): + if losses is not None and self.name not in losses: + losses[self.name] = 0. 
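+        # Forward pass, squared-error loss against the gold morphological
+        # features, then backprop of the score gradients through the model.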
+ + tag_scores, bp_tag_scores = self.model.begin_update(docs, drop=drop) + loss, d_tag_scores = self.get_loss(docs, golds, tag_scores) + bp_tag_scores(d_tag_scores, sgd=sgd) + + if losses is not None: + losses[self.name] += loss + + def get_loss(self, docs, golds, scores): + guesses = [] + for doc_scores in scores: + guesses.append(scores_to_guesses(doc_scores, self.model.softmax.out_sizes)) + guesses = self.model.ops.flatten(guesses) + cdef int idx = 0 + target = numpy.zeros(scores.shape, dtype='f') + for gold in golds: + for features in gold.morphology: + if features is None: + target[idx] = guesses[idx] + else: + for feature in features: + column = feature_to_column(feature) # TODO + target[idx, column] = 1 + idx += 1 + target = self.model.ops.xp.array(target, dtype='f') + d_scores = scores - target + loss = (d_scores**2).sum() + d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) + return float(loss), d_scores + + def use_params(self, params): + with self.model.use_params(params): + yield From 6ae645c4ef67449992463979c5539118a3699a5e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Mon, 24 Sep 2018 23:57:41 +0200 Subject: [PATCH 004/207] WIP on supporting morphology features --- spacy/morphology.pxd | 81 ++++++--- spacy/morphology.pyx | 402 +++++++++++++++++++++++++++++++++---------- 2 files changed, 366 insertions(+), 117 deletions(-) diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index d0110b300..2220cfcfc 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -1,48 +1,30 @@ from cymem.cymem cimport Pool -from preshed.maps cimport PreshMapArray +from preshed.maps cimport PreshMap from libc.stdint cimport uint64_t +from murmurhash cimport mrmr from .structs cimport TokenC from .strings cimport StringStore -from .typedefs cimport attr_t, flags_t +from .typedefs cimport hash_t, attr_t, flags_t from .parts_of_speech cimport univ_pos_t from . 
cimport symbols - -cdef struct RichTagC: - uint64_t morph - int id - univ_pos_t pos - attr_t name - - -cdef struct MorphAnalysisC: - RichTagC tag - attr_t lemma - - cdef class Morphology: cdef readonly Pool mem cdef readonly StringStore strings + cdef PreshMap tags # Keyed by hash, value is pointer to tag + cdef public object lemmatizer cdef readonly object tag_map - cdef public object n_tags - cdef public object reverse_index - cdef public object tag_names - cdef public object exc - - cdef RichTagC* rich_tags - cdef PreshMapArray _cache + cdef hash_t insert(self, RichTagC tag) except 0 + cdef int assign_untagged(self, TokenC* token) except -1 - cdef int assign_tag(self, TokenC* token, tag) except -1 - cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1 - - cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1 - + cdef update_token_morph(self, TokenC* token, features) + cdef set_token_morph(self, TokenC* token, pos, features) cdef enum univ_morph_t: NIL = 0 @@ -298,4 +280,47 @@ cdef enum univ_morph_t: VerbType_mod # U VerbType_light # U - +cdef struct RichTagC: + univ_pos_t pos + + univ_morph_t abbr + univ_morph_t adp_type + univ_morph_t adv_type + univ_morph_t animacy + univ_morph_t aspect + univ_morph_t case + univ_morph_t conj_type + univ_morph_t connegative + univ_morph_t definite + univ_morph_t degree + univ_morph_t derivation + univ_morph_t echo + univ_morph_t foreign + univ_morph_t gender + univ_morph_t hyph + univ_morph_t inf_form + univ_morph_t mood + univ_morph_t negative + univ_morph_t number + univ_morph_t name_type + univ_morph_t num_form + univ_morph_t num_type + univ_morph_t num_value + univ_morph_t part_form + univ_morph_t part_type + univ_morph_t person + univ_morph_t polite + univ_morph_t polarity + univ_morph_t poss + univ_morph_t prefix + univ_morph_t prep_case + univ_morph_t pron_type + univ_morph_t punct_side + univ_morph_t punct_type + univ_morph_t reflex + univ_morph_t style + univ_morph_t style_variant + univ_morph_t tense + univ_morph_t verb_form + univ_morph_t voice + univ_morph_t verb_type diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index bd821d76f..3b74ecaae 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -3,6 +3,7 @@ from __future__ import unicode_literals from libc.string cimport memset +import ujson as json from .attrs cimport POS, IS_SPACE from .attrs import LEMMA, intify_attrs @@ -12,6 +13,7 @@ from .lexeme cimport Lexeme from .errors import Errors + def _normalize_props(props): """Transform deprecated string keys to correct names.""" out = {} @@ -32,9 +34,17 @@ def _normalize_props(props): cdef class Morphology: + '''Store the possible morphological analyses for a language, and index them + by hash. + + To save space on each token, tokens only know the hash of their morphological + analysis, so queries of morphological attributes are delegated + to this class. + ''' def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None): self.mem = Pool() self.strings = string_store + self.tags = PreshMap() # Add special space symbol. We prefix with underscore, to make sure it # always sorts to the end. 
space_attrs = tag_map.get('SP', {POS: SPACE}) @@ -47,32 +57,46 @@ cdef class Morphology: self.lemmatizer = lemmatizer self.n_tags = len(tag_map) self.reverse_index = {} - - self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC)) for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): - self.strings.add(tag_str) self.tag_map[tag_str] = dict(attrs) - attrs = _normalize_props(attrs) - attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) - self.rich_tags[i].id = i - self.rich_tags[i].name = self.strings.add(tag_str) - self.rich_tags[i].morph = 0 - self.rich_tags[i].pos = attrs[POS] - self.reverse_index[self.rich_tags[i].name] = i - # Add a 'null' tag, which we can reference when assign morphology to - # untagged tokens. - self.rich_tags[self.n_tags].id = self.n_tags + self.reverse_index[i] = self.strings.add(tag_str) self._cache = PreshMapArray(self.n_tags) self.exc = {} if exc is not None: for (tag_str, orth_str), attrs in exc.items(): self.add_special_case(tag_str, orth_str, attrs) + + def add(self, features): + """Insert a morphological analysis in the morphology table, if not already + present. Returns the hash of the new analysis. + """ + features = intify_features(self.strings, features) + cdef RichTagC tag = create_rich_tag(features) + cdef hash_t key = self.insert(tag) + return key - def __reduce__(self): - return (Morphology, (self.strings, self.tag_map, self.lemmatizer, - self.exc), None, None) - + def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology): + if orth not in self.strings: + return orth + cdef unicode py_string = self.strings[orth] + if self.lemmatizer is None: + return self.strings.add(py_string.lower()) + cdef list lemma_strings + cdef unicode lemma_string + lemma_strings = self.lemmatizer(py_string, univ_pos, morphology) + lemma_string = lemma_strings[0] + lemma = self.strings.add(lemma_string) + return lemma + + cdef hash_t insert(self, RichTagC tag) except 0: + cdef hash_t key = hash_tag(tag) + if self.tags.get(key) == NULL: + tag_ptr = <RichTagC*>self.mem.alloc(1, sizeof(RichTagC)) + tag_ptr[0] = tag + self.tags.set(key, <void*>tag_ptr) + return key + cdef int assign_untagged(self, TokenC* token) except -1: """Set morphological attributes on a token without a POS tag. Uses the lemmatizer's lookup() method, which looks up the string in the @@ -101,84 +125,284 @@ cdef class Morphology: # figure out why the statistical model fails. 
Related to Issue #220 if Lexeme.c_check_flag(token.lex, IS_SPACE): tag_id = self.reverse_index[self.strings.add('_SP')] - rich_tag = self.rich_tags[tag_id] - analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth) - if analysis is NULL: - analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC)) - tag_str = self.strings[self.rich_tags[tag_id].name] - analysis.tag = rich_tag - analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth, - self.tag_map.get(tag_str, {})) - self._cache.set(tag_id, token.lex.orth, analysis) - token.lemma = analysis.lemma - token.pos = analysis.tag.pos - token.tag = analysis.tag.name - token.morph = analysis.tag.morph + lemma = <attr_t>self._cache.get(tag_id, token.lex.orth) + if lemma == 0: + tag_str = self.tag_names[tag_id] + features = dict(self.tag_map.get(tag_str, {})) + pos = self.strings.as_int(features.pop('POS')) + lemma = self.lemmatize(pos, token.lex.orth, features) + self._cache.set(tag_id, token.lex.orth, lemma) + token.lemma = lemma + token.pos = pos + token.tag = self.strings[tag_str] + token.morph = self.add(attrs) - cdef int assign_feature(self, uint64_t* flags, univ_morph_t flag_id, bint value) except -1: - cdef flags_t one = 1 - if value: - flags[0] |= one << flag_id - else: - flags[0] &= ~(one << flag_id) + cdef update_morph(self, hash_t morph, features): + """Update a morphological analysis with new feature values.""" + tag = (<RichTagC*>self.tags.get(morph))[0] + cdef univ_morph_t feature + cdef int value + for feature_, value in features.items(): + feature = self.strings.as_int(feature_) + set_feature(&tag, feature, 1) + morph = self.insert_tag(tag) + return morph - def add_special_case(self, unicode tag_str, unicode orth_str, attrs, - force=False): - """Add a special-case rule to the morphological analyser. Tokens whose - tag and orth match the rule will receive the specified properties. + def to_bytes(self): + json_tags = [] + for key in self.tags: + tag_ptr = <RichTagC*>self.tags.get(key) + if tag_ptr != NULL: + json_tags.append(tag_to_json(tag_ptr[0])) + raise json.dumps(json_tags) - tag (unicode): The part-of-speech tag to key the exception. - orth (unicode): The word-form to key the exception. - """ - # TODO: Currently we've assumed that we know the number of tags -- - # RichTagC is an array, and _cache is a PreshMapArray - # This is really bad: it makes the morphology typed to the tagger - # classes, which is all wrong. - self.exc[(tag_str, orth_str)] = dict(attrs) - tag = self.strings.add(tag_str) - if tag not in self.reverse_index: - return - tag_id = self.reverse_index[tag] - orth = self.strings[orth_str] - cdef RichTagC rich_tag = self.rich_tags[tag_id] - attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) - cached = <MorphAnalysisC*>self._cache.get(tag_id, orth) - if cached is NULL: - cached = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC)) - elif force: - memset(cached, 0, sizeof(cached[0])) - else: - raise ValueError(Errors.E015.format(tag=tag_str, orth=orth_str)) + def from_bytes(self, byte_string): + raise NotImplementedError - cached.tag = rich_tag - # TODO: Refactor this to take arbitrary attributes. 
- for name_id, value_id in attrs.items(): - if name_id == LEMMA: - cached.lemma = value_id - else: - self.assign_feature(&cached.tag.morph, name_id, value_id) - if cached.lemma == 0: - cached.lemma = self.lemmatize(rich_tag.pos, orth, attrs) - self._cache.set(tag_id, orth, <void*>cached) + def to_disk(self, path): + raise NotImplementedError + + def from_disk(self, path): + raise NotImplementedError + + +cpdef univ_pos_t get_int_tag(pos_): + return <univ_pos_t>0 + +cpdef intify_features(StringStore strings, features): + return {strings.as_int(feature) for feature in features} + +cdef hash_t hash_tag(RichTagC tag) nogil: + return mrmr.hash64(&tag, sizeof(tag), 0) + +cdef RichTagC create_rich_tag(pos_, features): + cdef RichTagC tag + cdef univ_morph_t feature + tag.pos = get_int_tag(pos_) + for feature in features: + set_feature(&tag, feature, 1) + return tag + +cdef tag_to_json(RichTagC tag): + return {} + +cdef RichTagC tag_from_json(json_tag): + cdef RichTagC tag + return tag + +cdef int set_feature(RichTagC* tag, univ_morph_t feature, int value) nogil: + if value == True: + value_ = feature + else: + value_ = NIL + if feature == NIL: + pass + if is_abbr_feature(feature): + tag.abbr = value_ + elif is_adp_type_feature(feature): + tag.adp_type = value_ + elif is_adv_type_feature(feature): + tag.adv_type = value_ + elif is_animacy_feature(feature): + tag.animacy = value_ + elif is_aspect_feature(feature): + tag.aspect = value_ + elif is_case_feature(feature): + tag.case = value_ + elif is_conj_type_feature(feature): + tag.conj_type = value_ + elif is_connegative_feature(feature): + tag.connegative = value_ + elif is_definite_feature(feature): + tag.definite = value_ + elif is_degree_feature(feature): + tag.degree = value_ + elif is_derivation_feature(feature): + tag.derivation = value_ + elif is_echo_feature(feature): + tag.echo = value_ + elif is_foreign_feature(feature): + tag.foreign = value_ + elif is_gender_feature(feature): + tag.gender = value_ + elif is_hyph_feature(feature): + tag.hyph = value_ + elif is_inf_form_feature(feature): + tag.inf_form = value_ + elif is_mood_feature(feature): + tag.mood = value_ + elif is_negative_feature(feature): + tag.negative = value_ + elif is_number_feature(feature): + tag.number = value_ + elif is_name_type_feature(feature): + tag.name_type = value_ + elif is_num_form_feature(feature): + tag.num_form = value_ + elif is_num_value_feature(feature): + tag.num_value = value_ + elif is_part_form_feature(feature): + tag.part_form = value_ + elif is_part_type_feature(feature): + tag.part_type = value_ + elif is_person_feature(feature): + tag.person = value_ + elif is_polite_feature(feature): + tag.polite = value_ + elif is_polarity_feature(feature): + tag.polarity = value_ + elif is_poss_feature(feature): + tag.poss = value_ + elif is_prefix_feature(feature): + tag.prefix = value_ + elif is_prep_case_feature(feature): + tag.prep_case = value_ + elif is_pron_type_feature(feature): + tag.pron_type = value_ + elif is_punct_side_feature(feature): + tag.punct_type = value_ + elif is_reflex_feature(feature): + tag.reflex = value_ + elif is_style_feature(feature): + tag.style = value_ + elif is_style_variant_feature(feature): + tag.style_variant = value_ + elif is_tense_feature(feature): + tag.tense = value_ + elif is_verb_form_feature(feature): + tag.verb_form = value_ + elif is_voice_feature(feature): + tag.voice = value_ + elif is_verb_type_feature(feature): + tag.verb_type = value_ + else: + with gil: + raise ValueError("Unknown feature: %d" % feature) + 
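+# Predicates for the feature groups. set_feature above uses them to decide which
+# RichTagC field a given feature value belongs to; they are stubbed out here and
+# implemented as enum range checks in a later commit.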
+cdef int is_abbr_feature(univ_morph_t abbr) nogil: + return 0 + +cdef int is_adp_type_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_adv_type_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_animacy_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_aspect_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_case_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_conj_type_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_connegative_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_definite_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_degree_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_derivation_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_echo_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_foreign_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_gender_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_hyph_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_inf_form_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_mood_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_negative_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_number_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_name_type_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_num_form_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_num_type_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_num_value_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_part_form_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_part_type_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_person_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_polite_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_polarity_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_poss_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_prefix_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_prep_case_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_pron_type_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_punct_side_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_punct_type_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_reflex_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_style_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_style_variant_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_tense_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_verb_form_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_voice_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_verb_type_feature(univ_morph_t feature) nogil: + return 0 - def load_morph_exceptions(self, dict exc): - # Map (form, pos) to (lemma, rich tag) - for tag_str, entries in exc.items(): - for form_str, attrs in entries.items(): - self.add_special_case(tag_str, form_str, attrs) - def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology): - if orth not in self.strings: - return orth - cdef unicode py_string = self.strings[orth] - if self.lemmatizer is None: - return self.strings.add(py_string.lower()) - cdef list lemma_strings - cdef unicode lemma_string - lemma_strings = self.lemmatizer(py_string, univ_pos, morphology) - lemma_string = lemma_strings[0] - lemma = 
self.strings.add(lemma_string) - return lemma IDS = { From 3bba8e9245bc89494e5d0bf460397844e000d424 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Mon, 24 Sep 2018 23:58:08 +0200 Subject: [PATCH 005/207] Update structs --- spacy/structs.pxd | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/structs.pxd b/spacy/structs.pxd index cfcadc3d0..954ea19fe 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -2,6 +2,7 @@ from libc.stdint cimport uint8_t, uint32_t, int32_t, uint64_t from .typedefs cimport flags_t, attr_t, hash_t from .parts_of_speech cimport univ_pos_t +from .morphology cimport univ_morph_t cdef struct LexemeC: @@ -71,3 +72,6 @@ cdef struct TokenC: int ent_iob attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth.. hash_t ent_id + + + From a3d2e616d53b93eacf9124a50c618b1888eb3718 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Tue, 25 Sep 2018 00:35:59 +0200 Subject: [PATCH 006/207] Restore previous morphology stuff --- spacy/morphology.pxd | 9 ++++++--- spacy/morphology.pyx | 11 +++++------ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 2220cfcfc..aa2a4cb3c 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -1,5 +1,5 @@ from cymem.cymem cimport Pool -from preshed.maps cimport PreshMap +from preshed.maps cimport PreshMap, PreshMapArray from libc.stdint cimport uint64_t from murmurhash cimport mrmr @@ -17,14 +17,17 @@ cdef class Morphology: cdef public object lemmatizer cdef readonly object tag_map + cdef readonly object tag_names + cdef readonly object reverse_index + cdef readonly object exc + cdef readonly int n_tags cdef hash_t insert(self, RichTagC tag) except 0 cdef int assign_untagged(self, TokenC* token) except -1 cdef int assign_tag(self, TokenC* token, tag) except -1 cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1 - cdef update_token_morph(self, TokenC* token, features) - cdef set_token_morph(self, TokenC* token, pos, features) + cdef update_morph(self, hash_t morph, features) cdef enum univ_morph_t: NIL = 0 diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 3b74ecaae..2eb20776f 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -125,17 +125,17 @@ cdef class Morphology: # figure out why the statistical model fails. 
Related to Issue #220 if Lexeme.c_check_flag(token.lex, IS_SPACE): tag_id = self.reverse_index[self.strings.add('_SP')] + tag_str = self.tag_names[tag_id] + features = dict(self.tag_map.get(tag_str, {})) lemma = <attr_t>self._cache.get(tag_id, token.lex.orth) - if lemma == 0: - tag_str = self.tag_names[tag_id] - features = dict(self.tag_map.get(tag_str, {})) + if lemma == 0 and features: pos = self.strings.as_int(features.pop('POS')) lemma = self.lemmatize(pos, token.lex.orth, features) self._cache.set(tag_id, token.lex.orth, lemma) token.lemma = lemma token.pos = pos token.tag = self.strings[tag_str] - token.morph = self.add(attrs) + token.morph = self.add(features) cdef update_morph(self, hash_t morph, features): """Update a morphological analysis with new feature values.""" @@ -175,10 +175,9 @@ cpdef intify_features(StringStore strings, features): cdef hash_t hash_tag(RichTagC tag) nogil: return mrmr.hash64(&tag, sizeof(tag), 0) -cdef RichTagC create_rich_tag(pos_, features): +cdef RichTagC create_rich_tag(features): cdef RichTagC tag cdef univ_morph_t feature - tag.pos = get_int_tag(pos_) for feature in features: set_feature(&tag, feature, 1) return tag From be8cf39e16c0a518f568600171c8a7f1c314fab4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Tue, 25 Sep 2018 10:57:33 +0200 Subject: [PATCH 007/207] Fix morphology --- spacy/morphology.pxd | 1 + spacy/morphology.pyx | 68 +++++++++++++++++++++++++++++++++++++------- 2 files changed, 58 insertions(+), 11 deletions(-) diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index aa2a4cb3c..05bc8ccc0 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -20,6 +20,7 @@ cdef class Morphology: cdef readonly object tag_names cdef readonly object reverse_index cdef readonly object exc + cdef readonly PreshMapArray _cache cdef readonly int n_tags cdef hash_t insert(self, RichTagC tag) except 0 diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 2eb20776f..35571af49 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -58,15 +58,20 @@ cdef class Morphology: self.n_tags = len(tag_map) self.reverse_index = {} for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): + print(tag_str, attrs) self.tag_map[tag_str] = dict(attrs) - self.reverse_index[i] = self.strings.add(tag_str) + self.reverse_index[self.strings.add(tag_str)] = i self._cache = PreshMapArray(self.n_tags) self.exc = {} if exc is not None: for (tag_str, orth_str), attrs in exc.items(): self.add_special_case(tag_str, orth_str, attrs) - + + def __reduce__(self): + return (Morphology, (self.strings, self.tag_map, self.lemmatizer, + self.exc), None, None) + def add(self, features): """Insert a morphological analysis in the morphology table, if not already present. Returns the hash of the new analysis. @@ -88,6 +93,46 @@ cdef class Morphology: lemma_string = lemma_strings[0] lemma = self.strings.add(lemma_string) return lemma + + def add_special_case(self, unicode tag_str, unicode orth_str, attrs, + force=False): + """Add a special-case rule to the morphological analyser. Tokens whose + tag and orth match the rule will receive the specified properties. + + tag (unicode): The part-of-speech tag to key the exception. + orth (unicode): The word-form to key the exception. + """ + pass + ## TODO: Currently we've assumed that we know the number of tags -- + ## RichTagC is an array, and _cache is a PreshMapArray + ## This is really bad: it makes the morphology typed to the tagger + ## classes, which is all wrong. 
+ #self.exc[(tag_str, orth_str)] = dict(attrs) + #tag = self.strings.add(tag_str) + #if tag not in self.reverse_index: + # return + #tag_id = self.reverse_index[tag] + #orth = self.strings[orth_str] + #cdef RichTagC rich_tag = self.rich_tags[tag_id] + #attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) + #cached = <MorphAnalysisC*>self._cache.get(tag_id, orth) + #if cached is NULL: + # cached = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC)) + #elif force: + # memset(cached, 0, sizeof(cached[0])) + #else: + # raise ValueError(Errors.E015.format(tag=tag_str, orth=orth_str)) + + #cached.tag = rich_tag + ## TODO: Refactor this to take arbitrary attributes. + #for name_id, value_id in attrs.items(): + # if name_id == LEMMA: + # cached.lemma = value_id + # else: + # self.assign_feature(&cached.tag.morph, name_id, value_id) + #if cached.lemma == 0: + # cached.lemma = self.lemmatize(rich_tag.pos, orth, attrs) + #self._cache.set(tag_id, orth, <void*>cached) cdef hash_t insert(self, RichTagC tag) except 0: cdef hash_t key = hash_tag(tag) @@ -107,9 +152,8 @@ cdef class Morphology: lemma = self.lemmatizer.lookup(orth_str) token.lemma = self.strings.add(lemma) - cdef int assign_tag(self, TokenC* token, tag) except -1: - if isinstance(tag, basestring): - tag = self.strings.add(tag) + cdef int assign_tag(self, TokenC* token, tag_str) except -1: + cdef attr_t tag = self.strings.as_int(tag_str) if tag in self.reverse_index: tag_id = self.reverse_index[tag] self.assign_tag_id(token, tag_id) @@ -127,13 +171,15 @@ cdef class Morphology: tag_id = self.reverse_index[self.strings.add('_SP')] tag_str = self.tag_names[tag_id] features = dict(self.tag_map.get(tag_str, {})) - lemma = <attr_t>self._cache.get(tag_id, token.lex.orth) + cdef attr_t lemma = <attr_t>self._cache.get(tag_id, token.lex.orth) if lemma == 0 and features: - pos = self.strings.as_int(features.pop('POS')) + pos = self.strings.as_int(features.pop(POS)) lemma = self.lemmatize(pos, token.lex.orth, features) - self._cache.set(tag_id, token.lex.orth, lemma) + self._cache.set(tag_id, token.lex.orth, <void*>lemma) + else: + pos = 0 token.lemma = lemma - token.pos = pos + token.pos = <univ_pos_t>pos token.tag = self.strings[tag_str] token.morph = self.add(features) @@ -178,8 +224,8 @@ cdef hash_t hash_tag(RichTagC tag) nogil: cdef RichTagC create_rich_tag(features): cdef RichTagC tag cdef univ_morph_t feature - for feature in features: - set_feature(&tag, feature, 1) + #for feature in features: + # set_feature(&tag, feature, 1) return tag cdef tag_to_json(RichTagC tag): From e6dde97295022efe299bfa65a73c8d9b96eba8c4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Tue, 25 Sep 2018 10:57:59 +0200 Subject: [PATCH 008/207] Add function to make morphologizer model --- spacy/_ml.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 231f6a7a4..f37938671 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -483,7 +483,33 @@ class MultiSoftmax(Affine): return output__BO, finish_update -def build_tagger_model(class_nums, **cfg): +def build_tagger_model(nr_class, **cfg): + embed_size = util.env_opt('embed_size', 7000) + if 'token_vector_width' in cfg: + token_vector_width = cfg['token_vector_width'] + else: + token_vector_width = util.env_opt('token_vector_width', 128) + pretrained_vectors = cfg.get('pretrained_vectors') + subword_features = cfg.get('subword_features', True) + with Model.define_operators({'>>': chain, '+': add}): + if 
'tok2vec' in cfg: + tok2vec = cfg['tok2vec'] + else: + tok2vec = Tok2Vec(token_vector_width, embed_size, + subword_features=subword_features, + pretrained_vectors=pretrained_vectors) + softmax = with_flatten( + Softmax(nr_class, token_vector_width)) + model = ( + tok2vec + >> softmax + ) + model.nI = None + model.tok2vec = tok2vec + model.softmax = softmax + return model + +def build_morphologizer_model(class_nums, **cfg): embed_size = util.env_opt('embed_size', 7000) if 'token_vector_width' in cfg: token_vector_width = cfg['token_vector_width'] From c2357d3ba075f0733505ca69ac114705872bb07b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Tue, 25 Sep 2018 10:58:13 +0200 Subject: [PATCH 009/207] Fix morphologizer class --- spacy/_morphologizer.pyx | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/spacy/_morphologizer.pyx b/spacy/_morphologizer.pyx index ca857296e..2fa092faa 100644 --- a/spacy/_morphologizer.pyx +++ b/spacy/_morphologizer.pyx @@ -12,7 +12,7 @@ from thinc.api import chain from thinc.neural.util import to_categorical, copy_array from . import util from .pipe import Pipe -from ._ml import Tok2Vec, build_tagger_model +from ._ml import Tok2Vec, build_morphologizer_model from ._ml import link_vectors_to_models, zero_init, flatten from ._ml import create_default_optimizer from .errors import Errors, TempErrors @@ -20,6 +20,7 @@ from .compat import json_dumps, basestring_ from .tokens.doc cimport Doc from .vocab cimport Vocab from .morphology cimport Morphology +from .pipeline import Pipe class Morphologizer(Pipe): @@ -50,7 +51,7 @@ class Morphologizer(Pipe): def __call__(self, doc): features, tokvecs = self.predict([doc]) - self.set_annotations([doc], tags, tensors=tokvecs) + self.set_annotations([doc], features, tensors=tokvecs) return doc def pipe(self, stream, batch_size=128, n_threads=-1): @@ -81,7 +82,7 @@ class Morphologizer(Pipe): cdef Doc doc cdef Vocab vocab = self.vocab for i, doc in enumerate(docs): - doc_feat_ids = batch_feat_ids[i] + doc_feat_ids = batch_feature_ids[i] if hasattr(doc_feat_ids, 'get'): doc_feat_ids = doc_feat_ids.get() # Convert the neuron indices into feature IDs. 
@@ -129,3 +130,9 @@ class Morphologizer(Pipe): def use_params(self, params): with self.model.use_params(params): yield + +def scores_to_guesses(scores, out_sizes): + raise NotImplementedError + +def feature_to_column(feature): + raise NotImplementedError From 8308c1525e921c7655470fb1b2e255c37dca68b3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Tue, 25 Sep 2018 15:18:21 +0200 Subject: [PATCH 010/207] Fix exception loading --- spacy/lemmatizer.py | 8 ++--- spacy/morphology.pxd | 2 ++ spacy/morphology.pyx | 85 ++++++++++++++++++++++++-------------------- 3 files changed, 53 insertions(+), 42 deletions(-) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 93121a0c5..483debb67 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -60,13 +60,13 @@ class Lemmatizer(object): return True elif univ_pos == 'adj' and morphology.get('Degree') == 'pos': return True - elif VerbForm_inf in morphology: + elif VerbForm_inf in morphology or 'VerbForm_inf' in morphology: return True - elif VerbForm_none in morphology: + elif VerbForm_none in morphology or 'VerbForm_none' in morphology: return True - elif Number_sing in morphology: + elif Number_sing in morphology or 'Number_sing' in morphology: return True - elif Degree_pos in morphology: + elif Degree_pos in morphology or 'Degree_pos' in morphology: return True else: return False diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 05bc8ccc0..7ba84d40c 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -30,6 +30,8 @@ cdef class Morphology: cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1 cdef update_morph(self, hash_t morph, features) + cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1 + cdef enum univ_morph_t: NIL = 0 Animacy_anim = symbols.Animacy_anim diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 35571af49..f314a91a3 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -5,6 +5,7 @@ from __future__ import unicode_literals from libc.string cimport memset import ujson as json +from . 
import symbols from .attrs cimport POS, IS_SPACE from .attrs import LEMMA, intify_attrs from .parts_of_speech cimport SPACE @@ -17,6 +18,24 @@ from .errors import Errors def _normalize_props(props): """Transform deprecated string keys to correct names.""" out = {} + morph_keys = [ + 'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number', + 'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss', + 'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType', + 'Gender', 'Mood', 'Negative', 'Tense', 'Voice', 'Abbr', + 'Derivation', 'Echo', 'Foreign', 'NameType', 'NounType', 'NumForm', + 'NumValue', 'PartType', 'Polite', 'StyleVariant', + 'PronType', 'AdjType', 'Person', 'Variant', 'AdpType', + 'Reflex', 'Negative', 'Mood', 'Aspect', 'Case', + 'Polarity', 'PrepCase', 'Animacy' # U20 + ] + props = dict(props) + for key in morph_keys: + if key in props: + attr = '%s_%s' % (key, props[key]) + if attr in IDS: + props.pop(key) + props[attr] = True for key, value in props.items(): if key == POS: if hasattr(value, 'upper'): @@ -58,15 +77,16 @@ cdef class Morphology: self.n_tags = len(tag_map) self.reverse_index = {} for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): - print(tag_str, attrs) + attrs = _normalize_props(attrs) self.tag_map[tag_str] = dict(attrs) self.reverse_index[self.strings.add(tag_str)] = i self._cache = PreshMapArray(self.n_tags) self.exc = {} if exc is not None: - for (tag_str, orth_str), attrs in exc.items(): - self.add_special_case(tag_str, orth_str, attrs) + for (tag, orth), attrs in exc.items(): + self.add_special_case( + self.strings.as_string(tag), self.strings.as_string(orth), attrs) def __reduce__(self): return (Morphology, (self.strings, self.tag_map, self.lemmatizer, @@ -102,37 +122,10 @@ cdef class Morphology: tag (unicode): The part-of-speech tag to key the exception. orth (unicode): The word-form to key the exception. """ - pass - ## TODO: Currently we've assumed that we know the number of tags -- - ## RichTagC is an array, and _cache is a PreshMapArray - ## This is really bad: it makes the morphology typed to the tagger - ## classes, which is all wrong. - #self.exc[(tag_str, orth_str)] = dict(attrs) - #tag = self.strings.add(tag_str) - #if tag not in self.reverse_index: - # return - #tag_id = self.reverse_index[tag] - #orth = self.strings[orth_str] - #cdef RichTagC rich_tag = self.rich_tags[tag_id] - #attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) - #cached = <MorphAnalysisC*>self._cache.get(tag_id, orth) - #if cached is NULL: - # cached = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC)) - #elif force: - # memset(cached, 0, sizeof(cached[0])) - #else: - # raise ValueError(Errors.E015.format(tag=tag_str, orth=orth_str)) - - #cached.tag = rich_tag - ## TODO: Refactor this to take arbitrary attributes. 
- #for name_id, value_id in attrs.items(): - # if name_id == LEMMA: - # cached.lemma = value_id - # else: - # self.assign_feature(&cached.tag.morph, name_id, value_id) - #if cached.lemma == 0: - # cached.lemma = self.lemmatize(rich_tag.pos, orth, attrs) - #self._cache.set(tag_id, orth, <void*>cached) + attrs = dict(attrs) + attrs = _normalize_props(attrs) + attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) + self.exc[(tag_str, self.strings.add(orth_str))] = attrs cdef hash_t insert(self, RichTagC tag) except 0: cdef hash_t key = hash_tag(tag) @@ -171,17 +164,27 @@ cdef class Morphology: tag_id = self.reverse_index[self.strings.add('_SP')] tag_str = self.tag_names[tag_id] features = dict(self.tag_map.get(tag_str, {})) - cdef attr_t lemma = <attr_t>self._cache.get(tag_id, token.lex.orth) - if lemma == 0 and features: + if features: pos = self.strings.as_int(features.pop(POS)) - lemma = self.lemmatize(pos, token.lex.orth, features) - self._cache.set(tag_id, token.lex.orth, <void*>lemma) else: pos = 0 + cdef attr_t lemma = <attr_t>self._cache.get(tag_id, token.lex.orth) + if lemma == 0: + lemma = self.lemmatize(pos, token.lex.orth, features) + self._cache.set(tag_id, token.lex.orth, <void*>lemma) token.lemma = lemma token.pos = <univ_pos_t>pos token.tag = self.strings[tag_str] token.morph = self.add(features) + if (self.tag_names[tag_id], token.lex.orth) in self.exc: + self._assign_tag_from_exceptions(token, tag_id) + + cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1: + key = (self.tag_names[tag_id], token.lex.orth) + cdef dict attrs + attrs = self.exc[key] + token.pos = attrs.get(POS, token.pos) + token.lemma = attrs.get(LEMMA, token.lemma) cdef update_morph(self, hash_t morph, features): """Update a morphological analysis with new feature values.""" @@ -194,6 +197,12 @@ cdef class Morphology: morph = self.insert_tag(tag) return morph + def load_morph_exceptions(self, dict exc): + # Map (form, pos) to (lemma, rich tag) + for tag_str, entries in exc.items(): + for form_str, attrs in entries.items(): + self.add_special_case(tag_str, form_str, attrs) + def to_bytes(self): json_tags = [] for key in self.tags: From 6fe7c7256053aea202bfef155d4034149051fc1d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Tue, 25 Sep 2018 17:28:13 +0200 Subject: [PATCH 011/207] Reorder morphology enum, and add begin and end markers --- spacy/morphology.pxd | 190 +++++++++++++++++++++++++++++++------------ 1 file changed, 138 insertions(+), 52 deletions(-) diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 7ba84d40c..96bba5260 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -34,15 +34,41 @@ cdef class Morphology: cdef enum univ_morph_t: NIL = 0 + begin_Abbr + Abbr_yes # cz, fi, sl, U + end_Abbr + begin_AdpType + AdpType_circ # U + AdpType_comprep # cz + AdpType_prep # cz, U + AdpType_post # U + AdpType_voc # cz + end_AdpType + begin_AdvType + AdvType_adadj + AdvType_cau + AdvType_deg + AdvType_ex + AdvType_loc + AdvType_man + AdvType_mod + AdvType_sta + AdvType_tim + end_AdvType + begin_Animacy Animacy_anim = symbols.Animacy_anim - Animacy_inan Animacy_hum + Animacy_inan Animacy_nhum + end_Animacy + begin_Aspect Aspect_freq Aspect_imp Aspect_mod Aspect_none Aspect_perf + end_Aspect + begin_Case Case_abe Case_abl Case_abs @@ -70,23 +96,46 @@ cdef enum univ_morph_t: Case_ter Case_tra Case_voc - Definite_two - Definite_def - Definite_red + end_Case + begin_ConjType + ConjType_comp # cz, U + ConjType_oper # cz, U + 
end_ConjType + begin_Connegative + Connegative_yes # fi + end_Connegative + begin_Definite Definite_cons # U20 + Definite_def Definite_ind + Definite_red + Definite_two + end_Definite + begin_Degree + Degree_abs Degree_cmp Degree_comp Degree_none Degree_pos Degree_sup - Degree_abs Degree_com Degree_dim # du + end_Degree + begin_Gender Gender_com Gender_fem Gender_masc Gender_neut + Gender_dat_masc # bq, U + Gender_dat_fem # bq, U + Gender_erg_masc # bq + Gender_erg_fem # bq + Gender_psor_masc # cz, sl, U + Gender_psor_fem # cz, sl, U + Gender_psor_neut # sl + + end_Gender + begin_Mood Mood_cnd Mood_imp Mood_ind @@ -94,11 +143,17 @@ cdef enum univ_morph_t: Mood_pot Mood_sub Mood_opt + end_Mood + begin_Negative Negative_neg Negative_pos Negative_yes + end_Negative + begin_Polarity Polarity_neg # U20 Polarity_pos # U20 + end_Polarity + begin_Number Number_com Number_dual Number_none @@ -106,6 +161,19 @@ cdef enum univ_morph_t: Number_sing Number_ptan # bg Number_count # bg + Number_abs_sing # bq, U + Number_abs_plur # bq, U + Number_dat_sing # bq, U + Number_dat_plur # bq, U + Number_erg_sing # bq, U + Number_erg_plur # bq, U + Number_psee_sing # U + Number_psee_plur # U + Number_psor_sing # cz, fi, sl, U + Number_psor_plur # cz, fi, sl, U + + end_Number + begin_NumType NumType_card NumType_dist NumType_frac @@ -114,11 +182,29 @@ cdef enum univ_morph_t: NumType_none NumType_ord NumType_sets + end_NumType + begin_Person Person_one Person_two Person_three Person_none + Person_abs_one # bq, U + Person_abs_two # bq, U + Person_abs_three # bq, U + Person_dat_one # bq, U + Person_dat_two # bq, U + Person_dat_three # bq, U + Person_erg_one # bq, U + Person_erg_two # bq, U + Person_erg_three # bq, U + Person_psor_one # fi, U + Person_psor_two # fi, U + Person_psor_three # fi, U + end_Person + begin_Poss Poss_yes + end_Poss + begin_PronType PronType_advPart PronType_art PronType_default @@ -132,11 +218,17 @@ cdef enum univ_morph_t: PronType_tot PronType_clit PronType_exc # es, ca, it, fa + end_PronType + begin_Reflex Reflex_yes + end_Reflex + begin_Tense Tense_fut Tense_imp Tense_past Tense_pres + end_Tense + begin_VerbForm VerbForm_fin VerbForm_ger VerbForm_inf @@ -149,29 +241,15 @@ cdef enum univ_morph_t: VerbForm_trans VerbForm_conv # U20 VerbForm_gdv # la + end_VerbForm + begin_Voice Voice_act Voice_cau Voice_pass Voice_mid # gkc Voice_int # hb - Abbr_yes # cz, fi, sl, U - AdpType_prep # cz, U - AdpType_post # U - AdpType_voc # cz - AdpType_comprep # cz - AdpType_circ # U - AdvType_man - AdvType_loc - AdvType_tim - AdvType_deg - AdvType_cau - AdvType_mod - AdvType_sta - AdvType_ex - AdvType_adadj - ConjType_oper # cz, U - ConjType_comp # cz, U - Connegative_yes # fi + end_Voice + begin_Derivation Derivation_minen # fi Derivation_sti # fi Derivation_inen # fi @@ -181,23 +259,26 @@ cdef enum univ_morph_t: Derivation_vs # fi Derivation_ttain # fi Derivation_ttaa # fi + end_Derivation + begin_Echo Echo_rdp # U Echo_ech # U + end_Echo + begin_Foreign Foreign_foreign # cz, fi, U Foreign_fscript # cz, fi, U Foreign_tscript # cz, U Foreign_yes # sl - Gender_dat_masc # bq, U - Gender_dat_fem # bq, U - Gender_erg_masc # bq - Gender_erg_fem # bq - Gender_psor_masc # cz, sl, U - Gender_psor_fem # cz, sl, U - Gender_psor_neut # sl + end_Foreign + begin_Hyph Hyph_yes # cz, U + end_Hyph + begin_InfForm InfForm_one # fi InfForm_two # fi InfForm_three # fi + end_InfForm + begin_NameType NameType_geo # U, cz NameType_prs # U, cz NameType_giv # U, cz @@ -206,46 +287,36 @@ cdef enum univ_morph_t: NameType_com # U, cz 
NameType_pro # U, cz NameType_oth # U, cz + end_NameType + begin_NounType NounType_com # U NounType_prop # U NounType_class # U - Number_abs_sing # bq, U - Number_abs_plur # bq, U - Number_dat_sing # bq, U - Number_dat_plur # bq, U - Number_erg_sing # bq, U - Number_erg_plur # bq, U - Number_psee_sing # U - Number_psee_plur # U - Number_psor_sing # cz, fi, sl, U - Number_psor_plur # cz, fi, sl, U + end_NounType + begin_NumForm NumForm_digit # cz, sl, U NumForm_roman # cz, sl, U NumForm_word # cz, sl, U + end_NumForm + begin_NumValue NumValue_one # cz, U NumValue_two # cz, U NumValue_three # cz, U + end_NumValue + begin_PartForm PartForm_pres # fi PartForm_past # fi PartForm_agt # fi PartForm_neg # fi + end_PartForm + begin_PartType PartType_mod # U PartType_emp # U PartType_res # U PartType_inf # U PartType_vbp # U - Person_abs_one # bq, U - Person_abs_two # bq, U - Person_abs_three # bq, U - Person_dat_one # bq, U - Person_dat_two # bq, U - Person_dat_three # bq, U - Person_erg_one # bq, U - Person_erg_two # bq, U - Person_erg_three # bq, U - Person_psor_one # fi, U - Person_psor_two # fi, U - Person_psor_three # fi, U + end_PartType + begin_Polite Polite_inf # bq, U Polite_pol # bq, U Polite_abs_inf # bq, U @@ -254,11 +325,19 @@ cdef enum univ_morph_t: Polite_erg_pol # bq, U Polite_dat_inf # bq, U Polite_dat_pol # bq, U + end_Polite + begin_Prefix Prefix_yes # U + end_Prefix + begin_PrepCase PrepCase_npr # cz PrepCase_pre # U + end_PrepCase + begin_PunctSide PunctSide_ini # U PunctSide_fin # U + end_PunctSide + begin_PunctType PunctType_peri # U PunctType_qest # U PunctType_excl # U @@ -268,6 +347,8 @@ cdef enum univ_morph_t: PunctType_colo # U PunctType_semi # U PunctType_dash # U + end_PunctType + begin_Style Style_arch # cz, fi, U Style_rare # cz, fi, U Style_poet # cz, U @@ -279,12 +360,17 @@ cdef enum univ_morph_t: Style_derg # cz, U Style_vulg # cz, U Style_yes # fi, U + end_Style + begin_StyleVariant StyleVariant_styleShort # cz StyleVariant_styleBound # cz, sl + end_StyleVariant + begin_VerbType VerbType_aux # U VerbType_cop # U VerbType_mod # U VerbType_light # U + end_VerbType cdef struct RichTagC: univ_pos_t pos From 4b7e772f5dbf75973320662976b997298c5d243d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Tue, 25 Sep 2018 17:28:34 +0200 Subject: [PATCH 012/207] Implement the is_animacy_feature etc functions --- spacy/morphology.pyx | 98 ++++++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index f314a91a3..b37107f09 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -156,10 +156,12 @@ cdef class Morphology: cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1: if tag_id > self.n_tags: raise ValueError(Errors.E014.format(tag=tag_id)) - # TODO: It's pretty arbitrary to put this logic here. I guess the - # justification is that this is where the specific word and the tag - # interact. Still, we should have a better way to enforce this rule, or - # figure out why the statistical model fails. Related to Issue #220 + # Ensure spaces get tagged as space. + # It seems pretty arbitrary to put this logic here, but there's really + # nowhere better. I guess the justification is that this is where the + # specific word and the tag interact. Still, we should have a better + # way to enforce this rule, or figure out why the statistical model fails. 
+ # Related to Issue #220 if Lexeme.c_check_flag(token.lex, IS_SPACE): tag_id = self.reverse_index[self.strings.add('_SP')] tag_str = self.tag_names[tag_id] @@ -198,7 +200,7 @@ cdef class Morphology: return morph def load_morph_exceptions(self, dict exc): - # Map (form, pos) to (lemma, rich tag) + # Map (form, pos) to attributes for tag_str, entries in exc.items(): for form_str, attrs in entries.items(): self.add_special_case(tag_str, form_str, attrs) @@ -333,130 +335,128 @@ cdef int set_feature(RichTagC* tag, univ_morph_t feature, int value) nogil: with gil: raise ValueError("Unknown feature: %d" % feature) -cdef int is_abbr_feature(univ_morph_t abbr) nogil: - return 0 +cdef int is_abbr_feature(univ_morph_t feature) nogil: + return feature > begin_Abbr and feature < end_Abbr cdef int is_adp_type_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_AdpType and feature < end_AdpType cdef int is_adv_type_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_AdvType and feature < end_AdvType cdef int is_animacy_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_Animacy and feature < end_Animacy cdef int is_aspect_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_Aspect and feature < end_Aspect cdef int is_case_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_Case and feature < end_Case cdef int is_conj_type_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_ConjType and feature < end_ConjType cdef int is_connegative_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_Connegative and feature < end_Connegative cdef int is_definite_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_Definite and feature < end_Definite cdef int is_degree_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_Degree and feature < end_Degree cdef int is_derivation_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_Derivation and feature < end_Derivation cdef int is_echo_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_Echo and feature < end_Echo cdef int is_foreign_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_Foreign and feature < end_Foreign cdef int is_gender_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_Gender and feature < end_Gender cdef int is_hyph_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_Hyph and feature < begin_Hyph cdef int is_inf_form_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_InfForm and feature < end_InfForm cdef int is_mood_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_Mood and feature < end_Mood cdef int is_negative_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_Negative and feature < end_Negative cdef int is_number_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_Number and feature < end_Number cdef int is_name_type_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_NameType and feature < end_NameType cdef int is_num_form_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_NumForm and feature < end_NumForm cdef int is_num_type_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_NumType and feature < end_NumType cdef int is_num_value_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_NumValue 
and feature < end_NumValue cdef int is_part_form_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_PartForm and feature < end_PartForm cdef int is_part_type_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_PartType and feature < end_PartType cdef int is_person_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_Person and feature < end_Person cdef int is_polite_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_Polite and feature < end_Polite cdef int is_polarity_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_Polarity and feature < end_Polarity cdef int is_poss_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_Poss and feature < end_Poss cdef int is_prefix_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_Prefix and feature < end_Prefix cdef int is_prep_case_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_PrepCase and feature < end_PrepCase cdef int is_pron_type_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_PronType and feature < end_PronType cdef int is_punct_side_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_PunctSide and feature < end_PunctSide cdef int is_punct_type_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_PunctType and feature < end_PunctType cdef int is_reflex_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_Reflex and feature < end_Reflex cdef int is_style_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_Style and feature < end_Style cdef int is_style_variant_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_StyleVariant and feature < end_StyleVariant cdef int is_tense_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_Tense and feature < end_Tense cdef int is_verb_form_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_VerbForm and feature < end_VerbForm cdef int is_voice_feature(univ_morph_t feature) nogil: - return 0 + return feature > begin_Voice and feature < end_Voice cdef int is_verb_type_feature(univ_morph_t feature) nogil: - return 0 - - + return feature > begin_VerbType and feature < end_VerbType IDS = { From 9998d9b9ffd18fd4752dad7f77508502b7f2958d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Tue, 25 Sep 2018 20:38:08 +0200 Subject: [PATCH 013/207] Start testing morphology class --- spacy/tests/morphology/__init__.py | 0 spacy/tests/morphology/test_morph_features.py | 25 +++++++++++++++++++ 2 files changed, 25 insertions(+) create mode 100644 spacy/tests/morphology/__init__.py create mode 100644 spacy/tests/morphology/test_morph_features.py diff --git a/spacy/tests/morphology/__init__.py b/spacy/tests/morphology/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/morphology/test_morph_features.py b/spacy/tests/morphology/test_morph_features.py new file mode 100644 index 000000000..f0610b745 --- /dev/null +++ b/spacy/tests/morphology/test_morph_features.py @@ -0,0 +1,25 @@ +from __future__ import unicode_literals +import pytest + +from ...morphology import Morphology +from ...strings import StringStore +from ...lemmatizer import Lemmatizer +from ...symbols import * + +@pytest.fixture +def morphology(): + return Morphology(StringStore(), {}, Lemmatizer()) + +def test_init(morphology): + pass + +def 
test_add_tag_with_string_names(morphology): + morphology.add({"Case_gen", "Number_Sing"}) + +def test_add_tag_with_int_ids(morphology): + morphology.add({Case_gen, Number_sing}) + +def test_add_tag_with_mix_strings_and_ints(morphology): + morphology.add({PunctSide_ini, 'VerbType_aux'}) + + From 34cab8cc4956305fad928c681b7911c004e51e8f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Tue, 25 Sep 2018 20:53:24 +0200 Subject: [PATCH 014/207] Update morphology API --- spacy/morphology.pxd | 2 +- spacy/morphology.pyx | 30 ++++++++++++++++-------------- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 96bba5260..589f500c2 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -28,7 +28,7 @@ cdef class Morphology: cdef int assign_untagged(self, TokenC* token) except -1 cdef int assign_tag(self, TokenC* token, tag) except -1 cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1 - cdef update_morph(self, hash_t morph, features) + cpdef update_morph_key(self, hash_t morph, features) cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1 diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index b37107f09..cc8cb1b19 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -96,10 +96,23 @@ cdef class Morphology: """Insert a morphological analysis in the morphology table, if not already present. Returns the hash of the new analysis. """ - features = intify_features(self.strings, features) + features = intify_features(features) cdef RichTagC tag = create_rich_tag(features) cdef hash_t key = self.insert(tag) return key + + cpdef update_morph_key(self, hash_t morph, features): + """Update a morphological analysis with new feature values.""" + tag = (<RichTagC*>self.tags.get(morph))[0] + cdef univ_morph_t feature + cdef int value + for feature_, value in features.items(): + feature = self.strings.as_int(feature_) + set_feature(&tag, feature, 1) + morph = self.insert_tag(tag) + return morph + + def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology): if orth not in self.strings: @@ -188,17 +201,6 @@ cdef class Morphology: token.pos = attrs.get(POS, token.pos) token.lemma = attrs.get(LEMMA, token.lemma) - cdef update_morph(self, hash_t morph, features): - """Update a morphological analysis with new feature values.""" - tag = (<RichTagC*>self.tags.get(morph))[0] - cdef univ_morph_t feature - cdef int value - for feature_, value in features.items(): - feature = self.strings.as_int(feature_) - set_feature(&tag, feature, 1) - morph = self.insert_tag(tag) - return morph - def load_morph_exceptions(self, dict exc): # Map (form, pos) to attributes for tag_str, entries in exc.items(): @@ -226,8 +228,8 @@ cdef class Morphology: cpdef univ_pos_t get_int_tag(pos_): return <univ_pos_t>0 -cpdef intify_features(StringStore strings, features): - return {strings.as_int(feature) for feature in features} +cpdef intify_features(features): + return {IDS.get(feature, feature) for feature in features} cdef hash_t hash_tag(RichTagC tag) nogil: return mrmr.hash64(&tag, sizeof(tag), 0) From 51a297f93448df314612cf46d5a1946a6afb880b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Tue, 25 Sep 2018 21:07:08 +0200 Subject: [PATCH 015/207] Fix morphology add and update --- spacy/morphology.pxd | 2 +- spacy/morphology.pyx | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd 
index 589f500c2..bc8c44417 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -23,12 +23,12 @@ cdef class Morphology: cdef readonly PreshMapArray _cache cdef readonly int n_tags + cpdef update(self, hash_t morph, features) cdef hash_t insert(self, RichTagC tag) except 0 cdef int assign_untagged(self, TokenC* token) except -1 cdef int assign_tag(self, TokenC* token, tag) except -1 cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1 - cpdef update_morph_key(self, hash_t morph, features) cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1 diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index cc8cb1b19..6e45cab81 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -101,15 +101,14 @@ cdef class Morphology: cdef hash_t key = self.insert(tag) return key - cpdef update_morph_key(self, hash_t morph, features): + cpdef update(self, hash_t morph, features): """Update a morphological analysis with new feature values.""" tag = (<RichTagC*>self.tags.get(morph))[0] + features = intify_features(features) cdef univ_morph_t feature - cdef int value - for feature_, value in features.items(): - feature = self.strings.as_int(feature_) + for feature in features: set_feature(&tag, feature, 1) - morph = self.insert_tag(tag) + morph = self.insert(tag) return morph @@ -237,8 +236,9 @@ cdef hash_t hash_tag(RichTagC tag) nogil: cdef RichTagC create_rich_tag(features): cdef RichTagC tag cdef univ_morph_t feature - #for feature in features: - # set_feature(&tag, feature, 1) + memset(&tag, 0, sizeof(tag)) + for feature in features: + set_feature(&tag, feature, 1) return tag cdef tag_to_json(RichTagC tag): From d89a1a91ac9dfe8f9a4941ece7a76aab8453c591 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Tue, 25 Sep 2018 21:07:48 +0200 Subject: [PATCH 016/207] Update morphology tests --- spacy/tests/morphology/test_morph_features.py | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/spacy/tests/morphology/test_morph_features.py b/spacy/tests/morphology/test_morph_features.py index f0610b745..391fd1337 100644 --- a/spacy/tests/morphology/test_morph_features.py +++ b/spacy/tests/morphology/test_morph_features.py @@ -13,13 +13,29 @@ def morphology(): def test_init(morphology): pass -def test_add_tag_with_string_names(morphology): - morphology.add({"Case_gen", "Number_Sing"}) +def test_add_morphology_with_string_names(morphology): + morphology.add({"Case_gen", "Number_sing"}) -def test_add_tag_with_int_ids(morphology): +def test_add_morphology_with_int_ids(morphology): morphology.add({Case_gen, Number_sing}) -def test_add_tag_with_mix_strings_and_ints(morphology): +def test_add_morphology_with_mix_strings_and_ints(morphology): morphology.add({PunctSide_ini, 'VerbType_aux'}) +def test_morphology_tags_hash_distinctly(morphology): + tag1 = morphology.add({PunctSide_ini, 'VerbType_aux'}) + tag2 = morphology.add({"Case_gen", 'Number_sing'}) + assert tag1 != tag2 + +def test_morphology_tags_hash_independent_of_order(morphology): + tag1 = morphology.add({"Case_gen", 'Number_sing'}) + tag2 = morphology.add({"Number_sing", "Case_gen"}) + assert tag1 == tag2 + +def test_update_morphology_tag(morphology): + tag1 = morphology.add({"Case_gen"}) + tag2 = morphology.update(tag1, {"Number_sing"}) + assert tag1 != tag2 + tag3 = morphology.add({"Number_sing", "Case_gen"}) + assert tag2 == tag3 From 834dfb0e9da3d2aef4b0aa4af9464983ee625ca6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> 
Date: Tue, 25 Sep 2018 21:32:05 +0200 Subject: [PATCH 017/207] Add morph attribute to GoldParse --- spacy/gold.pxd | 1 + spacy/gold.pyx | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/spacy/gold.pxd b/spacy/gold.pxd index a1550b1ef..fdf6f5440 100644 --- a/spacy/gold.pxd +++ b/spacy/gold.pxd @@ -24,6 +24,7 @@ cdef class GoldParse: cdef public int loss cdef public list words cdef public list tags + cdef public list morph cdef public list heads cdef public list labels cdef public dict orths diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 20a319f5d..c9be6d6f1 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -399,7 +399,7 @@ cdef class GoldParse: return cls(doc, words=words, tags=tags, heads=heads, deps=deps, entities=entities, make_projective=make_projective) - def __init__(self, doc, annot_tuples=None, words=None, tags=None, + def __init__(self, doc, annot_tuples=None, words=None, tags=None, morph=None, heads=None, deps=None, entities=None, make_projective=False, cats=None, **_): """Create a GoldParse. @@ -436,6 +436,8 @@ cdef class GoldParse: deps = [None for _ in doc] if entities is None: entities = [None for _ in doc] + if morph is None: + morph = [None for _ in doc] elif len(entities) == 0: entities = ['O' for _ in doc] elif not isinstance(entities[0], basestring): @@ -460,6 +462,7 @@ cdef class GoldParse: self.heads = [None] * len(doc) self.labels = [None] * len(doc) self.ner = [None] * len(doc) + self.morph = [None] * len(doc) # This needs to be done before we align the words if make_projective and heads is not None and deps is not None: @@ -487,10 +490,12 @@ cdef class GoldParse: self.heads[i] = None self.labels[i] = None self.ner[i] = 'O' + self.morph[i] = set() if gold_i is None: if i in i2j_multi: self.words[i] = words[i2j_multi[i]] self.tags[i] = tags[i2j_multi[i]] + self.morph[i] = morph[i2j_multi[i]] is_last = i2j_multi[i] != i2j_multi.get(i+1) is_first = i2j_multi[i] != i2j_multi.get(i-1) # Set next word in multi-token span as head, until last From 2ba10493f719d442c7d56f07883b195bbc8217f8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Tue, 25 Sep 2018 21:32:24 +0200 Subject: [PATCH 018/207] Read morphology into gold standard in ud-train --- spacy/cli/ud_train.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/spacy/cli/ud_train.py b/spacy/cli/ud_train.py index 4c0b3c7eb..b7e283cfb 100644 --- a/spacy/cli/ud_train.py +++ b/spacy/cli/ud_train.py @@ -74,6 +74,7 @@ def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False, head = int(head)-1 if head != '0' else id_ sent['words'].append(word) sent['tags'].append(tag) + sent['morph'].append(_parse_morph_string(morph)) sent['heads'].append(head) sent['deps'].append('ROOT' if dep == 'root' else dep) sent['spaces'].append(space_after == '_') @@ -101,6 +102,16 @@ def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False, return docs, golds return docs, golds +def _parse_morph_string(morph_string): + if morph_string == '_': + return None + output = [] + replacements = {'1': 'one', '2': 'two', '3': 'three'} + for feature in morph_string.split('|'): + key, value = feature.split('=') + value = replacements.get(value, value) + output.append('%s_%s' % (key, value.lower())) + return set(output) def read_conllu(file_): docs = [] From fb0abddd9ebfed4017777e2483fd630c2f711752 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Tue, 25 Sep 2018 21:34:53 +0200 Subject: [PATCH 019/207] Call morph morphology 
in GoldParse --- spacy/cli/ud_train.py | 2 +- spacy/gold.pxd | 2 +- spacy/gold.pyx | 12 ++++++------ 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/spacy/cli/ud_train.py b/spacy/cli/ud_train.py index b7e283cfb..9a0b5e10c 100644 --- a/spacy/cli/ud_train.py +++ b/spacy/cli/ud_train.py @@ -74,7 +74,7 @@ def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False, head = int(head)-1 if head != '0' else id_ sent['words'].append(word) sent['tags'].append(tag) - sent['morph'].append(_parse_morph_string(morph)) + sent['morphology'].append(_parse_morph_string(morph)) sent['heads'].append(head) sent['deps'].append('ROOT' if dep == 'root' else dep) sent['spaces'].append(space_after == '_') diff --git a/spacy/gold.pxd b/spacy/gold.pxd index fdf6f5440..ce066f049 100644 --- a/spacy/gold.pxd +++ b/spacy/gold.pxd @@ -24,7 +24,7 @@ cdef class GoldParse: cdef public int loss cdef public list words cdef public list tags - cdef public list morph + cdef public list morphology cdef public list heads cdef public list labels cdef public dict orths diff --git a/spacy/gold.pyx b/spacy/gold.pyx index c9be6d6f1..65a3932be 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -399,7 +399,7 @@ cdef class GoldParse: return cls(doc, words=words, tags=tags, heads=heads, deps=deps, entities=entities, make_projective=make_projective) - def __init__(self, doc, annot_tuples=None, words=None, tags=None, morph=None, + def __init__(self, doc, annot_tuples=None, words=None, tags=None, morphology=None, heads=None, deps=None, entities=None, make_projective=False, cats=None, **_): """Create a GoldParse. @@ -436,8 +436,8 @@ cdef class GoldParse: deps = [None for _ in doc] if entities is None: entities = [None for _ in doc] - if morph is None: - morph = [None for _ in doc] + if morphology is None: + morphology = [None for _ in doc] elif len(entities) == 0: entities = ['O' for _ in doc] elif not isinstance(entities[0], basestring): @@ -462,7 +462,7 @@ cdef class GoldParse: self.heads = [None] * len(doc) self.labels = [None] * len(doc) self.ner = [None] * len(doc) - self.morph = [None] * len(doc) + self.morphology = [None] * len(doc) # This needs to be done before we align the words if make_projective and heads is not None and deps is not None: @@ -490,12 +490,12 @@ cdef class GoldParse: self.heads[i] = None self.labels[i] = None self.ner[i] = 'O' - self.morph[i] = set() + self.morphology[i] = set() if gold_i is None: if i in i2j_multi: self.words[i] = words[i2j_multi[i]] self.tags[i] = tags[i2j_multi[i]] - self.morph[i] = morph[i2j_multi[i]] + self.morphology[i] = morphology[i2j_multi[i]] is_last = i2j_multi[i] != i2j_multi.get(i+1) is_first = i2j_multi[i] != i2j_multi.get(i-1) # Set next word in multi-token span as head, until last From 53eb96db0908cb9884dec3cfcdc591ea2f134488 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Tue, 25 Sep 2018 22:12:32 +0200 Subject: [PATCH 020/207] Fix definition of morphology model --- spacy/_ml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index f37938671..813f5ab7f 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -524,8 +524,8 @@ def build_morphologizer_model(class_nums, **cfg): tok2vec = Tok2Vec(token_vector_width, embed_size, subword_features=subword_features, pretrained_vectors=pretrained_vectors) - softmax = with_flatten( - MultiSoftmax(class_nums, token_vector_width)) + softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width)) + softmax.out_sizes = class_nums model = ( 
tok2vec >> softmax From d0dc032842ed2e3f7ace3a4267fa6ad2e6d4e85d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Tue, 25 Sep 2018 22:12:54 +0200 Subject: [PATCH 021/207] Fill in missing morphologizer methods --- spacy/_morphologizer.pyx | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/spacy/_morphologizer.pyx b/spacy/_morphologizer.pyx index 2fa092faa..a3d3a301a 100644 --- a/spacy/_morphologizer.pyx +++ b/spacy/_morphologizer.pyx @@ -9,9 +9,9 @@ from .util import msgpack from .util import msgpack_numpy from thinc.api import chain -from thinc.neural.util import to_categorical, copy_array +from thinc.neural.util import to_categorical, copy_array, get_array_module from . import util -from .pipe import Pipe +from .pipeline import Pipe from ._ml import Tok2Vec, build_morphologizer_model from ._ml import link_vectors_to_models, zero_init, flatten from ._ml import create_default_optimizer @@ -20,6 +20,7 @@ from .compat import json_dumps, basestring_ from .tokens.doc cimport Doc from .vocab cimport Vocab from .morphology cimport Morphology +from .morphology import parse_feature from .pipeline import Pipe @@ -118,7 +119,7 @@ class Morphologizer(Pipe): target[idx] = guesses[idx] else: for feature in features: - column = feature_to_column(feature) # TODO + _, column = parse_feature(feature) target[idx, column] = 1 idx += 1 target = self.model.ops.xp.array(target, dtype='f') @@ -132,7 +133,10 @@ class Morphologizer(Pipe): yield def scores_to_guesses(scores, out_sizes): - raise NotImplementedError - -def feature_to_column(feature): - raise NotImplementedError + xp = get_array_module(scores) + guesses = xp.zeros((scores.shape[0], len(out_sizes)), dtype='i') + offset = 0 + for i, size in enumerate(out_sizes): + guesses[:, i] = scores[:, offset : offset + size].argmax(axis=1) + offset += size + return guesses From a4fc39788014c7929bc579b699d56853947b4945 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Tue, 25 Sep 2018 22:13:10 +0200 Subject: [PATCH 022/207] Add helper to parse features into field and column IDs --- spacy/morphology.pyx | 57 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 6e45cab81..3ba50123c 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -52,6 +52,16 @@ def _normalize_props(props): return out +def parse_feature(feature): + if not hasattr(feature, 'split'): + feature = NAMES[feature] + key, value = feature.split('_') + begin = 'begin_%s' % key + offset = IDS[feature] - IDS[begin] + field_id = FIELDS[key] + return (field_id, offset) + + cdef class Morphology: '''Store the possible morphological analyses for a language, and index them by hash. 
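The hunk above reduces a UD feature string to a (field, column) pair: the field id comes from the FIELDS table added in the next hunk, and the column is the feature's offset past its field's begin_* sentinel in the enum. A minimal stand-alone sketch of that arithmetic follows; all TOY_* and toy_* names are invented for illustration and the values are toy data, not spaCy's real enum.

# Toy tables only -- the real IDS/FIELDS/NAMES live in morphology.pyx.
TOY_IDS = {
    "begin_Case": 0, "Case_gen": 1, "Case_nom": 2, "end_Case": 3,
    "begin_Number": 4, "Number_sing": 5, "Number_plur": 6, "end_Number": 7,
}
TOY_FIELDS = {"Case": 0, "Number": 1}
TOY_NAMES = {value: key for key, value in TOY_IDS.items()}

def toy_parse_feature(feature):
    # Accept an integer ID or a "Field_value" string, mirroring parse_feature above.
    if not hasattr(feature, "split"):
        feature = TOY_NAMES[feature]
    key, _ = feature.split("_")
    # Column = distance from the field's begin_* sentinel.
    offset = TOY_IDS[feature] - TOY_IDS["begin_%s" % key]
    return TOY_FIELDS[key], offset

assert toy_parse_feature("Case_gen") == (0, 1)
assert toy_parse_feature(5) == (1, 1)  # 5 is Number_sing in the toy enum

The morphologizer's update() later in this series uses exactly such (field, column) pairs to set the gold feature's cell to 1.0 in its per-field softmax target.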
@@ -716,7 +726,52 @@ IDS = { } -NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])] +FIELDS = { + 'Abbr': 0, + 'AdpType': 1, + 'AdvType': 2, + 'Animacy': 3, + 'Aspect': 4, + 'Case': 5, + 'ConjType': 6, + 'Connegative': 7, + 'Definite': 8, + 'Degree': 9, + 'Derivation': 10, + 'Echo': 11, + 'Foreign': 12, + 'Gender': 13, + 'Hyph': 14, + 'InfForm': 15, + 'Mood': 16, + 'Negative': 17, + 'Number': 18, + 'NameType': 19, + 'NumForm': 20, + 'NumType': 21, + 'NumValue': 22, + 'PartForm': 23, + 'PartType': 24, + 'Person': 25, + 'Polite': 26, + 'Polarity': 27, + 'Poss': 28, + 'Prefix': 29, + 'PrepCase': 30, + 'PronType': 31, + 'PunctSide': 32, + 'PunctType': 33, + 'Reflex': 34, + 'Style': 35, + 'StyleVariant': 36, + 'Tense': 37, + 'VerbForm': 38, + 'Voice': 39, + 'VerbType': 40 +} + + +NAMES = {value: key for key, value in IDS.items()} # Unfortunate hack here, to work around problem with long cpdef enum # (which is generating an enormous amount of C++ in Cython 0.24+) # We keep the enum cdef, and just make sure the names are available to Python From 031b0d2a3ad5bd8b9a432acb857b4babbc153923 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Tue, 25 Sep 2018 22:13:22 +0200 Subject: [PATCH 023/207] Build morphologizer in setup.py --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index e22e28f47..4a587386b 100755 --- a/setup.py +++ b/setup.py @@ -26,6 +26,7 @@ MOD_NAMES = [ 'spacy.attrs', 'spacy.morphology', 'spacy.pipeline', + 'spacy._morphologizer', 'spacy.syntax.stateclass', 'spacy.syntax._state', 'spacy.tokenizer', From 2be15fa7d25709599ea0e4af26933a423529ece1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Tue, 25 Sep 2018 23:03:43 +0200 Subject: [PATCH 024/207] Fix Python feature enum in morphology --- spacy/morphology.pyx | 601 +++++++++++++++++++++++++------------------ 1 file changed, 346 insertions(+), 255 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 3ba50123c..870f05a87 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -62,6 +62,12 @@ def parse_feature(feature): return (field_id, offset) +def get_field_size(field): + begin = 'begin_%s' % field + end = 'end_%s' % field + return (IDS[end] - IDS[begin]) - 1 + + cdef class Morphology: '''Store the possible morphological analyses for a language, and index them by hash. 
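With begin_*/end_* sentinels bracketing every field, each field's size is recoverable by plain subtraction, which is what get_field_size above computes and what FIELD_SIZES (built from it further down in this patch) collects. A stand-alone sketch with made-up sentinel values, not the real enum; toy_ids and toy_field_size are illustrative names only.

# Toy sentinel layout -- the distance between the sentinels is the number
# of concrete values in the field plus one, hence the trailing "- 1".
toy_ids = {"begin_Tense": 10, "Tense_fut": 11, "Tense_imp": 12,
           "Tense_past": 13, "Tense_pres": 14, "end_Tense": 15}

def toy_field_size(field):
    return (toy_ids["end_%s" % field] - toy_ids["begin_%s" % field]) - 1

assert toy_field_size("Tense") == 4  # fut, imp, past, pres

Patch 026 below relies on this: when no attribute sizes are passed in, the Morphologizer builds its model from list(FIELD_SIZES), giving one softmax block per morphological field.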
@@ -471,261 +477,6 @@ cdef int is_verb_type_feature(univ_morph_t feature) nogil: return feature > begin_VerbType and feature < end_VerbType -IDS = { - "Animacy_anim": Animacy_anim, - "Animacy_inan": Animacy_inan, - "Animacy_hum": Animacy_hum, # U20 - "Animacy_nhum": Animacy_nhum, - "Aspect_freq": Aspect_freq, - "Aspect_imp": Aspect_imp, - "Aspect_mod": Aspect_mod, - "Aspect_none": Aspect_none, - "Aspect_perf": Aspect_perf, - "Case_abe": Case_abe, - "Case_abl": Case_abl, - "Case_abs": Case_abs, - "Case_acc": Case_acc, - "Case_ade": Case_ade, - "Case_all": Case_all, - "Case_cau": Case_cau, - "Case_com": Case_com, - "Case_dat": Case_dat, - "Case_del": Case_del, - "Case_dis": Case_dis, - "Case_ela": Case_ela, - "Case_ess": Case_ess, - "Case_gen": Case_gen, - "Case_ill": Case_ill, - "Case_ine": Case_ine, - "Case_ins": Case_ins, - "Case_loc": Case_loc, - "Case_lat": Case_lat, - "Case_nom": Case_nom, - "Case_par": Case_par, - "Case_sub": Case_sub, - "Case_sup": Case_sup, - "Case_tem": Case_tem, - "Case_ter": Case_ter, - "Case_tra": Case_tra, - "Case_voc": Case_voc, - "Definite_two": Definite_two, - "Definite_def": Definite_def, - "Definite_red": Definite_red, - "Definite_cons": Definite_cons, # U20 - "Definite_ind": Definite_ind, - "Degree_cmp": Degree_cmp, - "Degree_comp": Degree_comp, - "Degree_none": Degree_none, - "Degree_pos": Degree_pos, - "Degree_sup": Degree_sup, - "Degree_abs": Degree_abs, - "Degree_com": Degree_com, - "Degree_dim ": Degree_dim, # du - "Gender_com": Gender_com, - "Gender_fem": Gender_fem, - "Gender_masc": Gender_masc, - "Gender_neut": Gender_neut, - "Mood_cnd": Mood_cnd, - "Mood_imp": Mood_imp, - "Mood_ind": Mood_ind, - "Mood_n": Mood_n, - "Mood_pot": Mood_pot, - "Mood_sub": Mood_sub, - "Mood_opt": Mood_opt, - "Negative_neg": Negative_neg, - "Negative_pos": Negative_pos, - "Negative_yes": Negative_yes, - "Polarity_neg": Polarity_neg, # U20 - "Polarity_pos": Polarity_pos, # U20 - "Number_com": Number_com, - "Number_dual": Number_dual, - "Number_none": Number_none, - "Number_plur": Number_plur, - "Number_sing": Number_sing, - "Number_ptan ": Number_ptan, # bg - "Number_count ": Number_count, # bg - "NumType_card": NumType_card, - "NumType_dist": NumType_dist, - "NumType_frac": NumType_frac, - "NumType_gen": NumType_gen, - "NumType_mult": NumType_mult, - "NumType_none": NumType_none, - "NumType_ord": NumType_ord, - "NumType_sets": NumType_sets, - "Person_one": Person_one, - "Person_two": Person_two, - "Person_three": Person_three, - "Person_none": Person_none, - "Poss_yes": Poss_yes, - "PronType_advPart": PronType_advPart, - "PronType_art": PronType_art, - "PronType_default": PronType_default, - "PronType_dem": PronType_dem, - "PronType_ind": PronType_ind, - "PronType_int": PronType_int, - "PronType_neg": PronType_neg, - "PronType_prs": PronType_prs, - "PronType_rcp": PronType_rcp, - "PronType_rel": PronType_rel, - "PronType_tot": PronType_tot, - "PronType_clit": PronType_clit, - "PronType_exc ": PronType_exc, # es, ca, it, fa, - "Reflex_yes": Reflex_yes, - "Tense_fut": Tense_fut, - "Tense_imp": Tense_imp, - "Tense_past": Tense_past, - "Tense_pres": Tense_pres, - "VerbForm_fin": VerbForm_fin, - "VerbForm_ger": VerbForm_ger, - "VerbForm_inf": VerbForm_inf, - "VerbForm_none": VerbForm_none, - "VerbForm_part": VerbForm_part, - "VerbForm_partFut": VerbForm_partFut, - "VerbForm_partPast": VerbForm_partPast, - "VerbForm_partPres": VerbForm_partPres, - "VerbForm_sup": VerbForm_sup, - "VerbForm_trans": VerbForm_trans, - "VerbForm_conv": VerbForm_conv, # U20 - "VerbForm_gdv ": 
VerbForm_gdv, # la, - "Voice_act": Voice_act, - "Voice_cau": Voice_cau, - "Voice_pass": Voice_pass, - "Voice_mid ": Voice_mid, # gkc, - "Voice_int ": Voice_int, # hb, - "Abbr_yes ": Abbr_yes, # cz, fi, sl, U, - "AdpType_prep ": AdpType_prep, # cz, U, - "AdpType_post ": AdpType_post, # U, - "AdpType_voc ": AdpType_voc, # cz, - "AdpType_comprep ": AdpType_comprep, # cz, - "AdpType_circ ": AdpType_circ, # U, - "AdvType_man": AdvType_man, - "AdvType_loc": AdvType_loc, - "AdvType_tim": AdvType_tim, - "AdvType_deg": AdvType_deg, - "AdvType_cau": AdvType_cau, - "AdvType_mod": AdvType_mod, - "AdvType_sta": AdvType_sta, - "AdvType_ex": AdvType_ex, - "AdvType_adadj": AdvType_adadj, - "ConjType_oper ": ConjType_oper, # cz, U, - "ConjType_comp ": ConjType_comp, # cz, U, - "Connegative_yes ": Connegative_yes, # fi, - "Derivation_minen ": Derivation_minen, # fi, - "Derivation_sti ": Derivation_sti, # fi, - "Derivation_inen ": Derivation_inen, # fi, - "Derivation_lainen ": Derivation_lainen, # fi, - "Derivation_ja ": Derivation_ja, # fi, - "Derivation_ton ": Derivation_ton, # fi, - "Derivation_vs ": Derivation_vs, # fi, - "Derivation_ttain ": Derivation_ttain, # fi, - "Derivation_ttaa ": Derivation_ttaa, # fi, - "Echo_rdp ": Echo_rdp, # U, - "Echo_ech ": Echo_ech, # U, - "Foreign_foreign ": Foreign_foreign, # cz, fi, U, - "Foreign_fscript ": Foreign_fscript, # cz, fi, U, - "Foreign_tscript ": Foreign_tscript, # cz, U, - "Foreign_yes ": Foreign_yes, # sl, - "Gender_dat_masc ": Gender_dat_masc, # bq, U, - "Gender_dat_fem ": Gender_dat_fem, # bq, U, - "Gender_erg_masc ": Gender_erg_masc, # bq, - "Gender_erg_fem ": Gender_erg_fem, # bq, - "Gender_psor_masc ": Gender_psor_masc, # cz, sl, U, - "Gender_psor_fem ": Gender_psor_fem, # cz, sl, U, - "Gender_psor_neut ": Gender_psor_neut, # sl, - "Hyph_yes ": Hyph_yes, # cz, U, - "InfForm_one ": InfForm_one, # fi, - "InfForm_two ": InfForm_two, # fi, - "InfForm_three ": InfForm_three, # fi, - "NameType_geo ": NameType_geo, # U, cz, - "NameType_prs ": NameType_prs, # U, cz, - "NameType_giv ": NameType_giv, # U, cz, - "NameType_sur ": NameType_sur, # U, cz, - "NameType_nat ": NameType_nat, # U, cz, - "NameType_com ": NameType_com, # U, cz, - "NameType_pro ": NameType_pro, # U, cz, - "NameType_oth ": NameType_oth, # U, cz, - "NounType_com ": NounType_com, # U, - "NounType_prop ": NounType_prop, # U, - "NounType_class ": NounType_class, # U, - "Number_abs_sing ": Number_abs_sing, # bq, U, - "Number_abs_plur ": Number_abs_plur, # bq, U, - "Number_dat_sing ": Number_dat_sing, # bq, U, - "Number_dat_plur ": Number_dat_plur, # bq, U, - "Number_erg_sing ": Number_erg_sing, # bq, U, - "Number_erg_plur ": Number_erg_plur, # bq, U, - "Number_psee_sing ": Number_psee_sing, # U, - "Number_psee_plur ": Number_psee_plur, # U, - "Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U, - "Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U, - "NumForm_digit ": NumForm_digit, # cz, sl, U, - "NumForm_roman ": NumForm_roman, # cz, sl, U, - "NumForm_word ": NumForm_word, # cz, sl, U, - "NumValue_one ": NumValue_one, # cz, U, - "NumValue_two ": NumValue_two, # cz, U, - "NumValue_three ": NumValue_three, # cz, U, - "PartForm_pres ": PartForm_pres, # fi, - "PartForm_past ": PartForm_past, # fi, - "PartForm_agt ": PartForm_agt, # fi, - "PartForm_neg ": PartForm_neg, # fi, - "PartType_mod ": PartType_mod, # U, - "PartType_emp ": PartType_emp, # U, - "PartType_res ": PartType_res, # U, - "PartType_inf ": PartType_inf, # U, - "PartType_vbp ": PartType_vbp, # U, - "Person_abs_one ": 
Person_abs_one, # bq, U, - "Person_abs_two ": Person_abs_two, # bq, U, - "Person_abs_three ": Person_abs_three, # bq, U, - "Person_dat_one ": Person_dat_one, # bq, U, - "Person_dat_two ": Person_dat_two, # bq, U, - "Person_dat_three ": Person_dat_three, # bq, U, - "Person_erg_one ": Person_erg_one, # bq, U, - "Person_erg_two ": Person_erg_two, # bq, U, - "Person_erg_three ": Person_erg_three, # bq, U, - "Person_psor_one ": Person_psor_one, # fi, U, - "Person_psor_two ": Person_psor_two, # fi, U, - "Person_psor_three ": Person_psor_three, # fi, U, - "Polite_inf ": Polite_inf, # bq, U, - "Polite_pol ": Polite_pol, # bq, U, - "Polite_abs_inf ": Polite_abs_inf, # bq, U, - "Polite_abs_pol ": Polite_abs_pol, # bq, U, - "Polite_erg_inf ": Polite_erg_inf, # bq, U, - "Polite_erg_pol ": Polite_erg_pol, # bq, U, - "Polite_dat_inf ": Polite_dat_inf, # bq, U, - "Polite_dat_pol ": Polite_dat_pol, # bq, U, - "Prefix_yes ": Prefix_yes, # U, - "PrepCase_npr ": PrepCase_npr, # cz, - "PrepCase_pre ": PrepCase_pre, # U, - "PunctSide_ini ": PunctSide_ini, # U, - "PunctSide_fin ": PunctSide_fin, # U, - "PunctType_peri ": PunctType_peri, # U, - "PunctType_qest ": PunctType_qest, # U, - "PunctType_excl ": PunctType_excl, # U, - "PunctType_quot ": PunctType_quot, # U, - "PunctType_brck ": PunctType_brck, # U, - "PunctType_comm ": PunctType_comm, # U, - "PunctType_colo ": PunctType_colo, # U, - "PunctType_semi ": PunctType_semi, # U, - "PunctType_dash ": PunctType_dash, # U, - "Style_arch ": Style_arch, # cz, fi, U, - "Style_rare ": Style_rare, # cz, fi, U, - "Style_poet ": Style_poet, # cz, U, - "Style_norm ": Style_norm, # cz, U, - "Style_coll ": Style_coll, # cz, U, - "Style_vrnc ": Style_vrnc, # cz, U, - "Style_sing ": Style_sing, # cz, U, - "Style_expr ": Style_expr, # cz, U, - "Style_derg ": Style_derg, # cz, U, - "Style_vulg ": Style_vulg, # cz, U, - "Style_yes ": Style_yes, # fi, U, - "StyleVariant_styleShort ": StyleVariant_styleShort, # cz, - "StyleVariant_styleBound ": StyleVariant_styleBound, # cz, sl, - "VerbType_aux ": VerbType_aux, # U, - "VerbType_cop ": VerbType_cop, # U, - "VerbType_mod ": VerbType_mod, # U, - "VerbType_light ": VerbType_light, # U, -} - - FIELDS = { 'Abbr': 0, 'AdpType': 1, @@ -770,6 +521,346 @@ FIELDS = { 'VerbType': 40 } +IDS = { + "begin_Abbr": begin_Abbr, + "Abbr_yes ": Abbr_yes , + "end_Abbr": end_Abbr, + "begin_AdpType": begin_AdpType, + "AdpType_circ": AdpType_circ, + "AdpType_comprep": AdpType_comprep, + "AdpType_prep ": AdpType_prep , + "AdpType_post": AdpType_post, + "AdpType_voc": AdpType_voc, + "end_AdpType": end_AdpType, + "begin_AdvType": begin_AdvType, + "AdvType_adadj": AdvType_adadj, + "AdvType_cau": AdvType_cau, + "AdvType_deg": AdvType_deg, + "AdvType_ex": AdvType_ex, + "AdvType_loc": AdvType_loc, + "AdvType_man": AdvType_man, + "AdvType_mod": AdvType_mod, + "AdvType_sta": AdvType_sta, + "AdvType_tim": AdvType_tim, + "end_AdvType": end_AdvType, + "begin_Animacy": begin_Animacy, + "Animacy_anim": Animacy_anim, + "Animacy_hum": Animacy_hum, + "Animacy_inan": Animacy_inan, + "Animacy_nhum": Animacy_nhum, + "end_Animacy": end_Animacy, + "begin_Aspect": begin_Aspect, + "Aspect_freq": Aspect_freq, + "Aspect_imp": Aspect_imp, + "Aspect_mod": Aspect_mod, + "Aspect_none": Aspect_none, + "Aspect_perf": Aspect_perf, + "end_Aspect": end_Aspect, + "begin_Case": begin_Case, + "Case_abe": Case_abe, + "Case_abl": Case_abl, + "Case_abs": Case_abs, + "Case_acc": Case_acc, + "Case_ade": Case_ade, + "Case_all": Case_all, + "Case_cau": Case_cau, + "Case_com": Case_com, + 
"Case_dat": Case_dat, + "Case_del": Case_del, + "Case_dis": Case_dis, + "Case_ela": Case_ela, + "Case_ess": Case_ess, + "Case_gen": Case_gen, + "Case_ill": Case_ill, + "Case_ine": Case_ine, + "Case_ins": Case_ins, + "Case_loc": Case_loc, + "Case_lat": Case_lat, + "Case_nom": Case_nom, + "Case_par": Case_par, + "Case_sub": Case_sub, + "Case_sup": Case_sup, + "Case_tem": Case_tem, + "Case_ter": Case_ter, + "Case_tra": Case_tra, + "Case_voc": Case_voc, + "end_Case": end_Case, + "begin_ConjType": begin_ConjType, + "ConjType_comp ": ConjType_comp , + "ConjType_oper": ConjType_oper, + "end_ConjType": end_ConjType, + "begin_Connegative": begin_Connegative, + "Connegative_yes": Connegative_yes, + "end_Connegative": end_Connegative, + "begin_Definite": begin_Definite, + "Definite_cons": Definite_cons, + "Definite_def": Definite_def, + "Definite_ind": Definite_ind, + "Definite_red": Definite_red, + "Definite_two": Definite_two, + "end_Definite": end_Definite, + "begin_Degree": begin_Degree, + "Degree_abs": Degree_abs, + "Degree_cmp": Degree_cmp, + "Degree_comp": Degree_comp, + "Degree_none": Degree_none, + "Degree_pos": Degree_pos, + "Degree_sup": Degree_sup, + "Degree_com": Degree_com, + "Degree_dim": Degree_dim, + "end_Degree": end_Degree, + "begin_Gender": begin_Gender, + "Gender_com": Gender_com, + "Gender_fem": Gender_fem, + "Gender_masc": Gender_masc, + "Gender_neut": Gender_neut, + "Gender_dat_masc": Gender_dat_masc, + "Gender_dat_fem": Gender_dat_fem, + "Gender_erg_masc": Gender_erg_masc, + "Gender_erg_fem": Gender_erg_fem, + "Gender_psor_masc": Gender_psor_masc, + "Gender_psor_fem": Gender_psor_fem, + "Gender_psor_neut": Gender_psor_neut, + "end_Gender": end_Gender, + "begin_Mood": begin_Mood, + "Mood_cnd": Mood_cnd, + "Mood_imp": Mood_imp, + "Mood_ind": Mood_ind, + "Mood_n": Mood_n, + "Mood_pot": Mood_pot, + "Mood_sub": Mood_sub, + "Mood_opt": Mood_opt, + "end_Mood": end_Mood, + "begin_Negative": begin_Negative, + "Negative_neg": Negative_neg, + "Negative_pos": Negative_pos, + "Negative_yes": Negative_yes, + "end_Negative": end_Negative, + "begin_Polarity": begin_Polarity, + "Polarity_neg": Polarity_neg, + "Polarity_pos": Polarity_pos, + "end_Polarity": end_Polarity, + "begin_Number": begin_Number, + "Number_com": Number_com, + "Number_dual": Number_dual, + "Number_none": Number_none, + "Number_plur": Number_plur, + "Number_sing": Number_sing, + "Number_ptan": Number_ptan, + "Number_count": Number_count, + "Number_abs_sing": Number_abs_sing, + "Number_abs_plur": Number_abs_plur, + "Number_dat_sing": Number_dat_sing, + "Number_dat_plur": Number_dat_plur, + "Number_erg_sing": Number_erg_sing, + "Number_erg_plur": Number_erg_plur, + "Number_psee_sing": Number_psee_sing, + "Number_psee_plur": Number_psee_plur, + "Number_psor_sing": Number_psor_sing, + "Number_psor_plur": Number_psor_plur, + "end_Number": end_Number, + "begin_NumType": begin_NumType, + "NumType_card": NumType_card, + "NumType_dist": NumType_dist, + "NumType_frac": NumType_frac, + "NumType_gen": NumType_gen, + "NumType_mult": NumType_mult, + "NumType_none": NumType_none, + "NumType_ord": NumType_ord, + "NumType_sets": NumType_sets, + "end_NumType": end_NumType, + "begin_Person": begin_Person, + "Person_one": Person_one, + "Person_two": Person_two, + "Person_three": Person_three, + "Person_none": Person_none, + "Person_abs_one": Person_abs_one, + "Person_abs_two": Person_abs_two, + "Person_abs_three": Person_abs_three, + "Person_dat_one": Person_dat_one, + "Person_dat_two": Person_dat_two, + "Person_dat_three": Person_dat_three, 
+ "Person_erg_one": Person_erg_one, + "Person_erg_two": Person_erg_two, + "Person_erg_three": Person_erg_three, + "Person_psor_one": Person_psor_one, + "Person_psor_two": Person_psor_two, + "Person_psor_three": Person_psor_three, + "end_Person": end_Person, + "begin_Poss": begin_Poss, + "Poss_yes": Poss_yes, + "end_Poss": end_Poss, + "begin_PronType": begin_PronType, + "PronType_advPart": PronType_advPart, + "PronType_art": PronType_art, + "PronType_default": PronType_default, + "PronType_dem": PronType_dem, + "PronType_ind": PronType_ind, + "PronType_int": PronType_int, + "PronType_neg": PronType_neg, + "PronType_prs": PronType_prs, + "PronType_rcp": PronType_rcp, + "PronType_rel": PronType_rel, + "PronType_tot": PronType_tot, + "PronType_clit": PronType_clit, + "PronType_exc": PronType_exc, + "end_PronType": end_PronType, + "begin_Reflex": begin_Reflex, + "Reflex_yes": Reflex_yes, + "end_Reflex": end_Reflex, + "begin_Tense": begin_Tense, + "Tense_fut": Tense_fut, + "Tense_imp": Tense_imp, + "Tense_past": Tense_past, + "Tense_pres": Tense_pres, + "end_Tense": end_Tense, + "begin_VerbForm": begin_VerbForm, + "VerbForm_fin": VerbForm_fin, + "VerbForm_ger": VerbForm_ger, + "VerbForm_inf": VerbForm_inf, + "VerbForm_none": VerbForm_none, + "VerbForm_part": VerbForm_part, + "VerbForm_partFut": VerbForm_partFut, + "VerbForm_partPast": VerbForm_partPast, + "VerbForm_partPres": VerbForm_partPres, + "VerbForm_sup": VerbForm_sup, + "VerbForm_trans": VerbForm_trans, + "VerbForm_conv": VerbForm_conv, + "VerbForm_gdv": VerbForm_gdv, + "end_VerbForm": end_VerbForm, + "begin_Voice": begin_Voice, + "Voice_act": Voice_act, + "Voice_cau": Voice_cau, + "Voice_pass": Voice_pass, + "Voice_mid": Voice_mid, + "Voice_int": Voice_int, + "end_Voice": end_Voice, + "begin_Derivation": begin_Derivation, + "Derivation_minen": Derivation_minen, + "Derivation_sti": Derivation_sti, + "Derivation_inen": Derivation_inen, + "Derivation_lainen": Derivation_lainen, + "Derivation_ja": Derivation_ja, + "Derivation_ton": Derivation_ton, + "Derivation_vs": Derivation_vs, + "Derivation_ttain": Derivation_ttain, + "Derivation_ttaa": Derivation_ttaa, + "end_Derivation": end_Derivation, + "begin_Echo": begin_Echo, + "Echo_rdp": Echo_rdp, + "Echo_ech": Echo_ech, + "end_Echo": end_Echo, + "begin_Foreign": begin_Foreign, + "Foreign_foreign": Foreign_foreign, + "Foreign_fscript": Foreign_fscript, + "Foreign_tscript": Foreign_tscript, + "Foreign_yes": Foreign_yes, + "end_Foreign": end_Foreign, + "begin_Hyph": begin_Hyph, + "Hyph_yes": Hyph_yes, + "end_Hyph": end_Hyph, + "begin_InfForm": begin_InfForm, + "InfForm_one": InfForm_one, + "InfForm_two": InfForm_two, + "InfForm_three": InfForm_three, + "end_InfForm": end_InfForm, + "begin_NameType": begin_NameType, + "NameType_geo": NameType_geo, + "NameType_prs": NameType_prs, + "NameType_giv": NameType_giv, + "NameType_sur": NameType_sur, + "NameType_nat": NameType_nat, + "NameType_com": NameType_com, + "NameType_pro": NameType_pro, + "NameType_oth": NameType_oth, + "end_NameType": end_NameType, + "begin_NounType": begin_NounType, + "NounType_com": NounType_com, + "NounType_prop": NounType_prop, + "NounType_class": NounType_class, + "end_NounType": end_NounType, + "begin_NumForm": begin_NumForm, + "NumForm_digit": NumForm_digit, + "NumForm_roman": NumForm_roman, + "NumForm_word": NumForm_word, + "end_NumForm": end_NumForm, + "begin_NumValue": begin_NumValue, + "NumValue_one": NumValue_one, + "NumValue_two": NumValue_two, + "NumValue_three": NumValue_three, + "end_NumValue": end_NumValue, + 
"begin_PartForm": begin_PartForm, + "PartForm_pres": PartForm_pres, + "PartForm_past": PartForm_past, + "PartForm_agt": PartForm_agt, + "PartForm_neg": PartForm_neg, + "end_PartForm": end_PartForm, + "begin_PartType": begin_PartType, + "PartType_mod": PartType_mod, + "PartType_emp": PartType_emp, + "PartType_res": PartType_res, + "PartType_inf": PartType_inf, + "PartType_vbp": PartType_vbp, + "end_PartType": end_PartType, + "begin_Polite": begin_Polite, + "Polite_inf": Polite_inf, + "Polite_pol": Polite_pol, + "Polite_abs_inf": Polite_abs_inf, + "Polite_abs_pol": Polite_abs_pol, + "Polite_erg_inf": Polite_erg_inf, + "Polite_erg_pol": Polite_erg_pol, + "Polite_dat_inf": Polite_dat_inf, + "Polite_dat_pol": Polite_dat_pol, + "end_Polite": end_Polite, + "begin_Prefix": begin_Prefix, + "Prefix_yes": Prefix_yes, + "end_Prefix": end_Prefix, + "begin_PrepCase": begin_PrepCase, + "PrepCase_npr": PrepCase_npr, + "PrepCase_pre": PrepCase_pre, + "end_PrepCase": end_PrepCase, + "begin_PunctSide": begin_PunctSide, + "PunctSide_ini": PunctSide_ini, + "PunctSide_fin": PunctSide_fin, + "end_PunctSide": end_PunctSide, + "begin_PunctType": begin_PunctType, + "PunctType_peri": PunctType_peri, + "PunctType_qest": PunctType_qest, + "PunctType_excl": PunctType_excl, + "PunctType_quot": PunctType_quot, + "PunctType_brck": PunctType_brck, + "PunctType_comm": PunctType_comm, + "PunctType_colo": PunctType_colo, + "PunctType_semi": PunctType_semi, + "PunctType_dash": PunctType_dash, + "end_PunctType": end_PunctType, + "begin_Style": begin_Style, + "Style_arch": Style_arch, + "Style_rare": Style_rare, + "Style_poet": Style_poet, + "Style_norm": Style_norm, + "Style_coll": Style_coll, + "Style_vrnc": Style_vrnc, + "Style_sing": Style_sing, + "Style_expr": Style_expr, + "Style_derg": Style_derg, + "Style_vulg": Style_vulg, + "Style_yes": Style_yes, + "end_Style": end_Style, + "begin_StyleVariant": begin_StyleVariant, + "StyleVariant_styleShort": StyleVariant_styleShort, + "StyleVariant_styleBound": StyleVariant_styleBound, + "end_StyleVariant": end_StyleVariant, + "begin_VerbType": begin_VerbType, + "VerbType_aux": VerbType_aux, + "VerbType_cop": VerbType_cop, + "VerbType_mod": VerbType_mod, + "VerbType_light": VerbType_light, + "end_VerbType": end_VerbType, +} + + +FIELD_SIZES = [get_field_size(field) for field in FIELDS] NAMES = {value: key for key, value in IDS.items()} # Unfortunate hack here, to work around problem with long cpdef enum From 3b6b018904b9ef8dbabd336027df7e2a9fc7424b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Wed, 26 Sep 2018 21:01:48 +0200 Subject: [PATCH 025/207] Fix loading of gold morphology --- spacy/gold.pyx | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 65a3932be..77c5944ca 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -429,17 +429,17 @@ cdef class GoldParse: if words is None: words = [token.text for token in doc] if tags is None: - tags = [None for _ in doc] + tags = [None for _ in words] if heads is None: - heads = [None for token in doc] + heads = [None for token in words] if deps is None: - deps = [None for _ in doc] + deps = [None for _ in words] if entities is None: - entities = [None for _ in doc] + entities = [None for _ in words] if morphology is None: - morphology = [None for _ in doc] + morphology = [None for _ in words] elif len(entities) == 0: - entities = ['O' for _ in doc] + entities = ['O' for _ in words] elif not isinstance(entities[0], basestring): # Assume we have 
entities specified by character offset. entities = biluo_tags_from_offsets(doc, entities) @@ -532,6 +532,7 @@ cdef class GoldParse: else: self.words[i] = words[gold_i] self.tags[i] = tags[gold_i] + self.morphology[i] = morphology[gold_i] if heads[gold_i] is None: self.heads[i] = None else: From 1f9f834dc08bdb2d1bcd2eb63eb953336f1b4b78 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Wed, 26 Sep 2018 21:02:13 +0200 Subject: [PATCH 026/207] Fix morphologizer --- spacy/_morphologizer.pyx | 65 ++++++++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 23 deletions(-) diff --git a/spacy/_morphologizer.pyx b/spacy/_morphologizer.pyx index a3d3a301a..10baec8f5 100644 --- a/spacy/_morphologizer.pyx +++ b/spacy/_morphologizer.pyx @@ -20,7 +20,7 @@ from .compat import json_dumps, basestring_ from .tokens.doc cimport Doc from .vocab cimport Vocab from .morphology cimport Morphology -from .morphology import parse_feature +from .morphology import parse_feature, IDS, FIELDS, FIELD_SIZES, NAMES from .pipeline import Pipe @@ -28,9 +28,11 @@ class Morphologizer(Pipe): name = 'morphologizer' @classmethod - def Model(cls, attr_nums, **cfg): + def Model(cls, attr_nums=None, **cfg): if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'): raise ValueError(TempErrors.T008) + if attr_nums is None: + attr_nums = list(FIELD_SIZES) return build_morphologizer_model(attr_nums, **cfg) def __init__(self, vocab, model=True, **cfg): @@ -71,29 +73,34 @@ class Morphologizer(Pipe): return guesses, tokvecs tokvecs = self.model.tok2vec(docs) scores = self.model.softmax(tokvecs) - guesses = [] - # Resolve multisoftmax into guesses - for doc_scores in scores: - guesses.append(scores_to_guesses(doc_scores, self.model.softmax.out_sizes)) - return guesses, tokvecs + return scores, tokvecs - def set_annotations(self, docs, batch_feature_ids, tensors=None): + def set_annotations(self, docs, batch_scores, tensors=None): if isinstance(docs, Doc): docs = [docs] cdef Doc doc cdef Vocab vocab = self.vocab + field_names = list(FIELDS) + offsets = [IDS['begin_%s' % field] for field in field_names] for i, doc in enumerate(docs): - doc_feat_ids = batch_feature_ids[i] - if hasattr(doc_feat_ids, 'get'): - doc_feat_ids = doc_feat_ids.get() + doc_scores = batch_scores[i] + doc_guesses = scores_to_guesses(doc_scores, self.model.softmax.out_sizes) # Convert the neuron indices into feature IDs. - offset = self.vocab.morphology.first_feature - for j, nr_feat in enumerate(self.model.softmax.out_sizes): - doc_feat_ids[:, j] += offset - offset += nr_feat - # Now add the analysis, and set the hash. - for j in range(doc_feat_ids.shape[0]): - doc.c[j].morph = self.vocab.morphology.add(doc_feat_ids[j]) + doc_feat_ids = self.model.ops.allocate((len(doc), len(field_names)), dtype='i') + for j in range(len(doc)): + for k, offset in enumerate(offsets): + if doc_guesses[j, k] == 0: + doc_feat_ids[j, k] = 0 + else: + doc_feat_ids[j, k] = offset + doc_guesses[j, k] + # Now add the analysis, and set the hash. 
+ try: + doc.c[j].morph = self.vocab.morphology.add(doc_feat_ids[j]) + except: + print(offsets) + print(doc_guesses[j]) + print(doc_feat_ids[j]) + raise def update(self, docs, golds, drop=0., sgd=None, losses=None): if losses is not None and self.name not in losses: @@ -110,17 +117,27 @@ class Morphologizer(Pipe): guesses = [] for doc_scores in scores: guesses.append(scores_to_guesses(doc_scores, self.model.softmax.out_sizes)) - guesses = self.model.ops.flatten(guesses) + guesses = self.model.ops.xp.vstack(guesses) + scores = self.model.ops.xp.vstack(scores) cdef int idx = 0 target = numpy.zeros(scores.shape, dtype='f') + field_sizes = self.model.softmax.out_sizes for gold in golds: for features in gold.morphology: if features is None: - target[idx] = guesses[idx] + target[idx] = scores[idx] else: + by_field = {} for feature in features: - _, column = parse_feature(feature) - target[idx, column] = 1 + field, column = parse_feature(feature) + by_field[field] = column + col_offset = 0 + for field, field_size in enumerate(field_sizes): + if field in by_field: + target[idx, col_offset + by_field[field]] = 1. + else: + target[idx, col_offset] = 1. + col_offset += field_size idx += 1 target = self.model.ops.xp.array(target, dtype='f') d_scores = scores - target @@ -137,6 +154,8 @@ def scores_to_guesses(scores, out_sizes): guesses = xp.zeros((scores.shape[0], len(out_sizes)), dtype='i') offset = 0 for i, size in enumerate(out_sizes): - guesses[:, i] = scores[:, offset : offset + size].argmax(axis=1) + slice_ = scores[:, offset : offset + size] + col_guesses = slice_.argmax(axis=1) + guesses[:, i] = col_guesses offset += size return guesses From f03640b41ff94f42c38b21d1247e29a53a0dbb1a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Wed, 26 Sep 2018 21:02:42 +0200 Subject: [PATCH 027/207] Fix morphology task in ud-train --- spacy/cli/ud_train.py | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/spacy/cli/ud_train.py b/spacy/cli/ud_train.py index 9a0b5e10c..c310c6616 100644 --- a/spacy/cli/ud_train.py +++ b/spacy/cli/ud_train.py @@ -84,10 +84,12 @@ def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False, if oracle_segments: docs.append(Doc(nlp.vocab, words=sent['words'], spaces=sent['spaces'])) golds.append(GoldParse(docs[-1], **sent)) + assert golds[-1].morphology is not None sent_annots.append(sent) if raw_text and max_doc_length and len(sent_annots) >= max_doc_length: doc, gold = _make_gold(nlp, None, sent_annots) + assert gold.morphology is not None sent_annots = [] docs.append(doc) golds.append(gold) @@ -104,12 +106,13 @@ def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False, def _parse_morph_string(morph_string): if morph_string == '_': - return None + return set() output = [] replacements = {'1': 'one', '2': 'two', '3': 'three'} for feature in morph_string.split('|'): key, value = feature.split('=') value = replacements.get(value, value) + value = value.split(',')[0] output.append('%s_%s' % (key, value.lower())) return set(output) @@ -146,7 +149,7 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0): sent_starts = [] for sent in sent_annots: flat['heads'].extend(len(flat['words'])+head for head in sent['heads']) - for field in ['words', 'tags', 'deps', 'entities', 'spaces']: + for field in ['words', 'tags', 'deps', 'morphology', 'entities', 'spaces']: flat[field].extend(sent[field]) sent_starts.append(True) sent_starts.extend([False] * 
(len(sent['words'])-1)) @@ -238,22 +241,26 @@ def write_conllu(docs, file_): def print_progress(itn, losses, ud_scores): fields = { 'dep_loss': losses.get('parser', 0.0), + 'morph_loss': losses.get('morphologizer', 0.0), 'tag_loss': losses.get('tagger', 0.0), 'words': ud_scores['Words'].f1 * 100, 'sents': ud_scores['Sentences'].f1 * 100, 'tags': ud_scores['XPOS'].f1 * 100, 'uas': ud_scores['UAS'].f1 * 100, 'las': ud_scores['LAS'].f1 * 100, + 'morph': ud_scores['Feats'].f1 * 100, } - header = ['Epoch', 'Loss', 'LAS', 'UAS', 'TAG', 'SENT', 'WORD'] + header = ['Epoch', 'P.Loss', 'M.Loss', 'LAS', 'UAS', 'TAG', 'MORPH', 'SENT', 'WORD'] if itn == 0: print('\t'.join(header)) tpl = '\t'.join(( '{:d}', '{dep_loss:.1f}', + '{morph_loss:.1f}', '{las:.1f}', '{uas:.1f}', '{tags:.1f}', + '{morph:.1f}', '{sents:.1f}', '{words:.1f}', )) @@ -275,7 +282,19 @@ def get_token_conllu(token, i): head = 0 else: head = i + (token.head.i - token.i) + 1 - fields = [str(i+1), token.text, token.lemma_, token.pos_, token.tag_, '_', + features = token.vocab.morphology.get(token.morph_key) + feat_str = [] + replacements = {'one': '1', 'two': '2', 'three': '3'} + for feat in features: + if not feat.startswith('begin') and not feat.startswith('end'): + key, value = feat.split('_') + value = replacements.get(value, value) + feat_str.append('%s=%s' % (key, value.title())) + if not feat_str: + feat_str = '_' + else: + feat_str = '|'.join(feat_str) + fields = [str(i+1), token.text, token.lemma_, token.pos_, token.tag_, feat_str, str(head), token.dep_.lower(), '_', '_'] lines.append('\t'.join(fields)) return '\n'.join(lines) @@ -305,6 +324,7 @@ def load_nlp(corpus, config, vectors=None): def initialize_pipeline(nlp, docs, golds, config, device): nlp.add_pipe(nlp.create_pipe('tagger')) + nlp.add_pipe(nlp.create_pipe('morphologizer')) nlp.add_pipe(nlp.create_pipe('parser')) if config.multitask_tag: nlp.parser.add_multitask_objective('tag') @@ -437,11 +457,11 @@ def main(ud_dir, parses_dir, corpus, config=None, limit=0, gpu_device=-1, vector with nlp.use_params(optimizer.averages): if use_oracle_segments: parsed_docs, scores = evaluate(nlp, paths.dev.conllu, - paths.dev.conllu, out_path) + paths.dev.conllu, out_path) else: parsed_docs, scores = evaluate(nlp, paths.dev.text, - paths.dev.conllu, out_path) - print_progress(i, losses, scores) + paths.dev.conllu, out_path) + print_progress(i, losses, scores) def _render_parses(i, to_render): From 6f983132544e1d20684930b7157d92a70f16a32c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Wed, 26 Sep 2018 21:03:03 +0200 Subject: [PATCH 028/207] Fix disjunctive features in English tag map --- spacy/lang/en/tag_map.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/lang/en/tag_map.py b/spacy/lang/en/tag_map.py index fc3d2cc93..ffc1739cb 100644 --- a/spacy/lang/en/tag_map.py +++ b/spacy/lang/en/tag_map.py @@ -52,10 +52,10 @@ TAG_MAP = { "VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"}, "VBP": {POS: VERB, "VerbForm": "fin", "Tense": "pres"}, "VBZ": {POS: VERB, "VerbForm": "fin", "Tense": "pres", "Number": "sing", "Person": 3}, - "WDT": {POS: ADJ, "PronType": "int|rel"}, - "WP": {POS: NOUN, "PronType": "int|rel"}, - "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"}, - "WRB": {POS: ADV, "PronType": "int|rel"}, + "WDT": {POS: ADJ, "PronType": "int,rel"}, + "WP": {POS: NOUN, "PronType": "int,rel"}, + "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int,rel"}, + "WRB": {POS: ADV, "PronType": "int,rel"}, "ADD": 
{POS: X}, "NFP": {POS: PUNCT}, "GW": {POS: X}, From 63502349294e0ee34a3a76ec201eefd702c7583a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Wed, 26 Sep 2018 21:03:20 +0200 Subject: [PATCH 029/207] Add morphologizer pipeline component to Language --- spacy/language.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/language.py b/spacy/language.py index e64768d05..8ba169833 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -19,6 +19,7 @@ from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens from .pipeline import EntityRuler +from ._morphologizer import Morphologizer from .compat import json_dumps, izip, basestring_ from .gold import GoldParse from .scorer import Scorer @@ -103,6 +104,7 @@ class Language(object): 'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp), 'tensorizer': lambda nlp, **cfg: Tensorizer(nlp.vocab, **cfg), 'tagger': lambda nlp, **cfg: Tagger(nlp.vocab, **cfg), + 'morphologizer': lambda nlp, **cfg: Morphologizer(nlp.vocab, **cfg), 'parser': lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg), 'ner': lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg), 'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg), From 022dcda9643c3d07b0e8bfa824bf873ffd2247b4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Wed, 26 Sep 2018 21:03:44 +0200 Subject: [PATCH 030/207] Fix morphology enum --- spacy/morphology.pxd | 319 +++++++++++++++++++++++++------------------ 1 file changed, 183 insertions(+), 136 deletions(-) diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index bc8c44417..adc5e5574 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -32,18 +32,22 @@ cdef class Morphology: cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1 + cdef enum univ_morph_t: NIL = 0 + begin_Abbr - Abbr_yes # cz, fi, sl, U + Abbr_yes end_Abbr + begin_AdpType - AdpType_circ # U - AdpType_comprep # cz - AdpType_prep # cz, U - AdpType_post # U - AdpType_voc # cz + AdpType_circ + AdpType_comprep + AdpType_prep + AdpType_post + AdpType_voc end_AdpType + begin_AdvType AdvType_adadj AdvType_cau @@ -55,12 +59,14 @@ cdef enum univ_morph_t: AdvType_sta AdvType_tim end_AdvType + begin_Animacy - Animacy_anim = symbols.Animacy_anim + Animacy_anim Animacy_hum Animacy_inan Animacy_nhum end_Animacy + begin_Aspect Aspect_freq Aspect_imp @@ -68,6 +74,7 @@ cdef enum univ_morph_t: Aspect_none Aspect_perf end_Aspect + begin_Case Case_abe Case_abl @@ -97,6 +104,7 @@ cdef enum univ_morph_t: Case_tra Case_voc end_Case + begin_ConjType ConjType_comp # cz, U ConjType_oper # cz, U @@ -104,6 +112,7 @@ cdef enum univ_morph_t: begin_Connegative Connegative_yes # fi end_Connegative + begin_Definite Definite_cons # U20 Definite_def @@ -111,6 +120,7 @@ cdef enum univ_morph_t: Definite_red Definite_two end_Definite + begin_Degree Degree_abs Degree_cmp @@ -121,6 +131,31 @@ cdef enum univ_morph_t: Degree_com Degree_dim # du end_Degree + + begin_Derivation + Derivation_minen # fi + Derivation_sti # fi + Derivation_inen # fi + Derivation_lainen # fi + Derivation_ja # fi + Derivation_ton # fi + Derivation_vs # fi + Derivation_ttain # fi + Derivation_ttaa # fi + end_Derivation + + begin_Echo + Echo_rdp # U + Echo_ech # U + end_Echo + + begin_Foreign + Foreign_foreign # cz, fi, U + Foreign_fscript # cz, fi, U + Foreign_tscript # cz, U + 
Foreign_yes # sl + end_Foreign + begin_Gender Gender_com Gender_fem @@ -133,8 +168,18 @@ cdef enum univ_morph_t: Gender_psor_masc # cz, sl, U Gender_psor_fem # cz, sl, U Gender_psor_neut # sl - end_Gender + + begin_Hyph + Hyph_yes # cz, U + end_Hyph + + begin_InfForm + InfForm_one # fi + InfForm_two # fi + InfForm_three # fi + end_InfForm + begin_Mood Mood_cnd Mood_imp @@ -144,15 +189,30 @@ cdef enum univ_morph_t: Mood_sub Mood_opt end_Mood + + begin_NameType + NameType_geo # U, cz + NameType_prs # U, cz + NameType_giv # U, cz + NameType_sur # U, cz + NameType_nat # U, cz + NameType_com # U, cz + NameType_pro # U, cz + NameType_oth # U, cz + end_NameType + begin_Negative Negative_neg Negative_pos Negative_yes end_Negative - begin_Polarity - Polarity_neg # U20 - Polarity_pos # U20 - end_Polarity + + begin_NounType + NounType_com # U + NounType_prop # U + NounType_class # U + end_NounType + begin_Number Number_com Number_dual @@ -171,8 +231,14 @@ cdef enum univ_morph_t: Number_psee_plur # U Number_psor_sing # cz, fi, sl, U Number_psor_plur # cz, fi, sl, U - end_Number + + begin_NumForm + NumForm_digit # cz, sl, U + NumForm_roman # cz, sl, U + NumForm_word # cz, sl, U + end_NumForm + begin_NumType NumType_card NumType_dist @@ -183,7 +249,29 @@ cdef enum univ_morph_t: NumType_ord NumType_sets end_NumType - begin_Person + + begin_NumValue + NumValue_one # cz, U + NumValue_two # cz, U + NumValue_three # cz, U + end_NumValue + + begin_PartForm + PartForm_pres # fi + PartForm_past # fi + PartForm_agt # fi + PartForm_neg # fi + end_PartForm + + begin_PartType + PartType_mod # U + PartType_emp # U + PartType_res # U + PartType_inf # U + PartType_vbp # U + end_PartType + + begin_Person Person_one Person_two Person_three @@ -201,9 +289,36 @@ cdef enum univ_morph_t: Person_psor_two # fi, U Person_psor_three # fi, U end_Person + + begin_Polarity + Polarity_neg # U20 + Polarity_pos # U20 + end_Polarity + + begin_Polite + Polite_inf # bq, U + Polite_pol # bq, U + Polite_abs_inf # bq, U + Polite_abs_pol # bq, U + Polite_erg_inf # bq, U + Polite_erg_pol # bq, U + Polite_dat_inf # bq, U + Polite_dat_pol # bq, U + end_Polite + begin_Poss Poss_yes end_Poss + + begin_Prefix + Prefix_yes # U + end_Prefix + + begin_PrepCase + PrepCase_npr # cz + PrepCase_pre # U + end_PrepCase + begin_PronType PronType_advPart PronType_art @@ -219,15 +334,58 @@ cdef enum univ_morph_t: PronType_clit PronType_exc # es, ca, it, fa end_PronType + + begin_PunctSide + PunctSide_ini # U + PunctSide_fin # U + end_PunctSide + + begin_PunctType + PunctType_peri # U + PunctType_qest # U + PunctType_excl # U + PunctType_quot # U + PunctType_brck # U + PunctType_comm # U + PunctType_colo # U + PunctType_semi # U + PunctType_dash # U + end_PunctType + begin_Reflex Reflex_yes end_Reflex + + begin_Style + Style_arch # cz, fi, U + Style_rare # cz, fi, U + Style_poet # cz, U + Style_norm # cz, U + Style_coll # cz, U + Style_vrnc # cz, U + Style_sing # cz, U + Style_expr # cz, U + Style_derg # cz, U + Style_vulg # cz, U + Style_yes # fi, U + end_Style + + begin_StyleVariant + StyleVariant_styleShort # cz + StyleVariant_styleBound # cz, sl + end_StyleVariant + begin_Tense Tense_fut Tense_imp Tense_past Tense_pres end_Tense + + begin_Typo + Typo_yes + end_Typo + begin_VerbForm VerbForm_fin VerbForm_ger @@ -242,6 +400,14 @@ cdef enum univ_morph_t: VerbForm_conv # U20 VerbForm_gdv # la end_VerbForm + + begin_VerbType + VerbType_aux # U + VerbType_cop # U + VerbType_mod # U + VerbType_light # U + end_VerbType + begin_Voice Voice_act Voice_cau @@ 
-249,128 +415,7 @@ cdef enum univ_morph_t: Voice_mid # gkc Voice_int # hb end_Voice - begin_Derivation - Derivation_minen # fi - Derivation_sti # fi - Derivation_inen # fi - Derivation_lainen # fi - Derivation_ja # fi - Derivation_ton # fi - Derivation_vs # fi - Derivation_ttain # fi - Derivation_ttaa # fi - end_Derivation - begin_Echo - Echo_rdp # U - Echo_ech # U - end_Echo - begin_Foreign - Foreign_foreign # cz, fi, U - Foreign_fscript # cz, fi, U - Foreign_tscript # cz, U - Foreign_yes # sl - end_Foreign - begin_Hyph - Hyph_yes # cz, U - end_Hyph - begin_InfForm - InfForm_one # fi - InfForm_two # fi - InfForm_three # fi - end_InfForm - begin_NameType - NameType_geo # U, cz - NameType_prs # U, cz - NameType_giv # U, cz - NameType_sur # U, cz - NameType_nat # U, cz - NameType_com # U, cz - NameType_pro # U, cz - NameType_oth # U, cz - end_NameType - begin_NounType - NounType_com # U - NounType_prop # U - NounType_class # U - end_NounType - begin_NumForm - NumForm_digit # cz, sl, U - NumForm_roman # cz, sl, U - NumForm_word # cz, sl, U - end_NumForm - begin_NumValue - NumValue_one # cz, U - NumValue_two # cz, U - NumValue_three # cz, U - end_NumValue - begin_PartForm - PartForm_pres # fi - PartForm_past # fi - PartForm_agt # fi - PartForm_neg # fi - end_PartForm - begin_PartType - PartType_mod # U - PartType_emp # U - PartType_res # U - PartType_inf # U - PartType_vbp # U - end_PartType - begin_Polite - Polite_inf # bq, U - Polite_pol # bq, U - Polite_abs_inf # bq, U - Polite_abs_pol # bq, U - Polite_erg_inf # bq, U - Polite_erg_pol # bq, U - Polite_dat_inf # bq, U - Polite_dat_pol # bq, U - end_Polite - begin_Prefix - Prefix_yes # U - end_Prefix - begin_PrepCase - PrepCase_npr # cz - PrepCase_pre # U - end_PrepCase - begin_PunctSide - PunctSide_ini # U - PunctSide_fin # U - end_PunctSide - begin_PunctType - PunctType_peri # U - PunctType_qest # U - PunctType_excl # U - PunctType_quot # U - PunctType_brck # U - PunctType_comm # U - PunctType_colo # U - PunctType_semi # U - PunctType_dash # U - end_PunctType - begin_Style - Style_arch # cz, fi, U - Style_rare # cz, fi, U - Style_poet # cz, U - Style_norm # cz, U - Style_coll # cz, U - Style_vrnc # cz, U - Style_sing # cz, U - Style_expr # cz, U - Style_derg # cz, U - Style_vulg # cz, U - Style_yes # fi, U - end_Style - begin_StyleVariant - StyleVariant_styleShort # cz - StyleVariant_styleBound # cz, sl - end_StyleVariant - begin_VerbType - VerbType_aux # U - VerbType_cop # U - VerbType_mod # U - VerbType_light # U - end_VerbType + cdef struct RichTagC: univ_pos_t pos @@ -395,6 +440,7 @@ cdef struct RichTagC: univ_morph_t negative univ_morph_t number univ_morph_t name_type + univ_morph_t noun_type univ_morph_t num_form univ_morph_t num_type univ_morph_t num_value @@ -413,6 +459,7 @@ cdef struct RichTagC: univ_morph_t style univ_morph_t style_variant univ_morph_t tense + univ_morph_t typo univ_morph_t verb_form univ_morph_t voice univ_morph_t verb_type From 2b8a53ebdcfd6704f8f943faa4bbf2118247971a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Wed, 26 Sep 2018 21:03:57 +0200 Subject: [PATCH 031/207] Fix morphology functions --- spacy/morphology.pyx | 511 ++++++++++++++++++++++++++----------------- 1 file changed, 309 insertions(+), 202 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 870f05a87..ee747cf3c 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -18,19 +18,8 @@ from .errors import Errors def _normalize_props(props): """Transform deprecated string keys to correct 
names.""" out = {} - morph_keys = [ - 'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number', - 'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss', - 'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType', - 'Gender', 'Mood', 'Negative', 'Tense', 'Voice', 'Abbr', - 'Derivation', 'Echo', 'Foreign', 'NameType', 'NounType', 'NumForm', - 'NumValue', 'PartType', 'Polite', 'StyleVariant', - 'PronType', 'AdjType', 'Person', 'Variant', 'AdpType', - 'Reflex', 'Negative', 'Mood', 'Aspect', 'Case', - 'Polarity', 'PrepCase', 'Animacy' # U20 - ] props = dict(props) - for key in morph_keys: + for key in FIELDS: if key in props: attr = '%s_%s' % (key, props[key]) if attr in IDS: @@ -57,6 +46,7 @@ def parse_feature(feature): feature = NAMES[feature] key, value = feature.split('_') begin = 'begin_%s' % key + # Note that this includes a 0 offset for the field, for no entry offset = IDS[feature] - IDS[begin] field_id = FIELDS[key] return (field_id, offset) @@ -65,7 +55,8 @@ def parse_feature(feature): def get_field_size(field): begin = 'begin_%s' % field end = 'end_%s' % field - return (IDS[end] - IDS[begin]) - 1 + # Extra field for no entry -- always 0 + return IDS[end] - IDS[begin] cdef class Morphology: @@ -113,9 +104,23 @@ cdef class Morphology: present. Returns the hash of the new analysis. """ features = intify_features(features) - cdef RichTagC tag = create_rich_tag(features) + cdef univ_morph_t feature + for feature in features: + if feature != 0 and feature not in NAMES: + print(list(NAMES.keys())[:10]) + print(NAMES.get(feature-1), NAMES.get(feature+1)) + raise KeyError("Unknown feature: %d" % feature) + cdef RichTagC tag + tag = create_rich_tag(features) cdef hash_t key = self.insert(tag) return key + + def get(self, hash_t morph): + tag = <RichTagC*>self.tags.get(morph) + if tag == NULL: + return [] + else: + return tag_to_json(tag[0]) cpdef update(self, hash_t morph, features): """Update a morphological analysis with new feature values.""" @@ -127,8 +132,6 @@ cdef class Morphology: morph = self.insert(tag) return morph - - def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology): if orth not in self.strings: return orth @@ -205,7 +208,8 @@ cdef class Morphology: token.lemma = lemma token.pos = <univ_pos_t>pos token.tag = self.strings[tag_str] - token.morph = self.add(features) + #token.morph = self.add(features) + token.morph = 0 if (self.tag_names[tag_id], token.lex.orth) in self.exc: self._assign_tag_from_exceptions(token, tag_id) @@ -228,7 +232,7 @@ cdef class Morphology: tag_ptr = <RichTagC*>self.tags.get(key) if tag_ptr != NULL: json_tags.append(tag_to_json(tag_ptr[0])) - raise json.dumps(json_tags) + return json.dumps(json_tags) def from_bytes(self, byte_string): raise NotImplementedError @@ -249,7 +253,7 @@ cpdef intify_features(features): cdef hash_t hash_tag(RichTagC tag) nogil: return mrmr.hash64(&tag, sizeof(tag), 0) -cdef RichTagC create_rich_tag(features): +cdef RichTagC create_rich_tag(features) except *: cdef RichTagC tag cdef univ_morph_t feature memset(&tag, 0, sizeof(tag)) @@ -258,20 +262,105 @@ cdef RichTagC create_rich_tag(features): return tag cdef tag_to_json(RichTagC tag): - return {} + features = [] + if tag.abbr != 0: + features.append(NAMES[tag.abbr]) + if tag.adp_type != 0: + features.append(NAMES[tag.adp_type]) + if tag.adv_type != 0: + features.append(NAMES[tag.adv_type]) + if tag.animacy != 0: + features.append(NAMES[tag.animacy]) + if tag.aspect != 0: + features.append(NAMES[tag.aspect]) + if tag.case != 0: + 
features.append(NAMES[tag.case]) + if tag.conj_type != 0: + features.append(NAMES[tag.conj_type]) + if tag.connegative != 0: + features.append(NAMES[tag.connegative]) + if tag.definite != 0: + features.append(NAMES[tag.definite]) + if tag.degree != 0: + features.append(NAMES[tag.degree]) + if tag.derivation != 0: + features.append(NAMES[tag.derivation]) + if tag.echo != 0: + features.append(NAMES[tag.echo]) + if tag.foreign != 0: + features.append(NAMES[tag.foreign]) + if tag.gender != 0: + features.append(NAMES[tag.gender]) + if tag.hyph != 0: + features.append(NAMES[tag.hyph]) + if tag.inf_form != 0: + features.append(NAMES[tag.inf_form]) + if tag.mood != 0: + features.append(NAMES[tag.mood]) + if tag.negative != 0: + features.append(NAMES[tag.negative]) + if tag.number != 0: + features.append(NAMES[tag.number]) + if tag.name_type != 0: + features.append(NAMES[tag.name_type]) + if tag.noun_type != 0: + features.append(NAMES[tag.noun_type]) + if tag.num_form != 0: + features.append(NAMES[tag.num_form]) + if tag.num_type != 0: + features.append(NAMES[tag.num_type]) + if tag.num_value != 0: + features.append(NAMES[tag.num_value]) + if tag.part_form != 0: + features.append(NAMES[tag.part_form]) + if tag.part_type != 0: + features.append(NAMES[tag.part_type]) + if tag.person != 0: + features.append(NAMES[tag.person]) + if tag.polite != 0: + features.append(NAMES[tag.polite]) + if tag.polarity != 0: + features.append(NAMES[tag.polarity]) + if tag.poss != 0: + features.append(NAMES[tag.poss]) + if tag.prefix != 0: + features.append(NAMES[tag.prefix]) + if tag.prep_case != 0: + features.append(NAMES[tag.prep_case]) + if tag.pron_type != 0: + features.append(NAMES[tag.pron_type]) + if tag.punct_side != 0: + features.append(NAMES[tag.punct_side]) + if tag.punct_type != 0: + features.append(NAMES[tag.punct_type]) + if tag.reflex != 0: + features.append(NAMES[tag.reflex]) + if tag.style != 0: + features.append(NAMES[tag.style]) + if tag.style_variant != 0: + features.append(NAMES[tag.style_variant]) + if tag.tense != 0: + features.append(NAMES[tag.tense]) + if tag.verb_form != 0: + features.append(NAMES[tag.verb_form]) + if tag.voice != 0: + features.append(NAMES[tag.voice]) + if tag.verb_type != 0: + features.append(NAMES[tag.verb_type]) + return features cdef RichTagC tag_from_json(json_tag): cdef RichTagC tag return tag -cdef int set_feature(RichTagC* tag, univ_morph_t feature, int value) nogil: +cdef int set_feature(RichTagC* tag, univ_morph_t feature, int value) except -1: if value == True: value_ = feature else: value_ = NIL if feature == NIL: pass - if is_abbr_feature(feature): + elif is_abbr_feature(feature): tag.abbr = value_ elif is_adp_type_feature(feature): tag.adp_type = value_ @@ -311,8 +400,12 @@ cdef int set_feature(RichTagC* tag, univ_morph_t feature, int value) nogil: tag.number = value_ elif is_name_type_feature(feature): tag.name_type = value_ + elif is_noun_type_feature(feature): + tag.noun_type = value_ elif is_num_form_feature(feature): tag.num_form = value_ + elif is_num_type_feature(feature): + tag.num_type = value_ elif is_num_value_feature(feature): tag.num_value = value_ elif is_part_form_feature(feature): @@ -334,6 +427,8 @@ cdef int set_feature(RichTagC* tag, univ_morph_t feature, int value) nogil: elif is_pron_type_feature(feature): tag.pron_type = value_ elif is_punct_side_feature(feature): + tag.punct_side = value_ + elif is_punct_type_feature(feature): tag.punct_type = value_ elif is_reflex_feature(feature): tag.reflex = value_ @@ -343,6 +438,8 @@ cdef int 
set_feature(RichTagC* tag, univ_morph_t feature, int value) nogil: tag.style_variant = value_ elif is_tense_feature(feature): tag.tense = value_ + elif is_typo_feature(feature): + tag.typo = value_ elif is_verb_form_feature(feature): tag.verb_form = value_ elif is_voice_feature(feature): @@ -350,131 +447,136 @@ cdef int set_feature(RichTagC* tag, univ_morph_t feature, int value) nogil: elif is_verb_type_feature(feature): tag.verb_type = value_ else: - with gil: - raise ValueError("Unknown feature: %d" % feature) + raise ValueError("Unknown feature: %s (%d)" % (NAMES.get(feature), feature)) cdef int is_abbr_feature(univ_morph_t feature) nogil: - return feature > begin_Abbr and feature < end_Abbr + return feature >= begin_Abbr and feature <= end_Abbr cdef int is_adp_type_feature(univ_morph_t feature) nogil: - return feature > begin_AdpType and feature < end_AdpType + return feature >= begin_AdpType and feature <= end_AdpType cdef int is_adv_type_feature(univ_morph_t feature) nogil: - return feature > begin_AdvType and feature < end_AdvType + return feature >= begin_AdvType and feature <= end_AdvType cdef int is_animacy_feature(univ_morph_t feature) nogil: - return feature > begin_Animacy and feature < end_Animacy + return feature >= begin_Animacy and feature <= end_Animacy cdef int is_aspect_feature(univ_morph_t feature) nogil: - return feature > begin_Aspect and feature < end_Aspect + return feature >= begin_Aspect and feature <= end_Aspect cdef int is_case_feature(univ_morph_t feature) nogil: - return feature > begin_Case and feature < end_Case + return feature >= begin_Case and feature <= end_Case cdef int is_conj_type_feature(univ_morph_t feature) nogil: - return feature > begin_ConjType and feature < end_ConjType + return feature >= begin_ConjType and feature <= end_ConjType cdef int is_connegative_feature(univ_morph_t feature) nogil: - return feature > begin_Connegative and feature < end_Connegative + return feature >= begin_Connegative and feature <= end_Connegative cdef int is_definite_feature(univ_morph_t feature) nogil: - return feature > begin_Definite and feature < end_Definite + return feature >= begin_Definite and feature <= end_Definite cdef int is_degree_feature(univ_morph_t feature) nogil: - return feature > begin_Degree and feature < end_Degree + return feature >= begin_Degree and feature <= end_Degree cdef int is_derivation_feature(univ_morph_t feature) nogil: - return feature > begin_Derivation and feature < end_Derivation + return feature >= begin_Derivation and feature <= end_Derivation cdef int is_echo_feature(univ_morph_t feature) nogil: - return feature > begin_Echo and feature < end_Echo + return feature >= begin_Echo and feature <= end_Echo cdef int is_foreign_feature(univ_morph_t feature) nogil: - return feature > begin_Foreign and feature < end_Foreign + return feature >= begin_Foreign and feature <= end_Foreign cdef int is_gender_feature(univ_morph_t feature) nogil: - return feature > begin_Gender and feature < end_Gender + return feature >= begin_Gender and feature <= end_Gender cdef int is_hyph_feature(univ_morph_t feature) nogil: - return feature > begin_Hyph and feature < begin_Hyph + return feature >= begin_Hyph and feature <= end_Hyph cdef int is_inf_form_feature(univ_morph_t feature) nogil: - return feature > begin_InfForm and feature < end_InfForm + return feature >= begin_InfForm and feature <= end_InfForm cdef int is_mood_feature(univ_morph_t feature) nogil: - return feature > begin_Mood and feature < end_Mood - -cdef int 
is_negative_feature(univ_morph_t feature) nogil: - return feature > begin_Negative and feature < end_Negative - -cdef int is_number_feature(univ_morph_t feature) nogil: - return feature > begin_Number and feature < end_Number + return feature >= begin_Mood and feature <= end_Mood cdef int is_name_type_feature(univ_morph_t feature) nogil: - return feature > begin_NameType and feature < end_NameType + return feature >= begin_NameType and feature < end_NameType + +cdef int is_negative_feature(univ_morph_t feature) nogil: + return feature >= begin_Negative and feature <= end_Negative + +cdef int is_noun_type_feature(univ_morph_t feature) nogil: + return feature >= begin_NounType and feature <= end_NounType + +cdef int is_number_feature(univ_morph_t feature) nogil: + return feature >= begin_Number and feature <= end_Number cdef int is_num_form_feature(univ_morph_t feature) nogil: - return feature > begin_NumForm and feature < end_NumForm + return feature >= begin_NumForm and feature <= end_NumForm cdef int is_num_type_feature(univ_morph_t feature) nogil: - return feature > begin_NumType and feature < end_NumType + return feature >= begin_NumType and feature <= end_NumType cdef int is_num_value_feature(univ_morph_t feature) nogil: - return feature > begin_NumValue and feature < end_NumValue + return feature >= begin_NumValue and feature <= end_NumValue cdef int is_part_form_feature(univ_morph_t feature) nogil: - return feature > begin_PartForm and feature < end_PartForm + return feature >= begin_PartForm and feature <= end_PartForm cdef int is_part_type_feature(univ_morph_t feature) nogil: - return feature > begin_PartType and feature < end_PartType + return feature >= begin_PartType and feature <= end_PartType cdef int is_person_feature(univ_morph_t feature) nogil: - return feature > begin_Person and feature < end_Person + return feature >= begin_Person and feature <= end_Person cdef int is_polite_feature(univ_morph_t feature) nogil: - return feature > begin_Polite and feature < end_Polite + return feature >= begin_Polite and feature <= end_Polite cdef int is_polarity_feature(univ_morph_t feature) nogil: - return feature > begin_Polarity and feature < end_Polarity + return feature >= begin_Polarity and feature <= end_Polarity cdef int is_poss_feature(univ_morph_t feature) nogil: - return feature > begin_Poss and feature < end_Poss + return feature >= begin_Poss and feature <= end_Poss cdef int is_prefix_feature(univ_morph_t feature) nogil: - return feature > begin_Prefix and feature < end_Prefix + return feature >= begin_Prefix and feature <= end_Prefix cdef int is_prep_case_feature(univ_morph_t feature) nogil: - return feature > begin_PrepCase and feature < end_PrepCase + return feature >= begin_PrepCase and feature <= end_PrepCase cdef int is_pron_type_feature(univ_morph_t feature) nogil: - return feature > begin_PronType and feature < end_PronType + return feature >= begin_PronType and feature <= end_PronType cdef int is_punct_side_feature(univ_morph_t feature) nogil: - return feature > begin_PunctSide and feature < end_PunctSide + return feature >= begin_PunctSide and feature <= end_PunctSide cdef int is_punct_type_feature(univ_morph_t feature) nogil: - return feature > begin_PunctType and feature < end_PunctType + return feature >= begin_PunctType and feature <= end_PunctType cdef int is_reflex_feature(univ_morph_t feature) nogil: - return feature > begin_Reflex and feature < end_Reflex + return feature >= begin_Reflex and feature <= end_Reflex cdef int is_style_feature(univ_morph_t 
feature) nogil: - return feature > begin_Style and feature < end_Style + return feature >= begin_Style and feature <= end_Style cdef int is_style_variant_feature(univ_morph_t feature) nogil: - return feature > begin_StyleVariant and feature < end_StyleVariant + return feature >= begin_StyleVariant and feature <= end_StyleVariant cdef int is_tense_feature(univ_morph_t feature) nogil: - return feature > begin_Tense and feature < end_Tense + return feature >= begin_Tense and feature <= end_Tense + +cdef int is_typo_feature(univ_morph_t feature) nogil: + return feature >= begin_Typo and feature <= end_Typo cdef int is_verb_form_feature(univ_morph_t feature) nogil: - return feature > begin_VerbForm and feature < end_VerbForm + return feature >= begin_VerbForm and feature <= end_VerbForm cdef int is_voice_feature(univ_morph_t feature) nogil: - return feature > begin_Voice and feature < end_Voice + return feature >= begin_Voice and feature <= end_Voice cdef int is_verb_type_feature(univ_morph_t feature) nogil: - return feature > begin_VerbType and feature < end_VerbType + return feature >= begin_VerbType and feature <= end_VerbType FIELDS = { @@ -495,9 +597,9 @@ FIELDS = { 'Hyph': 14, 'InfForm': 15, 'Mood': 16, - 'Negative': 17, - 'Number': 18, - 'NameType': 19, + 'NameType': 17, + 'Negative': 18, + 'Number': 19, 'NumForm': 20, 'NumType': 21, 'NumValue': 22, @@ -516,14 +618,15 @@ FIELDS = { 'Style': 35, 'StyleVariant': 36, 'Tense': 37, - 'VerbForm': 38, - 'Voice': 39, - 'VerbType': 40 + 'Typo': 38, + 'VerbForm': 39, + 'Voice': 40, + 'VerbType': 41 } IDS = { "begin_Abbr": begin_Abbr, - "Abbr_yes ": Abbr_yes , + "Abbr_yes": Abbr_yes , "end_Abbr": end_Abbr, "begin_AdpType": begin_AdpType, "AdpType_circ": AdpType_circ, @@ -609,132 +712,6 @@ IDS = { "Degree_com": Degree_com, "Degree_dim": Degree_dim, "end_Degree": end_Degree, - "begin_Gender": begin_Gender, - "Gender_com": Gender_com, - "Gender_fem": Gender_fem, - "Gender_masc": Gender_masc, - "Gender_neut": Gender_neut, - "Gender_dat_masc": Gender_dat_masc, - "Gender_dat_fem": Gender_dat_fem, - "Gender_erg_masc": Gender_erg_masc, - "Gender_erg_fem": Gender_erg_fem, - "Gender_psor_masc": Gender_psor_masc, - "Gender_psor_fem": Gender_psor_fem, - "Gender_psor_neut": Gender_psor_neut, - "end_Gender": end_Gender, - "begin_Mood": begin_Mood, - "Mood_cnd": Mood_cnd, - "Mood_imp": Mood_imp, - "Mood_ind": Mood_ind, - "Mood_n": Mood_n, - "Mood_pot": Mood_pot, - "Mood_sub": Mood_sub, - "Mood_opt": Mood_opt, - "end_Mood": end_Mood, - "begin_Negative": begin_Negative, - "Negative_neg": Negative_neg, - "Negative_pos": Negative_pos, - "Negative_yes": Negative_yes, - "end_Negative": end_Negative, - "begin_Polarity": begin_Polarity, - "Polarity_neg": Polarity_neg, - "Polarity_pos": Polarity_pos, - "end_Polarity": end_Polarity, - "begin_Number": begin_Number, - "Number_com": Number_com, - "Number_dual": Number_dual, - "Number_none": Number_none, - "Number_plur": Number_plur, - "Number_sing": Number_sing, - "Number_ptan": Number_ptan, - "Number_count": Number_count, - "Number_abs_sing": Number_abs_sing, - "Number_abs_plur": Number_abs_plur, - "Number_dat_sing": Number_dat_sing, - "Number_dat_plur": Number_dat_plur, - "Number_erg_sing": Number_erg_sing, - "Number_erg_plur": Number_erg_plur, - "Number_psee_sing": Number_psee_sing, - "Number_psee_plur": Number_psee_plur, - "Number_psor_sing": Number_psor_sing, - "Number_psor_plur": Number_psor_plur, - "end_Number": end_Number, - "begin_NumType": begin_NumType, - "NumType_card": NumType_card, - "NumType_dist": 
NumType_dist, - "NumType_frac": NumType_frac, - "NumType_gen": NumType_gen, - "NumType_mult": NumType_mult, - "NumType_none": NumType_none, - "NumType_ord": NumType_ord, - "NumType_sets": NumType_sets, - "end_NumType": end_NumType, - "begin_Person": begin_Person, - "Person_one": Person_one, - "Person_two": Person_two, - "Person_three": Person_three, - "Person_none": Person_none, - "Person_abs_one": Person_abs_one, - "Person_abs_two": Person_abs_two, - "Person_abs_three": Person_abs_three, - "Person_dat_one": Person_dat_one, - "Person_dat_two": Person_dat_two, - "Person_dat_three": Person_dat_three, - "Person_erg_one": Person_erg_one, - "Person_erg_two": Person_erg_two, - "Person_erg_three": Person_erg_three, - "Person_psor_one": Person_psor_one, - "Person_psor_two": Person_psor_two, - "Person_psor_three": Person_psor_three, - "end_Person": end_Person, - "begin_Poss": begin_Poss, - "Poss_yes": Poss_yes, - "end_Poss": end_Poss, - "begin_PronType": begin_PronType, - "PronType_advPart": PronType_advPart, - "PronType_art": PronType_art, - "PronType_default": PronType_default, - "PronType_dem": PronType_dem, - "PronType_ind": PronType_ind, - "PronType_int": PronType_int, - "PronType_neg": PronType_neg, - "PronType_prs": PronType_prs, - "PronType_rcp": PronType_rcp, - "PronType_rel": PronType_rel, - "PronType_tot": PronType_tot, - "PronType_clit": PronType_clit, - "PronType_exc": PronType_exc, - "end_PronType": end_PronType, - "begin_Reflex": begin_Reflex, - "Reflex_yes": Reflex_yes, - "end_Reflex": end_Reflex, - "begin_Tense": begin_Tense, - "Tense_fut": Tense_fut, - "Tense_imp": Tense_imp, - "Tense_past": Tense_past, - "Tense_pres": Tense_pres, - "end_Tense": end_Tense, - "begin_VerbForm": begin_VerbForm, - "VerbForm_fin": VerbForm_fin, - "VerbForm_ger": VerbForm_ger, - "VerbForm_inf": VerbForm_inf, - "VerbForm_none": VerbForm_none, - "VerbForm_part": VerbForm_part, - "VerbForm_partFut": VerbForm_partFut, - "VerbForm_partPast": VerbForm_partPast, - "VerbForm_partPres": VerbForm_partPres, - "VerbForm_sup": VerbForm_sup, - "VerbForm_trans": VerbForm_trans, - "VerbForm_conv": VerbForm_conv, - "VerbForm_gdv": VerbForm_gdv, - "end_VerbForm": end_VerbForm, - "begin_Voice": begin_Voice, - "Voice_act": Voice_act, - "Voice_cau": Voice_cau, - "Voice_pass": Voice_pass, - "Voice_mid": Voice_mid, - "Voice_int": Voice_int, - "end_Voice": end_Voice, "begin_Derivation": begin_Derivation, "Derivation_minen": Derivation_minen, "Derivation_sti": Derivation_sti, @@ -756,6 +733,19 @@ IDS = { "Foreign_tscript": Foreign_tscript, "Foreign_yes": Foreign_yes, "end_Foreign": end_Foreign, + "begin_Gender": begin_Gender, + "Gender_com": Gender_com, + "Gender_fem": Gender_fem, + "Gender_masc": Gender_masc, + "Gender_neut": Gender_neut, + "Gender_dat_masc": Gender_dat_masc, + "Gender_dat_fem": Gender_dat_fem, + "Gender_erg_masc": Gender_erg_masc, + "Gender_erg_fem": Gender_erg_fem, + "Gender_psor_masc": Gender_psor_masc, + "Gender_psor_fem": Gender_psor_fem, + "Gender_psor_neut": Gender_psor_neut, + "end_Gender": end_Gender, "begin_Hyph": begin_Hyph, "Hyph_yes": Hyph_yes, "end_Hyph": end_Hyph, @@ -764,6 +754,15 @@ IDS = { "InfForm_two": InfForm_two, "InfForm_three": InfForm_three, "end_InfForm": end_InfForm, + "begin_Mood": begin_Mood, + "Mood_cnd": Mood_cnd, + "Mood_imp": Mood_imp, + "Mood_ind": Mood_ind, + "Mood_n": Mood_n, + "Mood_pot": Mood_pot, + "Mood_sub": Mood_sub, + "Mood_opt": Mood_opt, + "end_Mood": end_Mood, "begin_NameType": begin_NameType, "NameType_geo": NameType_geo, "NameType_prs": NameType_prs, @@ -774,16 
+773,50 @@ IDS = { "NameType_pro": NameType_pro, "NameType_oth": NameType_oth, "end_NameType": end_NameType, + "begin_Negative": begin_Negative, + "Negative_neg": Negative_neg, + "Negative_pos": Negative_pos, + "Negative_yes": Negative_yes, + "end_Negative": end_Negative, "begin_NounType": begin_NounType, "NounType_com": NounType_com, "NounType_prop": NounType_prop, "NounType_class": NounType_class, "end_NounType": end_NounType, + "begin_Number": begin_Number, + "Number_com": Number_com, + "Number_dual": Number_dual, + "Number_none": Number_none, + "Number_plur": Number_plur, + "Number_sing": Number_sing, + "Number_ptan": Number_ptan, + "Number_count": Number_count, + "Number_abs_sing": Number_abs_sing, + "Number_abs_plur": Number_abs_plur, + "Number_dat_sing": Number_dat_sing, + "Number_dat_plur": Number_dat_plur, + "Number_erg_sing": Number_erg_sing, + "Number_erg_plur": Number_erg_plur, + "Number_psee_sing": Number_psee_sing, + "Number_psee_plur": Number_psee_plur, + "Number_psor_sing": Number_psor_sing, + "Number_psor_plur": Number_psor_plur, + "end_Number": end_Number, "begin_NumForm": begin_NumForm, "NumForm_digit": NumForm_digit, "NumForm_roman": NumForm_roman, "NumForm_word": NumForm_word, "end_NumForm": end_NumForm, + "begin_NumType": begin_NumType, + "NumType_card": NumType_card, + "NumType_dist": NumType_dist, + "NumType_frac": NumType_frac, + "NumType_gen": NumType_gen, + "NumType_mult": NumType_mult, + "NumType_none": NumType_none, + "NumType_ord": NumType_ord, + "NumType_sets": NumType_sets, + "end_NumType": end_NumType, "begin_NumValue": begin_NumValue, "NumValue_one": NumValue_one, "NumValue_two": NumValue_two, @@ -802,6 +835,29 @@ IDS = { "PartType_inf": PartType_inf, "PartType_vbp": PartType_vbp, "end_PartType": end_PartType, + + "begin_Person": begin_Person, + "Person_one": Person_one, + "Person_two": Person_two, + "Person_three": Person_three, + "Person_none": Person_none, + "Person_abs_one": Person_abs_one, + "Person_abs_two": Person_abs_two, + "Person_abs_three": Person_abs_three, + "Person_dat_one": Person_dat_one, + "Person_dat_two": Person_dat_two, + "Person_dat_three": Person_dat_three, + "Person_erg_one": Person_erg_one, + "Person_erg_two": Person_erg_two, + "Person_erg_three": Person_erg_three, + "Person_psor_one": Person_psor_one, + "Person_psor_two": Person_psor_two, + "Person_psor_three": Person_psor_three, + "end_Person": end_Person, + "begin_Polarity": begin_Polarity, + "Polarity_neg": Polarity_neg, + "Polarity_pos": Polarity_pos, + "end_Polarity": end_Polarity, "begin_Polite": begin_Polite, "Polite_inf": Polite_inf, "Polite_pol": Polite_pol, @@ -812,6 +868,9 @@ IDS = { "Polite_dat_inf": Polite_dat_inf, "Polite_dat_pol": Polite_dat_pol, "end_Polite": end_Polite, + "begin_Poss": begin_Poss, + "Poss_yes": Poss_yes, + "end_Poss": end_Poss, "begin_Prefix": begin_Prefix, "Prefix_yes": Prefix_yes, "end_Prefix": end_Prefix, @@ -819,6 +878,21 @@ IDS = { "PrepCase_npr": PrepCase_npr, "PrepCase_pre": PrepCase_pre, "end_PrepCase": end_PrepCase, + "begin_PronType": begin_PronType, + "PronType_advPart": PronType_advPart, + "PronType_art": PronType_art, + "PronType_default": PronType_default, + "PronType_dem": PronType_dem, + "PronType_ind": PronType_ind, + "PronType_int": PronType_int, + "PronType_neg": PronType_neg, + "PronType_prs": PronType_prs, + "PronType_rcp": PronType_rcp, + "PronType_rel": PronType_rel, + "PronType_tot": PronType_tot, + "PronType_clit": PronType_clit, + "PronType_exc": PronType_exc, + "end_PronType": end_PronType, "begin_PunctSide": 
begin_PunctSide, "PunctSide_ini": PunctSide_ini, "PunctSide_fin": PunctSide_fin, @@ -834,6 +908,9 @@ IDS = { "PunctType_semi": PunctType_semi, "PunctType_dash": PunctType_dash, "end_PunctType": end_PunctType, + "begin_Reflex": begin_Reflex, + "Reflex_yes": Reflex_yes, + "end_Reflex": end_Reflex, "begin_Style": begin_Style, "Style_arch": Style_arch, "Style_rare": Style_rare, @@ -851,12 +928,42 @@ IDS = { "StyleVariant_styleShort": StyleVariant_styleShort, "StyleVariant_styleBound": StyleVariant_styleBound, "end_StyleVariant": end_StyleVariant, + "begin_Tense": begin_Tense, + "Tense_fut": Tense_fut, + "Tense_imp": Tense_imp, + "Tense_past": Tense_past, + "Tense_pres": Tense_pres, + "end_Tense": end_Tense, + "begin_Typo": begin_Typo, + "Typo_yes": Typo_yes, + "end_Typo": end_Typo, + "begin_VerbForm": begin_VerbForm, + "VerbForm_fin": VerbForm_fin, + "VerbForm_ger": VerbForm_ger, + "VerbForm_inf": VerbForm_inf, + "VerbForm_none": VerbForm_none, + "VerbForm_part": VerbForm_part, + "VerbForm_partFut": VerbForm_partFut, + "VerbForm_partPast": VerbForm_partPast, + "VerbForm_partPres": VerbForm_partPres, + "VerbForm_sup": VerbForm_sup, + "VerbForm_trans": VerbForm_trans, + "VerbForm_conv": VerbForm_conv, + "VerbForm_gdv": VerbForm_gdv, + "end_VerbForm": end_VerbForm, "begin_VerbType": begin_VerbType, "VerbType_aux": VerbType_aux, "VerbType_cop": VerbType_cop, "VerbType_mod": VerbType_mod, "VerbType_light": VerbType_light, "end_VerbType": end_VerbType, + "begin_Voice": begin_Voice, + "Voice_act": Voice_act, + "Voice_cau": Voice_cau, + "Voice_pass": Voice_pass, + "Voice_mid": Voice_mid, + "Voice_int": Voice_int, + "end_Voice": end_Voice, } From 823cc4127ac8191235280406957a1f8694a4c9b5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Wed, 26 Sep 2018 21:04:13 +0200 Subject: [PATCH 032/207] Update morphology tests --- spacy/tests/morphology/test_morph_features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/morphology/test_morph_features.py b/spacy/tests/morphology/test_morph_features.py index 391fd1337..32cc665af 100644 --- a/spacy/tests/morphology/test_morph_features.py +++ b/spacy/tests/morphology/test_morph_features.py @@ -4,7 +4,7 @@ import pytest from ...morphology import Morphology from ...strings import StringStore from ...lemmatizer import Lemmatizer -from ...symbols import * +from ...morphology import * @pytest.fixture def morphology(): From c8a28413083d2acf13c15c1b22ff3ab3c94918d7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Wed, 26 Sep 2018 21:04:29 +0200 Subject: [PATCH 033/207] Add property to get morph key on token --- spacy/tokens/token.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 6da93a726..3af5071d2 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -169,6 +169,10 @@ cdef class Token: return (numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)) + property morph_key: + def __get__(self): + return self.c.morph + property lex_id: """RETURNS (int): Sequential ID of the token's lexical type.""" def __get__(self): From b9ef8ac61657b5c681170f0a8ff1c15ca85b2b71 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 27 Sep 2018 15:14:27 +0200 Subject: [PATCH 034/207] Fix GoldParse class when no entities --- spacy/gold.pyx | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 77c5944ca..cd8d6dab4 100644 --- 
a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -431,19 +431,18 @@ cdef class GoldParse: if tags is None: tags = [None for _ in words] if heads is None: - heads = [None for token in words] + heads = [None for _ in words] if deps is None: deps = [None for _ in words] - if entities is None: - entities = [None for _ in words] if morphology is None: morphology = [None for _ in words] + if entities is None: + entities = [None for _ in words] elif len(entities) == 0: entities = ['O' for _ in words] elif not isinstance(entities[0], basestring): # Assume we have entities specified by character offset. entities = biluo_tags_from_offsets(doc, entities) - self.mem = Pool() self.loss = 0 self.length = len(doc) From 010f846d5f5e893cd2251c54577e7e7d54827314 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 7 Mar 2019 00:16:51 +0100 Subject: [PATCH 035/207] Fix dependencies in morphologizer --- spacy/_morphologizer.pyx | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/spacy/_morphologizer.pyx b/spacy/_morphologizer.pyx index 10baec8f5..db0a0ff1b 100644 --- a/spacy/_morphologizer.pyx +++ b/spacy/_morphologizer.pyx @@ -1,12 +1,8 @@ from __future__ import unicode_literals from collections import OrderedDict, defaultdict -import cytoolz -import ujson import numpy cimport numpy as np -from .util import msgpack -from .util import msgpack_numpy from thinc.api import chain from thinc.neural.util import to_categorical, copy_array, get_array_module @@ -16,7 +12,7 @@ from ._ml import Tok2Vec, build_morphologizer_model from ._ml import link_vectors_to_models, zero_init, flatten from ._ml import create_default_optimizer from .errors import Errors, TempErrors -from .compat import json_dumps, basestring_ +from .compat import basestring_ from .tokens.doc cimport Doc from .vocab cimport Vocab from .morphology cimport Morphology @@ -58,7 +54,7 @@ class Morphologizer(Pipe): return doc def pipe(self, stream, batch_size=128, n_threads=-1): - for docs in cytoolz.partition_all(batch_size, stream): + for docs in util.minibatch(stream, size=batch_size): docs = list(docs) features, tokvecs = self.predict(docs) self.set_annotations(docs, features, tensors=tokvecs) From ae7c728c5f76d09f77981132f93702ecdfbeab1f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 7 Mar 2019 01:17:19 +0100 Subject: [PATCH 036/207] Fix json dependency --- spacy/morphology.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index ee747cf3c..a4759e4ab 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -3,8 +3,9 @@ from __future__ import unicode_literals from libc.string cimport memset -import ujson as json +import srsly +from .strings import get_string_id from . 
import symbols from .attrs cimport POS, IS_SPACE from .attrs import LEMMA, intify_attrs @@ -232,7 +233,7 @@ cdef class Morphology: tag_ptr = <RichTagC*>self.tags.get(key) if tag_ptr != NULL: json_tags.append(tag_to_json(tag_ptr[0])) - return json.dumps(json_tags) + return srsly.json_dumps(json_tags) def from_bytes(self, byte_string): raise NotImplementedError From 98dfe5e433bf14b1d5467e293b9cfc0efeac7dee Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 7 Mar 2019 01:31:23 +0100 Subject: [PATCH 037/207] Fix ud_train.py --- spacy/cli/ud/ud_train.py | 116 ++++++++------------------------------- 1 file changed, 24 insertions(+), 92 deletions(-) diff --git a/spacy/cli/ud/ud_train.py b/spacy/cli/ud/ud_train.py index 68fd3b5a9..afef6c073 100644 --- a/spacy/cli/ud/ud_train.py +++ b/spacy/cli/ud/ud_train.py @@ -156,13 +156,8 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0): flat = defaultdict(list) sent_starts = [] for sent in sent_annots: -<<<<<<< HEAD:spacy/cli/ud_train.py - flat['heads'].extend(len(flat['words'])+head for head in sent['heads']) - for field in ['words', 'tags', 'deps', 'morphology', 'entities', 'spaces']: -======= - flat["heads"].extend(len(flat["words"]) + head for head in sent["heads"]) - for field in ["words", "tags", "deps", "entities", "spaces"]: ->>>>>>> develop:spacy/cli/ud/ud_train.py + flat["heads"].extend(len(flat["words"])+head for head in sent["heads"]) + for field in ["words", "tags", "deps", "morphology", "entities", "spaces"]: flat[field].extend(sent[field]) sent_starts.append(True) sent_starts.extend([False] * (len(sent["words"]) - 1)) @@ -260,55 +255,30 @@ def write_conllu(docs, file_): def print_progress(itn, losses, ud_scores): fields = { -<<<<<<< HEAD:spacy/cli/ud_train.py - 'dep_loss': losses.get('parser', 0.0), - 'morph_loss': losses.get('morphologizer', 0.0), - 'tag_loss': losses.get('tagger', 0.0), - 'words': ud_scores['Words'].f1 * 100, - 'sents': ud_scores['Sentences'].f1 * 100, - 'tags': ud_scores['XPOS'].f1 * 100, - 'uas': ud_scores['UAS'].f1 * 100, - 'las': ud_scores['LAS'].f1 * 100, - 'morph': ud_scores['Feats'].f1 * 100, - } - header = ['Epoch', 'P.Loss', 'M.Loss', 'LAS', 'UAS', 'TAG', 'MORPH', 'SENT', 'WORD'] - if itn == 0: - print('\t'.join(header)) - tpl = '\t'.join(( - '{:d}', - '{dep_loss:.1f}', - '{morph_loss:.1f}', - '{las:.1f}', - '{uas:.1f}', - '{tags:.1f}', - '{morph:.1f}', - '{sents:.1f}', - '{words:.1f}', - )) -======= "dep_loss": losses.get("parser", 0.0), + "morph_loss": losses.get("morphologizer", 0.0), "tag_loss": losses.get("tagger", 0.0), "words": ud_scores["Words"].f1 * 100, "sents": ud_scores["Sentences"].f1 * 100, "tags": ud_scores["XPOS"].f1 * 100, "uas": ud_scores["UAS"].f1 * 100, "las": ud_scores["LAS"].f1 * 100, + "morph": ud_scores["Feats"].f1 * 100, } - header = ["Epoch", "Loss", "LAS", "UAS", "TAG", "SENT", "WORD"] + header = ["Epoch", "P.Loss", "M.Loss", "LAS", "UAS", "TAG", "MORPH", "SENT", "WORD"] if itn == 0: print("\t".join(header)) - tpl = "\t".join( - ( - "{:d}", - "{dep_loss:.1f}", - "{las:.1f}", - "{uas:.1f}", - "{tags:.1f}", - "{sents:.1f}", - "{words:.1f}", - ) - ) ->>>>>>> develop:spacy/cli/ud/ud_train.py + tpl = "\t".join(( + "{:d}", + "{dep_loss:.1f}", + "{morph_loss:.1f}", + "{las:.1f}", + "{uas:.1f}", + "{tags:.1f}", + "{morph:.1f}", + "{sents:.1f}", + "{words:.1f}", + )) print(tpl.format(itn, **fields)) @@ -329,48 +299,26 @@ def get_token_conllu(token, i): head = 0 else: head = i + (token.head.i - token.i) + 1 -<<<<<<< HEAD:spacy/cli/ud_train.py features = 
token.vocab.morphology.get(token.morph_key) feat_str = [] - replacements = {'one': '1', 'two': '2', 'three': '3'} + replacements = {"one": "1", "two": "2", "three": "3"} for feat in features: - if not feat.startswith('begin') and not feat.startswith('end'): - key, value = feat.split('_') + if not feat.startswith("begin") and not feat.startswith("end"): + key, value = feat.split("_") value = replacements.get(value, value) - feat_str.append('%s=%s' % (key, value.title())) + feat_str.append("%s=%s" % (key, value.title())) if not feat_str: - feat_str = '_' + feat_str = "_" else: - feat_str = '|'.join(feat_str) + feat_str = "|".join(feat_str) fields = [str(i+1), token.text, token.lemma_, token.pos_, token.tag_, feat_str, - str(head), token.dep_.lower(), '_', '_'] - lines.append('\t'.join(fields)) - return '\n'.join(lines) - -Token.set_extension('get_conllu_lines', method=get_token_conllu) -Token.set_extension('begins_fused', default=False) -Token.set_extension('inside_fused', default=False) -======= - fields = [ - str(i + 1), - token.text, - token.lemma_, - token.pos_, - token.tag_, - "_", - str(head), - token.dep_.lower(), - "_", - "_", - ] + str(head), token.dep_.lower(), "_", "_"] lines.append("\t".join(fields)) return "\n".join(lines) - Token.set_extension("get_conllu_lines", method=get_token_conllu) Token.set_extension("begins_fused", default=False) Token.set_extension("inside_fused", default=False) ->>>>>>> develop:spacy/cli/ud/ud_train.py ################## @@ -394,14 +342,9 @@ def load_nlp(corpus, config, vectors=None): def initialize_pipeline(nlp, docs, golds, config, device): -<<<<<<< HEAD:spacy/cli/ud_train.py - nlp.add_pipe(nlp.create_pipe('tagger')) - nlp.add_pipe(nlp.create_pipe('morphologizer')) - nlp.add_pipe(nlp.create_pipe('parser')) -======= nlp.add_pipe(nlp.create_pipe("tagger")) + nlp.add_pipe(nlp.create_pipe("morphologizer")) nlp.add_pipe(nlp.create_pipe("parser")) ->>>>>>> develop:spacy/cli/ud/ud_train.py if config.multitask_tag: nlp.parser.add_multitask_objective("tag") if config.multitask_sent: @@ -597,23 +540,12 @@ def main( out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i) with nlp.use_params(optimizer.averages): if use_oracle_segments: -<<<<<<< HEAD:spacy/cli/ud_train.py parsed_docs, scores = evaluate(nlp, paths.dev.conllu, paths.dev.conllu, out_path) else: parsed_docs, scores = evaluate(nlp, paths.dev.text, paths.dev.conllu, out_path) print_progress(i, losses, scores) -======= - parsed_docs, scores = evaluate( - nlp, paths.dev.conllu, paths.dev.conllu, out_path - ) - else: - parsed_docs, scores = evaluate( - nlp, paths.dev.text, paths.dev.conllu, out_path - ) - print_progress(i, losses, scores) ->>>>>>> develop:spacy/cli/ud/ud_train.py def _render_parses(i, to_render): From bfa52d9d8a940f89f5abc6eaebd3cf380f67d199 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 7 Mar 2019 01:34:32 +0100 Subject: [PATCH 038/207] Move morphologizer within spacy/pipes --- spacy/{_morphologizer.pyx => pipeline/morphologizer.pyx} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename spacy/{_morphologizer.pyx => pipeline/morphologizer.pyx} (100%) diff --git a/spacy/_morphologizer.pyx b/spacy/pipeline/morphologizer.pyx similarity index 100% rename from spacy/_morphologizer.pyx rename to spacy/pipeline/morphologizer.pyx From fc1cc4c529e08c887287a3f449fc181fee9a8b6d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 7 Mar 2019 01:36:04 +0100 Subject: [PATCH 039/207] Move morphologizer under spacy/pipes --- 
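Together with the file rename in the previous commit, this patch points setup.py and the imports at the new location, so the class is now re-exported from the pipeline package. A small sketch of the import path after this change (note that the setup.py entry added below reads "spacy.pipelines.morphologizer"; the "Fix compile error" commit later in the series corrects it to "spacy.pipeline.morphologizer"):

    # old location, removed by these two commits:
    #   from spacy._morphologizer import Morphologizer
    # new location, assuming the package builds with the corrected setup.py entry:
    from spacy.pipeline import Morphologizer

    morphologizer = Morphologizer(nlp.vocab)  # `nlp` is an assumed existing pipeline
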
setup.py | 2 +- spacy/language.py | 2 +- spacy/pipeline/__init__.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index ea98f0b54..47c82b014 100755 --- a/setup.py +++ b/setup.py @@ -41,8 +41,8 @@ MOD_NAMES = [ "spacy.vocab", "spacy.attrs", "spacy.morphology", - "spacy._morphologizer", "spacy.pipeline.pipes", + "spacy.pipelines.morphologizer", "spacy.syntax.stateclass", "spacy.syntax._state", "spacy.tokenizer", diff --git a/spacy/language.py b/spacy/language.py index ed6dc64dc..b90fa8486 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -18,7 +18,7 @@ from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens from .pipeline import EntityRuler -from ._morphologizer import Morphologizer +from .pipeline import Morphologizer from .compat import izip, basestring_ from .gold import GoldParse from .scorer import Scorer diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py index d683cc989..36b9b8d46 100644 --- a/spacy/pipeline/__init__.py +++ b/spacy/pipeline/__init__.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .pipes import Tagger, DependencyParser, EntityRecognizer # noqa from .pipes import TextCategorizer, Tensorizer, Pipe # noqa +from .morphologizer import Morphologizer from .entityruler import EntityRuler # noqa from .hooks import SentenceSegmenter, SimilarityHook # noqa from .functions import merge_entities, merge_noun_chunks, merge_subtokens # noqa From 21008ad2d8cd82a72b08e2f9e3d3a75eb32360b5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 7 Mar 2019 10:45:24 +0100 Subject: [PATCH 040/207] Draft API for morphological analysis class --- spacy/tokens/morphanalysis.pyx | 57 ++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 spacy/tokens/morphanalysis.pyx diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx new file mode 100644 index 000000000..df2d6ec20 --- /dev/null +++ b/spacy/tokens/morphanalysis.pyx @@ -0,0 +1,57 @@ +cdef class Morphanalysis: + """Control access to morphological features for a token.""" + def __init__(self, Vocab vocab, features=None): + pass + + @classmethod + def from_id(self, Vocab vocab, hash_t key): + pass + + def __contains__(self, feature): + pass + + def __iter__(self): + pass + + def __len__(self): + pass + + def __str__(self): + pass + + def __repr__(self): + pass + + def __hash__(self): + pass + + @property + def is_base_form(self): + pass + + @property + def pos(self): + pass + + @property + def pos_(self): + pass + + @property + def id(self): + pass + + def get(self, name): + pass + + def set(self, name, value): + pass + + def add(self, feature): + pass + + def remove(self, feature): + pass + + def to_json(self): + pass From ef3110a44478d1364b3906d119dfe596c406df6a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 7 Mar 2019 10:45:55 +0100 Subject: [PATCH 041/207] Fix compile error --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 47c82b014..f193d0498 100755 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ MOD_NAMES = [ "spacy.attrs", "spacy.morphology", "spacy.pipeline.pipes", - "spacy.pipelines.morphologizer", + "spacy.pipeline.morphologizer", "spacy.syntax.stateclass", "spacy.syntax._state", "spacy.tokenizer", From 
88059664609c332849f674ba4eb4c8554c3c115c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 7 Mar 2019 10:46:27 +0100 Subject: [PATCH 042/207] Fix moved Morphologizer class --- spacy/pipeline/morphologizer.pyx | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index db0a0ff1b..820567e71 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -6,18 +6,17 @@ cimport numpy as np from thinc.api import chain from thinc.neural.util import to_categorical, copy_array, get_array_module -from . import util -from .pipeline import Pipe -from ._ml import Tok2Vec, build_morphologizer_model -from ._ml import link_vectors_to_models, zero_init, flatten -from ._ml import create_default_optimizer -from .errors import Errors, TempErrors -from .compat import basestring_ -from .tokens.doc cimport Doc -from .vocab cimport Vocab -from .morphology cimport Morphology -from .morphology import parse_feature, IDS, FIELDS, FIELD_SIZES, NAMES -from .pipeline import Pipe +from .. import util +from .pipes import Pipe +from .._ml import Tok2Vec, build_morphologizer_model +from .._ml import link_vectors_to_models, zero_init, flatten +from .._ml import create_default_optimizer +from ..errors import Errors, TempErrors +from ..compat import basestring_ +from ..tokens.doc cimport Doc +from ..vocab cimport Vocab +from ..morphology cimport Morphology +from ..morphology import parse_feature, IDS, FIELDS, FIELD_SIZES, NAMES class Morphologizer(Pipe): From 34651c8ddf06b3087151167bd269d21b6b546225 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 7 Mar 2019 12:13:47 +0100 Subject: [PATCH 043/207] Fix lemmatizer --- spacy/lemmatizer.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 69f5c8d20..c708800e9 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -48,6 +48,11 @@ class Lemmatizer(object): avoid lemmatization entirely. """ morphology = {} if morphology is None else morphology + morphology = dict(morphology) + for key, value in list(morphology.items()): + if value is True: + feat, value = key.split('_') + morphology[feat] = value others = [ key for key in morphology @@ -68,13 +73,13 @@ class Lemmatizer(object): return True elif univ_pos == "adj" and morphology.get("Degree") == "pos": return True - elif VerbForm_inf in morphology or 'VerbForm_inf' in morphology: + elif morphology.get('VerbForm') == 'inf': return True - elif VerbForm_none in morphology or 'VerbForm_none' in morphology: + elif morphology.get('VerbForm') == 'none': return True - elif Number_sing in morphology or 'Number_sing' in morphology: + elif morphology.get('VerbForm') == 'inf': return True - elif Degree_pos in morphology or 'Degree_pos' in morphology: + elif morphology.get('Degree') == 'pos': return True else: return False From be5235369cc23ad6838c21f0980c90cc20dc4f00 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 7 Mar 2019 12:14:23 +0100 Subject: [PATCH 044/207] Space out symbols enum, to make maintaining easier --- spacy/attrs.pxd | 13 +++++++------ spacy/symbols.pxd | 19 +++++++++---------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd index 79a177ba9..a70fae04b 100644 --- a/spacy/attrs.pxd +++ b/spacy/attrs.pxd @@ -1,7 +1,9 @@ +from . 
cimport symbols + # Reserve 64 values for flag features cdef enum attr_id_t: - NULL_ATTR - IS_ALPHA + NULL_ATTR = 0 + IS_ALPHA = symbols.IS_ALPHA IS_ASCII IS_DIGIT IS_LOWER @@ -20,7 +22,7 @@ cdef enum attr_id_t: IS_RIGHT_PUNCT IS_CURRENCY - FLAG19 = 19 + FLAG19 = symbols.FLAG19 FLAG20 FLAG21 FLAG22 @@ -66,7 +68,7 @@ cdef enum attr_id_t: FLAG62 FLAG63 - ID + ID = symbols.ID ORTH LOWER NORM @@ -74,7 +76,7 @@ cdef enum attr_id_t: PREFIX SUFFIX - LENGTH + LENGTH = symbols.LENGTH CLUSTER LEMMA POS @@ -86,5 +88,4 @@ cdef enum attr_id_t: SENT_START SPACY PROB - LANG diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index 051b92edb..1cd1f7ef7 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -19,7 +19,7 @@ cdef enum symbol_t: IS_RIGHT_PUNCT IS_CURRENCY - FLAG19 = 19 + FLAG19 = 1000 FLAG20 FLAG21 FLAG22 @@ -65,7 +65,7 @@ cdef enum symbol_t: FLAG62 FLAG63 - ID + ID = 2000 ORTH LOWER NORM @@ -73,7 +73,7 @@ cdef enum symbol_t: PREFIX SUFFIX - LENGTH + LENGTH = 3000 CLUSTER LEMMA POS @@ -87,7 +87,7 @@ cdef enum symbol_t: PROB LANG - ADJ + ADJ = 4000 ADP ADV AUX @@ -108,7 +108,7 @@ cdef enum symbol_t: EOL SPACE - Animacy_anim + Animacy_anim = 5000 Animacy_inan Animacy_hum # U20 Animacy_nhum @@ -385,7 +385,7 @@ cdef enum symbol_t: VerbType_mod # U VerbType_light # U - PERSON + PERSON = 6000 NORP FACILITY ORG @@ -397,7 +397,7 @@ cdef enum symbol_t: LANGUAGE LAW - DATE + DATE = 7000 TIME PERCENT MONEY @@ -405,7 +405,8 @@ cdef enum symbol_t: ORDINAL CARDINAL - acomp + acl = 8000 + acomp advcl advmod agent @@ -458,5 +459,3 @@ cdef enum symbol_t: rcmod root xcomp - - acl From 6734cfec8881f280aeaf952a31feecac48a8b93d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 7 Mar 2019 12:14:37 +0100 Subject: [PATCH 045/207] Add comment --- spacy/morphology.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index a4759e4ab..585a004b4 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -204,6 +204,7 @@ cdef class Morphology: pos = 0 cdef attr_t lemma = <attr_t>self._cache.get(tag_id, token.lex.orth) if lemma == 0: + # Ugh, self.lemmatize has opposite arg order from self.lemmatizer :( lemma = self.lemmatize(pos, token.lex.orth, features) self._cache.set(tag_id, token.lex.orth, <void*>lemma) token.lemma = lemma From d0ca64bb07476894b35d2881c39bfa6f7a555bee Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 7 Mar 2019 12:14:53 +0100 Subject: [PATCH 046/207] Fix imports in morphanalysis --- spacy/tokens/morphanalysis.pyx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx index df2d6ec20..09ab04d89 100644 --- a/spacy/tokens/morphanalysis.pyx +++ b/spacy/tokens/morphanalysis.pyx @@ -1,3 +1,6 @@ +from ..vocab cimport Vocab +from ..typedefs cimport hash_t + cdef class Morphanalysis: """Control access to morphological features for a token.""" def __init__(self, Vocab vocab, features=None): From bcfe3bd3122a61147d31425566060da90e997115 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 7 Mar 2019 12:51:11 +0100 Subject: [PATCH 047/207] Fix StringStore after symbols changes --- spacy/strings.pyx | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 64954503f..0565b2a0a 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -11,11 +11,15 @@ import srsly from .compat import basestring_ from .symbols import IDS as 
SYMBOLS_BY_STR -from .symbols import NAMES as SYMBOLS_BY_INT +from . import symbols from .typedefs cimport hash_t from .errors import Errors from . import util +SYMBOLS_BY_INT = {} +for name in symbols.NAMES: + SYMBOLS_BY_INT[SYMBOLS_BY_STR[name]] = name +print(SYMBOLS_BY_INT[6005]) def get_string_id(key): """Get a string ID, handling the reserved symbols correctly. If the key is @@ -116,6 +120,8 @@ cdef class StringStore: return u'' elif string_or_id in SYMBOLS_BY_STR: return SYMBOLS_BY_STR[string_or_id] + elif string_or_id in SYMBOLS_BY_INT: + return SYMBOLS_BY_INT[string_or_id] cdef hash_t key if isinstance(string_or_id, unicode): key = hash_string(string_or_id) @@ -123,8 +129,6 @@ cdef class StringStore: elif isinstance(string_or_id, bytes): key = hash_utf8(string_or_id, len(string_or_id)) return key - elif string_or_id < len(SYMBOLS_BY_INT): - return SYMBOLS_BY_INT[string_or_id] else: key = string_or_id self.hits.insert(key) @@ -181,11 +185,14 @@ cdef class StringStore: string (unicode): The string to check. RETURNS (bool): Whether the store contains the string. """ + global SYMBOLS_BY_INT cdef hash_t key if isinstance(string, int) or isinstance(string, long): if string == 0: return True key = string + if key in SYMBOLS_BY_INT: + return True elif len(string) == 0: return True elif string in SYMBOLS_BY_STR: @@ -195,11 +202,8 @@ cdef class StringStore: else: string = string.encode('utf8') key = hash_utf8(string, len(string)) - if key < len(SYMBOLS_BY_INT): - return True - else: - self.hits.insert(key) - return self._map.get(key) is not NULL + self.hits.insert(key) + return self._map.get(key) is not NULL def __iter__(self): """Iterate over the strings in the store, in order. From c773b5011c9e1cbe7cc8b8f7e93bb73c56ba0266 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 7 Mar 2019 12:52:15 +0100 Subject: [PATCH 048/207] Revert "Fix StringStore after symbols changes" This reverts commit bcfe3bd3122a61147d31425566060da90e997115. --- spacy/strings.pyx | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 0565b2a0a..64954503f 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -11,15 +11,11 @@ import srsly from .compat import basestring_ from .symbols import IDS as SYMBOLS_BY_STR -from . import symbols +from .symbols import NAMES as SYMBOLS_BY_INT from .typedefs cimport hash_t from .errors import Errors from . import util -SYMBOLS_BY_INT = {} -for name in symbols.NAMES: - SYMBOLS_BY_INT[SYMBOLS_BY_STR[name]] = name -print(SYMBOLS_BY_INT[6005]) def get_string_id(key): """Get a string ID, handling the reserved symbols correctly. If the key is @@ -120,8 +116,6 @@ cdef class StringStore: return u'' elif string_or_id in SYMBOLS_BY_STR: return SYMBOLS_BY_STR[string_or_id] - elif string_or_id in SYMBOLS_BY_INT: - return SYMBOLS_BY_INT[string_or_id] cdef hash_t key if isinstance(string_or_id, unicode): key = hash_string(string_or_id) @@ -129,6 +123,8 @@ cdef class StringStore: elif isinstance(string_or_id, bytes): key = hash_utf8(string_or_id, len(string_or_id)) return key + elif string_or_id < len(SYMBOLS_BY_INT): + return SYMBOLS_BY_INT[string_or_id] else: key = string_or_id self.hits.insert(key) @@ -185,14 +181,11 @@ cdef class StringStore: string (unicode): The string to check. RETURNS (bool): Whether the store contains the string. 
""" - global SYMBOLS_BY_INT cdef hash_t key if isinstance(string, int) or isinstance(string, long): if string == 0: return True key = string - if key in SYMBOLS_BY_INT: - return True elif len(string) == 0: return True elif string in SYMBOLS_BY_STR: @@ -202,8 +195,11 @@ cdef class StringStore: else: string = string.encode('utf8') key = hash_utf8(string, len(string)) - self.hits.insert(key) - return self._map.get(key) is not NULL + if key < len(SYMBOLS_BY_INT): + return True + else: + self.hits.insert(key) + return self._map.get(key) is not NULL def __iter__(self): """Iterate over the strings in the store, in order. From 74db1d9602c13feffbf1fd4fd03ecf6297e973f7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 7 Mar 2019 12:52:30 +0100 Subject: [PATCH 049/207] Revert "Space out symbols enum, to make maintaining easier" This reverts commit be5235369cc23ad6838c21f0980c90cc20dc4f00. --- spacy/attrs.pxd | 13 ++++++------- spacy/symbols.pxd | 19 ++++++++++--------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd index a70fae04b..79a177ba9 100644 --- a/spacy/attrs.pxd +++ b/spacy/attrs.pxd @@ -1,9 +1,7 @@ -from . cimport symbols - # Reserve 64 values for flag features cdef enum attr_id_t: - NULL_ATTR = 0 - IS_ALPHA = symbols.IS_ALPHA + NULL_ATTR + IS_ALPHA IS_ASCII IS_DIGIT IS_LOWER @@ -22,7 +20,7 @@ cdef enum attr_id_t: IS_RIGHT_PUNCT IS_CURRENCY - FLAG19 = symbols.FLAG19 + FLAG19 = 19 FLAG20 FLAG21 FLAG22 @@ -68,7 +66,7 @@ cdef enum attr_id_t: FLAG62 FLAG63 - ID = symbols.ID + ID ORTH LOWER NORM @@ -76,7 +74,7 @@ cdef enum attr_id_t: PREFIX SUFFIX - LENGTH = symbols.LENGTH + LENGTH CLUSTER LEMMA POS @@ -88,4 +86,5 @@ cdef enum attr_id_t: SENT_START SPACY PROB + LANG diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index 1cd1f7ef7..051b92edb 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -19,7 +19,7 @@ cdef enum symbol_t: IS_RIGHT_PUNCT IS_CURRENCY - FLAG19 = 1000 + FLAG19 = 19 FLAG20 FLAG21 FLAG22 @@ -65,7 +65,7 @@ cdef enum symbol_t: FLAG62 FLAG63 - ID = 2000 + ID ORTH LOWER NORM @@ -73,7 +73,7 @@ cdef enum symbol_t: PREFIX SUFFIX - LENGTH = 3000 + LENGTH CLUSTER LEMMA POS @@ -87,7 +87,7 @@ cdef enum symbol_t: PROB LANG - ADJ = 4000 + ADJ ADP ADV AUX @@ -108,7 +108,7 @@ cdef enum symbol_t: EOL SPACE - Animacy_anim = 5000 + Animacy_anim Animacy_inan Animacy_hum # U20 Animacy_nhum @@ -385,7 +385,7 @@ cdef enum symbol_t: VerbType_mod # U VerbType_light # U - PERSON = 6000 + PERSON NORP FACILITY ORG @@ -397,7 +397,7 @@ cdef enum symbol_t: LANGUAGE LAW - DATE = 7000 + DATE TIME PERCENT MONEY @@ -405,8 +405,7 @@ cdef enum symbol_t: ORDINAL CARDINAL - acl = 8000 - acomp + acomp advcl advmod agent @@ -459,3 +458,5 @@ cdef enum symbol_t: rcmod root xcomp + + acl From b69013e2d7805d3c870f099fa32046e0d2dbe994 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 7 Mar 2019 13:11:38 +0100 Subject: [PATCH 050/207] Fix passing of morphological features to lemmatizer --- spacy/lemmatizer.py | 12 ------------ spacy/morphology.pyx | 11 ++++++++++- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index c708800e9..99f157e05 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -47,17 +47,6 @@ class Lemmatizer(object): Check whether we're dealing with an uninflected paradigm, so we can avoid lemmatization entirely. 
""" - morphology = {} if morphology is None else morphology - morphology = dict(morphology) - for key, value in list(morphology.items()): - if value is True: - feat, value = key.split('_') - morphology[feat] = value - others = [ - key - for key in morphology - if key not in (POS, "Number", "POS", "VerbForm", "Tense") - ] if univ_pos == "noun" and morphology.get("Number") == "sing": return True elif univ_pos == "verb" and morphology.get("VerbForm") == "inf": @@ -68,7 +57,6 @@ class Lemmatizer(object): morphology.get("VerbForm") == "fin" and morphology.get("Tense") == "pres" and morphology.get("Number") is None - and not others ): return True elif univ_pos == "adj" and morphology.get("Degree") == "pos": diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 585a004b4..40c7f66af 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -141,7 +141,16 @@ cdef class Morphology: return self.strings.add(py_string.lower()) cdef list lemma_strings cdef unicode lemma_string - lemma_strings = self.lemmatizer(py_string, univ_pos, morphology) + # Normalize features into a dict keyed by the field, to make life easier + # for the lemmatizer. Handles string-to-int conversion too. + string_feats = {} + for key, value in morphology.items(): + if value is True: + name, value = self.strings.as_string(key).split('_', 1) + string_feats[name] = value + else: + string_feats[self.strings.as_string(key)] = self.strings.as_string(value) + lemma_strings = self.lemmatizer(py_string, univ_pos, string_feats) lemma_string = lemma_strings[0] lemma = self.strings.add(lemma_string) return lemma From b9ade7d4e090a2bd20626aa96ffa310031871c23 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 7 Mar 2019 14:03:07 +0100 Subject: [PATCH 051/207] Add MorphAnalysisC struct --- spacy/morphology.pxd | 51 +---- spacy/morphology.pyx | 26 +-- spacy/structs.pxd | 46 ++++ spacy/tokens/morphanalysis.pyx | 371 +++++++++++++++++++++++++++++++-- 4 files changed, 420 insertions(+), 74 deletions(-) diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index adc5e5574..24e54bdee 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -3,7 +3,7 @@ from preshed.maps cimport PreshMap, PreshMapArray from libc.stdint cimport uint64_t from murmurhash cimport mrmr -from .structs cimport TokenC +from .structs cimport TokenC, MorphAnalysisC from .strings cimport StringStore from .typedefs cimport hash_t, attr_t, flags_t from .parts_of_speech cimport univ_pos_t @@ -24,7 +24,7 @@ cdef class Morphology: cdef readonly int n_tags cpdef update(self, hash_t morph, features) - cdef hash_t insert(self, RichTagC tag) except 0 + cdef hash_t insert(self, MorphAnalysisC tag) except 0 cdef int assign_untagged(self, TokenC* token) except -1 cdef int assign_tag(self, TokenC* token, tag) except -1 @@ -416,50 +416,3 @@ cdef enum univ_morph_t: Voice_int # hb end_Voice - -cdef struct RichTagC: - univ_pos_t pos - - univ_morph_t abbr - univ_morph_t adp_type - univ_morph_t adv_type - univ_morph_t animacy - univ_morph_t aspect - univ_morph_t case - univ_morph_t conj_type - univ_morph_t connegative - univ_morph_t definite - univ_morph_t degree - univ_morph_t derivation - univ_morph_t echo - univ_morph_t foreign - univ_morph_t gender - univ_morph_t hyph - univ_morph_t inf_form - univ_morph_t mood - univ_morph_t negative - univ_morph_t number - univ_morph_t name_type - univ_morph_t noun_type - univ_morph_t num_form - univ_morph_t num_type - univ_morph_t num_value - univ_morph_t part_form - univ_morph_t part_type - 
univ_morph_t person - univ_morph_t polite - univ_morph_t polarity - univ_morph_t poss - univ_morph_t prefix - univ_morph_t prep_case - univ_morph_t pron_type - univ_morph_t punct_side - univ_morph_t punct_type - univ_morph_t reflex - univ_morph_t style - univ_morph_t style_variant - univ_morph_t tense - univ_morph_t typo - univ_morph_t verb_form - univ_morph_t voice - univ_morph_t verb_type diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 40c7f66af..52acfedfb 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -111,13 +111,13 @@ cdef class Morphology: print(list(NAMES.keys())[:10]) print(NAMES.get(feature-1), NAMES.get(feature+1)) raise KeyError("Unknown feature: %d" % feature) - cdef RichTagC tag + cdef MorphAnalysisC tag tag = create_rich_tag(features) cdef hash_t key = self.insert(tag) return key def get(self, hash_t morph): - tag = <RichTagC*>self.tags.get(morph) + tag = <MorphAnalysisC*>self.tags.get(morph) if tag == NULL: return [] else: @@ -125,7 +125,7 @@ cdef class Morphology: cpdef update(self, hash_t morph, features): """Update a morphological analysis with new feature values.""" - tag = (<RichTagC*>self.tags.get(morph))[0] + tag = (<MorphAnalysisC*>self.tags.get(morph))[0] features = intify_features(features) cdef univ_morph_t feature for feature in features: @@ -168,10 +168,10 @@ cdef class Morphology: attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) self.exc[(tag_str, self.strings.add(orth_str))] = attrs - cdef hash_t insert(self, RichTagC tag) except 0: + cdef hash_t insert(self, MorphAnalysisC tag) except 0: cdef hash_t key = hash_tag(tag) if self.tags.get(key) == NULL: - tag_ptr = <RichTagC*>self.mem.alloc(1, sizeof(RichTagC)) + tag_ptr = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC)) tag_ptr[0] = tag self.tags.set(key, <void*>tag_ptr) return key @@ -240,7 +240,7 @@ cdef class Morphology: def to_bytes(self): json_tags = [] for key in self.tags: - tag_ptr = <RichTagC*>self.tags.get(key) + tag_ptr = <MorphAnalysisC*>self.tags.get(key) if tag_ptr != NULL: json_tags.append(tag_to_json(tag_ptr[0])) return srsly.json_dumps(json_tags) @@ -261,18 +261,18 @@ cpdef univ_pos_t get_int_tag(pos_): cpdef intify_features(features): return {IDS.get(feature, feature) for feature in features} -cdef hash_t hash_tag(RichTagC tag) nogil: +cdef hash_t hash_tag(MorphAnalysisC tag) nogil: return mrmr.hash64(&tag, sizeof(tag), 0) -cdef RichTagC create_rich_tag(features) except *: - cdef RichTagC tag +cdef MorphAnalysisC create_rich_tag(features) except *: + cdef MorphAnalysisC tag cdef univ_morph_t feature memset(&tag, 0, sizeof(tag)) for feature in features: set_feature(&tag, feature, 1) return tag -cdef tag_to_json(RichTagC tag): +cdef tag_to_json(MorphAnalysisC tag): features = [] if tag.abbr != 0: features.append(NAMES[tag.abbr]) @@ -360,11 +360,11 @@ cdef tag_to_json(RichTagC tag): features.append(NAMES[tag.verb_type]) return features -cdef RichTagC tag_from_json(json_tag): - cdef RichTagC tag +cdef MorphAnalysisC tag_from_json(json_tag): + cdef MorphAnalysisC tag return tag -cdef int set_feature(RichTagC* tag, univ_morph_t feature, int value) except -1: +cdef int set_feature(MorphAnalysisC* tag, univ_morph_t feature, int value) except -1: if value == True: value_ = feature else: diff --git a/spacy/structs.pxd b/spacy/structs.pxd index 9f7904919..7452123c0 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -74,4 +74,50 @@ cdef struct TokenC: hash_t ent_id +cdef struct MorphAnalysisC: + univ_pos_t pos + + attr_t abbr + attr_t adp_type 
+ attr_t adv_type + attr_t animacy + attr_t aspect + attr_t case + attr_t conj_type + attr_t connegative + attr_t definite + attr_t degree + attr_t derivation + attr_t echo + attr_t foreign + attr_t gender + attr_t hyph + attr_t inf_form + attr_t mood + attr_t negative + attr_t number + attr_t name_type + attr_t noun_type + attr_t num_form + attr_t num_type + attr_t num_value + attr_t part_form + attr_t part_type + attr_t person + attr_t polite + attr_t polarity + attr_t poss + attr_t prefix + attr_t prep_case + attr_t pron_type + attr_t punct_side + attr_t punct_type + attr_t reflex + attr_t style + attr_t style_variant + attr_t tense + attr_t typo + attr_t verb_form + attr_t voice + attr_t verb_type diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx index 09ab04d89..722f97994 100644 --- a/spacy/tokens/morphanalysis.pyx +++ b/spacy/tokens/morphanalysis.pyx @@ -1,10 +1,14 @@ from ..vocab cimport Vocab from ..typedefs cimport hash_t + cdef class Morphanalysis: """Control access to morphological features for a token.""" - def __init__(self, Vocab vocab, features=None): - pass + def __init__(self, Vocab vocab, features=tuple()): + self.vocab = vocab + self.key = self.vocab.morphology.add(features) + analysis = <const MorphAnalysisC*>self.vocab.morphology.tags.get(self.key) + self.c = analysis[0] @classmethod def from_id(self, Vocab vocab, hash_t key): @@ -28,6 +32,12 @@ cdef class Morphanalysis: def __hash__(self): pass + def get(self, name): + pass + + def to_json(self): + pass + @property def is_base_form(self): pass @@ -44,17 +54,354 @@ cdef class Morphanalysis: def id(self): pass - def get(self, name): - pass + property abbr: + def __get__(self): + pass - def set(self, name, value): - pass + property adp_type: + def __get__(self): + pass - def add(self, feature): - pass + property adv_type: + def __get__(self): + pass - def remove(self, feature): - pass + property animacy: + def __get__(self): + pass - def to_json(self): - pass + property aspect: + def __get__(self): + pass + + property case: + def __get__(self): + pass + + property conj_type: + def __get__(self): + pass + + property connegative: + def __get__(self): + pass + + property definite: + def __get__(self): + pass + + property degree: + def __get__(self): + pass + + property derivation: + def __get__(self): + pass + + property echo: + def __get__(self): + pass + + property foreign: + def __get__(self): + pass + + property gender: + def __get__(self): + pass + + property hyph: + def __get__(self): + pass + + property inf_form: + def __get__(self): + pass + + property name_type: + def __get__(self): + pass + + property negative: + def __get__(self): + pass + + property mood: + def __get__(self): + pass + + property name_type: + def __get__(self): + pass + + property negative: + def __get__(self): + pass + + property number: + def __get__(self): + pass + + property num_form: + def __get__(self): + pass + + property num_type: + def __get__(self): + pass + + property num_value: + def __get__(self): + pass + + property part_form: + def __get__(self): + pass + + property part_type: + def __get__(self): + pass + + property person: + def __get__(self): + pass + + property polite: + def __get__(self): + pass + + property polarity: + def __get__(self): + pass + + property poss: + def __get__(self): + pass + + property prefix: + def __get__(self): + pass + + property prep_case: + def __get__(self): + pass + + property pron_type: + def __get__(self): + pass + + property punct_side: + def __get__(self): + pass + + 
property punct_type: + def __get__(self): + pass + + property reflex: + def __get__(self): + pass + + property style: + def __get__(self): + pass + + property style_variant: + def __get__(self): + pass + + property tense: + def __get__(self): + pass + + property typo: + def __get__(self): + pass + + property verb_form: + def __get__(self): + pass + + property voice: + def __get__(self): + pass + + property verb_type: + def __get__(self): + pass + + property abbr_: + def __get__(self): + pass + + property adp_type_: + def __get__(self): + pass + + property adv_type_: + def __get__(self): + pass + + property animacy_: + def __get__(self): + pass + + property aspect_: + def __get__(self): + pass + + property case_: + def __get__(self): + pass + + property conj_type_: + def __get__(self): + pass + + property connegative_: + def __get__(self): + pass + + property definite_: + def __get__(self): + pass + + property degree_: + def __get__(self): + pass + + property derivation_: + def __get__(self): + pass + + property echo_: + def __get__(self): + pass + + property foreign_: + def __get__(self): + pass + + property gender_: + def __get__(self): + pass + + property hyph_: + def __get__(self): + pass + + property inf_form_: + def __get__(self): + pass + + property name_type_: + def __get__(self): + pass + + property negative_: + def __get__(self): + pass + + property mood_: + def __get__(self): + pass + + property name_type_: + def __get__(self): + pass + + property negative_: + def __get__(self): + pass + + property number_: + def __get__(self): + pass + + property num_form_: + def __get__(self): + pass + + property num_type_: + def __get__(self): + pass + + property num_value_: + def __get__(self): + pass + + property part_form_: + def __get__(self): + pass + + property part_type_: + def __get__(self): + pass + + property person_: + def __get__(self): + pass + + property polite_: + def __get__(self): + pass + + property polarity_: + def __get__(self): + pass + + property poss_: + def __get__(self): + pass + + property prefix_: + def __get__(self): + pass + + property prep_case_: + def __get__(self): + pass + + property pron_type_: + def __get__(self): + pass + + property punct_side_: + def __get__(self): + pass + + property punct_type_: + def __get__(self): + pass + + property reflex_: + def __get__(self): + pass + + property style_: + def __get__(self): + pass + + property style_variant_: + def __get__(self): + pass + + property tense_: + def __get__(self): + pass + + property typo_: + def __get__(self): + pass + + property verb_form_: + def __get__(self): + pass + + property voice_: + def __get__(self): + pass + + property verb_type_: + def __get__(self): + pass From 932d7dde1c5e549dd0b92397c8bd6b00aab9ab0f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 7 Mar 2019 14:34:54 +0100 Subject: [PATCH 052/207] Fix compile error --- setup.py | 1 + spacy/tokens/morphanalysis.pyx | 22 +++++----------------- 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/setup.py b/setup.py index f193d0498..b1b3785eb 100755 --- a/setup.py +++ b/setup.py @@ -56,6 +56,7 @@ MOD_NAMES = [ "spacy.tokens.doc", "spacy.tokens.span", "spacy.tokens.token", + "spacy.tokens.morphanalysis", "spacy.tokens._retokenize", "spacy.matcher.matcher", "spacy.matcher.phrasematcher", diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx index 722f97994..01ecf458b 100644 --- a/spacy/tokens/morphanalysis.pyx +++ b/spacy/tokens/morphanalysis.pyx @@ -2,7 +2,7 @@ from ..vocab 
cimport Vocab from ..typedefs cimport hash_t -cdef class Morphanalysis: +cdef class MorphAnalysis: """Control access to morphological features for a token.""" def __init__(self, Vocab vocab, features=tuple()): self.vocab = vocab @@ -118,14 +118,6 @@ cdef class Morphanalysis: def __get__(self): pass - property name_type: - def __get__(self): - pass - - property negative: - def __get__(self): - pass - property mood: def __get__(self): pass @@ -138,6 +130,10 @@ cdef class Morphanalysis: def __get__(self): pass + property noun_type: + def __get__(self): + pass + property number: def __get__(self): pass @@ -306,14 +302,6 @@ cdef class Morphanalysis: def __get__(self): pass - property name_type_: - def __get__(self): - pass - - property negative_: - def __get__(self): - pass - property number_: def __get__(self): pass From fed0371db753765425521243b5325fd09296dd4a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 7 Mar 2019 17:14:57 +0100 Subject: [PATCH 053/207] Remove enums from morphology --- spacy/morphology.pxd | 385 ------ spacy/morphology.pyx | 1104 +++++++---------- spacy/pipeline/morphologizer.pyx | 6 +- spacy/structs.pxd | 1 - spacy/tests/doc/test_retokenize_merge.py | 1 - spacy/tests/morphology/test_morph_features.py | 8 +- spacy/tokens/token.pyx | 5 + 7 files changed, 487 insertions(+), 1023 deletions(-) diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 24e54bdee..a057e8ed8 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -31,388 +31,3 @@ cdef class Morphology: cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1 cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1 - - -cdef enum univ_morph_t: - NIL = 0 - - begin_Abbr - Abbr_yes - end_Abbr - - begin_AdpType - AdpType_circ - AdpType_comprep - AdpType_prep - AdpType_post - AdpType_voc - end_AdpType - - begin_AdvType - AdvType_adadj - AdvType_cau - AdvType_deg - AdvType_ex - AdvType_loc - AdvType_man - AdvType_mod - AdvType_sta - AdvType_tim - end_AdvType - - begin_Animacy - Animacy_anim - Animacy_hum - Animacy_inan - Animacy_nhum - end_Animacy - - begin_Aspect - Aspect_freq - Aspect_imp - Aspect_mod - Aspect_none - Aspect_perf - end_Aspect - - begin_Case - Case_abe - Case_abl - Case_abs - Case_acc - Case_ade - Case_all - Case_cau - Case_com - Case_dat - Case_del - Case_dis - Case_ela - Case_ess - Case_gen - Case_ill - Case_ine - Case_ins - Case_loc - Case_lat - Case_nom - Case_par - Case_sub - Case_sup - Case_tem - Case_ter - Case_tra - Case_voc - end_Case - - begin_ConjType - ConjType_comp # cz, U - ConjType_oper # cz, U - end_ConjType - begin_Connegative - Connegative_yes # fi - end_Connegative - - begin_Definite - Definite_cons # U20 - Definite_def - Definite_ind - Definite_red - Definite_two - end_Definite - - begin_Degree - Degree_abs - Degree_cmp - Degree_comp - Degree_none - Degree_pos - Degree_sup - Degree_com - Degree_dim # du - end_Degree - - begin_Derivation - Derivation_minen # fi - Derivation_sti # fi - Derivation_inen # fi - Derivation_lainen # fi - Derivation_ja # fi - Derivation_ton # fi - Derivation_vs # fi - Derivation_ttain # fi - Derivation_ttaa # fi - end_Derivation - - begin_Echo - Echo_rdp # U - Echo_ech # U - end_Echo - - begin_Foreign - Foreign_foreign # cz, fi, U - Foreign_fscript # cz, fi, U - Foreign_tscript # cz, U - Foreign_yes # sl - end_Foreign - - begin_Gender - Gender_com - Gender_fem - Gender_masc - Gender_neut - Gender_dat_masc # bq, U - Gender_dat_fem # bq, U - Gender_erg_masc # bq - Gender_erg_fem # 
bq - Gender_psor_masc # cz, sl, U - Gender_psor_fem # cz, sl, U - Gender_psor_neut # sl - end_Gender - - begin_Hyph - Hyph_yes # cz, U - end_Hyph - - begin_InfForm - InfForm_one # fi - InfForm_two # fi - InfForm_three # fi - end_InfForm - - begin_Mood - Mood_cnd - Mood_imp - Mood_ind - Mood_n - Mood_pot - Mood_sub - Mood_opt - end_Mood - - begin_NameType - NameType_geo # U, cz - NameType_prs # U, cz - NameType_giv # U, cz - NameType_sur # U, cz - NameType_nat # U, cz - NameType_com # U, cz - NameType_pro # U, cz - NameType_oth # U, cz - end_NameType - - begin_Negative - Negative_neg - Negative_pos - Negative_yes - end_Negative - - begin_NounType - NounType_com # U - NounType_prop # U - NounType_class # U - end_NounType - - begin_Number - Number_com - Number_dual - Number_none - Number_plur - Number_sing - Number_ptan # bg - Number_count # bg - Number_abs_sing # bq, U - Number_abs_plur # bq, U - Number_dat_sing # bq, U - Number_dat_plur # bq, U - Number_erg_sing # bq, U - Number_erg_plur # bq, U - Number_psee_sing # U - Number_psee_plur # U - Number_psor_sing # cz, fi, sl, U - Number_psor_plur # cz, fi, sl, U - end_Number - - begin_NumForm - NumForm_digit # cz, sl, U - NumForm_roman # cz, sl, U - NumForm_word # cz, sl, U - end_NumForm - - begin_NumType - NumType_card - NumType_dist - NumType_frac - NumType_gen - NumType_mult - NumType_none - NumType_ord - NumType_sets - end_NumType - - begin_NumValue - NumValue_one # cz, U - NumValue_two # cz, U - NumValue_three # cz, U - end_NumValue - - begin_PartForm - PartForm_pres # fi - PartForm_past # fi - PartForm_agt # fi - PartForm_neg # fi - end_PartForm - - begin_PartType - PartType_mod # U - PartType_emp # U - PartType_res # U - PartType_inf # U - PartType_vbp # U - end_PartType - - begin_Person - Person_one - Person_two - Person_three - Person_none - Person_abs_one # bq, U - Person_abs_two # bq, U - Person_abs_three # bq, U - Person_dat_one # bq, U - Person_dat_two # bq, U - Person_dat_three # bq, U - Person_erg_one # bq, U - Person_erg_two # bq, U - Person_erg_three # bq, U - Person_psor_one # fi, U - Person_psor_two # fi, U - Person_psor_three # fi, U - end_Person - - begin_Polarity - Polarity_neg # U20 - Polarity_pos # U20 - end_Polarity - - begin_Polite - Polite_inf # bq, U - Polite_pol # bq, U - Polite_abs_inf # bq, U - Polite_abs_pol # bq, U - Polite_erg_inf # bq, U - Polite_erg_pol # bq, U - Polite_dat_inf # bq, U - Polite_dat_pol # bq, U - end_Polite - - begin_Poss - Poss_yes - end_Poss - - begin_Prefix - Prefix_yes # U - end_Prefix - - begin_PrepCase - PrepCase_npr # cz - PrepCase_pre # U - end_PrepCase - - begin_PronType - PronType_advPart - PronType_art - PronType_default - PronType_dem - PronType_ind - PronType_int - PronType_neg - PronType_prs - PronType_rcp - PronType_rel - PronType_tot - PronType_clit - PronType_exc # es, ca, it, fa - end_PronType - - begin_PunctSide - PunctSide_ini # U - PunctSide_fin # U - end_PunctSide - - begin_PunctType - PunctType_peri # U - PunctType_qest # U - PunctType_excl # U - PunctType_quot # U - PunctType_brck # U - PunctType_comm # U - PunctType_colo # U - PunctType_semi # U - PunctType_dash # U - end_PunctType - - begin_Reflex - Reflex_yes - end_Reflex - - begin_Style - Style_arch # cz, fi, U - Style_rare # cz, fi, U - Style_poet # cz, U - Style_norm # cz, U - Style_coll # cz, U - Style_vrnc # cz, U - Style_sing # cz, U - Style_expr # cz, U - Style_derg # cz, U - Style_vulg # cz, U - Style_yes # fi, U - end_Style - - begin_StyleVariant - StyleVariant_styleShort # cz - StyleVariant_styleBound # 
cz, sl - end_StyleVariant - - begin_Tense - Tense_fut - Tense_imp - Tense_past - Tense_pres - end_Tense - - begin_Typo - Typo_yes - end_Typo - - begin_VerbForm - VerbForm_fin - VerbForm_ger - VerbForm_inf - VerbForm_none - VerbForm_part - VerbForm_partFut - VerbForm_partPast - VerbForm_partPres - VerbForm_sup - VerbForm_trans - VerbForm_conv # U20 - VerbForm_gdv # la - end_VerbForm - - begin_VerbType - VerbType_aux # U - VerbType_cop # U - VerbType_mod # U - VerbType_light # U - end_VerbType - - begin_Voice - Voice_act - Voice_cau - Voice_pass - Voice_mid # gkc - Voice_int # hb - end_Voice - diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 52acfedfb..1157c2502 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -4,6 +4,7 @@ from __future__ import unicode_literals from libc.string cimport memset import srsly +from collections import Counter from .strings import get_string_id from . import symbols @@ -14,6 +15,50 @@ from .parts_of_speech import IDS as POS_IDS from .lexeme cimport Lexeme from .errors import Errors +cdef enum univ_field_t: + Field_Abbr + Field_AdpType + Field_AdvType + Field_Animacy + Field_Aspect + Field_Case + Field_ConjType + Field_Connegative + Field_Definite + Field_Degree + Field_Derivation + Field_Echo + Field_Foreign + Field_Gender + Field_Hyph + Field_InfForm + Field_Mood + Field_NameType + Field_Negative + Field_NounType + Field_Number + Field_NumForm + Field_NumType + Field_NumValue + Field_PartForm + Field_PartType + Field_Person + Field_Polite + Field_Polarity + Field_Poss + Field_Prefix + Field_PrepCase + Field_PronType + Field_PunctSide + Field_PunctType + Field_Reflex + Field_Style + Field_StyleVariant + Field_Tense + Field_Typo + Field_VerbForm + Field_Voice + Field_VerbType def _normalize_props(props): @@ -23,7 +68,7 @@ def _normalize_props(props): for key in FIELDS: if key in props: attr = '%s_%s' % (key, props[key]) - if attr in IDS: + if attr in FEATURES: props.pop(key) props[attr] = True for key, value in props.items(): @@ -43,21 +88,21 @@ def _normalize_props(props): def parse_feature(feature): - if not hasattr(feature, 'split'): - feature = NAMES[feature] - key, value = feature.split('_') - begin = 'begin_%s' % key - # Note that this includes a 0 offset for the field, for no entry - offset = IDS[feature] - IDS[begin] - field_id = FIELDS[key] - return (field_id, offset) + field = FEATURE_FIELDS[feature] + offset = FEATURE_OFFSETS[feature] + return (field, offset) + + +def get_field_id(feature): + return FEATURE_FIELDS[feature] def get_field_size(field): - begin = 'begin_%s' % field - end = 'end_%s' % field - # Extra field for no entry -- always 0 - return IDS[end] - IDS[begin] + return FIELD_SIZES[field] + + +def get_field_offset(field): + return FIELD_OFFSETS[field] cdef class Morphology: @@ -105,11 +150,9 @@ cdef class Morphology: present. Returns the hash of the new analysis. 
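A toy version of the table bookkeeping this patch introduces may help here; the full FEATURES list and Field_* enum appear further down in the hunk. The miniature FEATURES/FIELDS values below are invented, but the derivation of FEATURE_FIELDS, FIELD_SIZES and FEATURE_OFFSETS mirrors the code added at the bottom of morphology.pyx.

from collections import Counter

FEATURES = ["Abbr_yes", "Case_gen", "Case_nom", "Number_sing", "Number_plur"]
FIELDS = {"Abbr": 0, "Case": 1, "Number": 2}   # enum values in the real code

FEATURE_FIELDS = {feat: FIELDS[feat.split("_", 1)[0]] for feat in FEATURES}
FIELD_SIZES = Counter(FEATURE_FIELDS.values())  # how many values per field
FEATURE_OFFSETS = {}
seen = Counter()
for feat in FEATURES:
    field = FEATURE_FIELDS[feat]
    FEATURE_OFFSETS[feat] = seen[field]         # position within its field
    seen[field] += 1

def parse_feature(feature):
    return FEATURE_FIELDS[feature], FEATURE_OFFSETS[feature]

print(parse_feature("Case_nom"))   # -> (1, 1): Case field, second value in it
print(FIELD_SIZES[1])              # -> 2: the Case field has two values here

Each feature is thus addressed by (field, offset-within-field), which is what the rewritten parse_feature returns, and FIELD_SIZES is what the morphologizer reads its per-field output sizes from.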
""" features = intify_features(features) - cdef univ_morph_t feature + cdef attr_t feature for feature in features: - if feature != 0 and feature not in NAMES: - print(list(NAMES.keys())[:10]) - print(NAMES.get(feature-1), NAMES.get(feature+1)) + if feature != 0 and feature not in FEATURE_NAMES: raise KeyError("Unknown feature: %d" % feature) cdef MorphAnalysisC tag tag = create_rich_tag(features) @@ -127,9 +170,10 @@ cdef class Morphology: """Update a morphological analysis with new feature values.""" tag = (<MorphAnalysisC*>self.tags.get(morph))[0] features = intify_features(features) - cdef univ_morph_t feature + cdef attr_t feature for feature in features: - set_feature(&tag, feature, 1) + field = get_field_id(feature) + set_feature(&tag, field, feature, 1) morph = self.insert(tag) return morph @@ -259,729 +303,531 @@ cpdef univ_pos_t get_int_tag(pos_): return <univ_pos_t>0 cpdef intify_features(features): - return {IDS.get(feature, feature) for feature in features} + return {get_string_id(feature) for feature in features} cdef hash_t hash_tag(MorphAnalysisC tag) nogil: return mrmr.hash64(&tag, sizeof(tag), 0) + +def get_feature_field(feature): + cdef attr_t key = get_string_id(feature) + return FEATURE_FIELDS[feature] + + cdef MorphAnalysisC create_rich_tag(features) except *: cdef MorphAnalysisC tag - cdef univ_morph_t feature + cdef attr_t feature memset(&tag, 0, sizeof(tag)) for feature in features: - set_feature(&tag, feature, 1) + field = get_field_id(feature) + set_feature(&tag, field, feature, 1) return tag + cdef tag_to_json(MorphAnalysisC tag): features = [] if tag.abbr != 0: - features.append(NAMES[tag.abbr]) + features.append(FEATURE_NAMES[tag.abbr]) if tag.adp_type != 0: - features.append(NAMES[tag.adp_type]) + features.append(FEATURE_NAMES[tag.adp_type]) if tag.adv_type != 0: - features.append(NAMES[tag.adv_type]) + features.append(FEATURE_NAMES[tag.adv_type]) if tag.animacy != 0: - features.append(NAMES[tag.animacy]) + features.append(FEATURE_NAMES[tag.animacy]) if tag.aspect != 0: - features.append(NAMES[tag.aspect]) + features.append(FEATURE_NAMES[tag.aspect]) if tag.case != 0: - features.append(NAMES[tag.case]) + features.append(FEATURE_NAMES[tag.case]) if tag.conj_type != 0: - features.append(NAMES[tag.conj_type]) + features.append(FEATURE_NAMES[tag.conj_type]) if tag.connegative != 0: - features.append(NAMES[tag.connegative]) + features.append(FEATURE_NAMES[tag.connegative]) if tag.definite != 0: - features.append(NAMES[tag.definite]) + features.append(FEATURE_NAMES[tag.definite]) if tag.degree != 0: - features.append(NAMES[tag.degree]) + features.append(FEATURE_NAMES[tag.degree]) if tag.derivation != 0: - features.append(NAMES[tag.derivation]) + features.append(FEATURE_NAMES[tag.derivation]) if tag.echo != 0: - features.append(NAMES[tag.echo]) + features.append(FEATURE_NAMES[tag.echo]) if tag.foreign != 0: - features.append(NAMES[tag.foreign]) + features.append(FEATURE_NAMES[tag.foreign]) if tag.gender != 0: - features.append(NAMES[tag.gender]) + features.append(FEATURE_NAMES[tag.gender]) if tag.hyph != 0: - features.append(NAMES[tag.hyph]) + features.append(FEATURE_NAMES[tag.hyph]) if tag.inf_form != 0: - features.append(NAMES[tag.inf_form]) + features.append(FEATURE_NAMES[tag.inf_form]) if tag.mood != 0: - features.append(NAMES[tag.mood]) + features.append(FEATURE_NAMES[tag.mood]) if tag.negative != 0: - features.append(NAMES[tag.negative]) + features.append(FEATURE_NAMES[tag.negative]) if tag.number != 0: - features.append(NAMES[tag.number]) + 
features.append(FEATURE_NAMES[tag.number]) if tag.name_type != 0: - features.append(NAMES[tag.name_type]) + features.append(FEATURE_NAMES[tag.name_type]) if tag.noun_type != 0: - features.append(NAMES[tag.noun_type]) + features.append(FEATURE_NAMES[tag.noun_type]) if tag.num_form != 0: - features.append(NAMES[tag.num_form]) + features.append(FEATURE_NAMES[tag.num_form]) if tag.num_type != 0: - features.append(NAMES[tag.num_type]) + features.append(FEATURE_NAMES[tag.num_type]) if tag.num_value != 0: - features.append(NAMES[tag.num_value]) + features.append(FEATURE_NAMES[tag.num_value]) if tag.part_form != 0: - features.append(NAMES[tag.part_form]) + features.append(FEATURE_NAMES[tag.part_form]) if tag.part_type != 0: - features.append(NAMES[tag.part_type]) + features.append(FEATURE_NAMES[tag.part_type]) if tag.person != 0: - features.append(NAMES[tag.person]) + features.append(FEATURE_NAMES[tag.person]) if tag.polite != 0: - features.append(NAMES[tag.polite]) + features.append(FEATURE_NAMES[tag.polite]) if tag.polarity != 0: - features.append(NAMES[tag.polarity]) + features.append(FEATURE_NAMES[tag.polarity]) if tag.poss != 0: - features.append(NAMES[tag.poss]) + features.append(FEATURE_NAMES[tag.poss]) if tag.prefix != 0: - features.append(NAMES[tag.prefix]) + features.append(FEATURE_NAMES[tag.prefix]) if tag.prep_case != 0: - features.append(NAMES[tag.prep_case]) + features.append(FEATURE_NAMES[tag.prep_case]) if tag.pron_type != 0: - features.append(NAMES[tag.pron_type]) + features.append(FEATURE_NAMES[tag.pron_type]) if tag.punct_side != 0: - features.append(NAMES[tag.punct_side]) + features.append(FEATURE_NAMES[tag.punct_side]) if tag.punct_type != 0: - features.append(NAMES[tag.punct_type]) + features.append(FEATURE_NAMES[tag.punct_type]) if tag.reflex != 0: - features.append(NAMES[tag.reflex]) + features.append(FEATURE_NAMES[tag.reflex]) if tag.style != 0: - features.append(NAMES[tag.style]) + features.append(FEATURE_NAMES[tag.style]) if tag.style_variant != 0: - features.append(NAMES[tag.style_variant]) + features.append(FEATURE_NAMES[tag.style_variant]) if tag.tense != 0: - features.append(NAMES[tag.tense]) + features.append(FEATURE_NAMES[tag.tense]) if tag.verb_form != 0: - features.append(NAMES[tag.verb_form]) + features.append(FEATURE_NAMES[tag.verb_form]) if tag.voice != 0: - features.append(NAMES[tag.voice]) + features.append(FEATURE_NAMES[tag.voice]) if tag.verb_type != 0: - features.append(NAMES[tag.verb_type]) + features.append(FEATURE_NAMES[tag.verb_type]) return features cdef MorphAnalysisC tag_from_json(json_tag): cdef MorphAnalysisC tag return tag -cdef int set_feature(MorphAnalysisC* tag, univ_morph_t feature, int value) except -1: +cdef int set_feature(MorphAnalysisC* tag, + univ_field_t field, attr_t feature, int value) except -1: if value == True: value_ = feature else: - value_ = NIL - if feature == NIL: + value_ = 0 + if feature == 0: pass - elif is_abbr_feature(feature): + elif field == Field_Abbr: tag.abbr = value_ - elif is_adp_type_feature(feature): + elif field == Field_AdpType: tag.adp_type = value_ - elif is_adv_type_feature(feature): + elif field == Field_AdvType: tag.adv_type = value_ - elif is_animacy_feature(feature): + elif field == Field_Animacy: tag.animacy = value_ - elif is_aspect_feature(feature): + elif field == Field_Aspect: tag.aspect = value_ - elif is_case_feature(feature): + elif field == Field_Case: tag.case = value_ - elif is_conj_type_feature(feature): + elif field == Field_ConjType: tag.conj_type = value_ - elif 
is_connegative_feature(feature): + elif field == Field_Connegative: tag.connegative = value_ - elif is_definite_feature(feature): + elif field == Field_Definite: tag.definite = value_ - elif is_degree_feature(feature): + elif field == Field_Degree: tag.degree = value_ - elif is_derivation_feature(feature): + elif field == Field_Derivation: tag.derivation = value_ - elif is_echo_feature(feature): + elif field == Field_Echo: tag.echo = value_ - elif is_foreign_feature(feature): + elif field == Field_Foreign: tag.foreign = value_ - elif is_gender_feature(feature): + elif field == Field_Gender: tag.gender = value_ - elif is_hyph_feature(feature): + elif field == Field_Hyph: tag.hyph = value_ - elif is_inf_form_feature(feature): + elif field == Field_InfForm: tag.inf_form = value_ - elif is_mood_feature(feature): + elif field == Field_Mood: tag.mood = value_ - elif is_negative_feature(feature): + elif field == Field_Negative: tag.negative = value_ - elif is_number_feature(feature): + elif field == Field_Number: tag.number = value_ - elif is_name_type_feature(feature): + elif field == Field_NameType: tag.name_type = value_ - elif is_noun_type_feature(feature): + elif field == Field_NounType: tag.noun_type = value_ - elif is_num_form_feature(feature): + elif field == Field_NumForm: tag.num_form = value_ - elif is_num_type_feature(feature): + elif field == Field_NumType: tag.num_type = value_ - elif is_num_value_feature(feature): + elif field == Field_NumValue: tag.num_value = value_ - elif is_part_form_feature(feature): + elif field == Field_PartForm: tag.part_form = value_ - elif is_part_type_feature(feature): + elif field == Field_PartType: tag.part_type = value_ - elif is_person_feature(feature): + elif field == Field_Person: tag.person = value_ - elif is_polite_feature(feature): + elif field == Field_Polite: tag.polite = value_ - elif is_polarity_feature(feature): + elif field == Field_Polarity: tag.polarity = value_ - elif is_poss_feature(feature): + elif field == Field_Poss: tag.poss = value_ - elif is_prefix_feature(feature): + elif field == Field_Prefix: tag.prefix = value_ - elif is_prep_case_feature(feature): + elif field == Field_PrepCase: tag.prep_case = value_ - elif is_pron_type_feature(feature): + elif field == Field_PronType: tag.pron_type = value_ - elif is_punct_side_feature(feature): + elif field == Field_PunctSide: tag.punct_side = value_ - elif is_punct_type_feature(feature): + elif field == Field_PunctType: tag.punct_type = value_ - elif is_reflex_feature(feature): + elif field == Field_Reflex: tag.reflex = value_ - elif is_style_feature(feature): + elif field == Field_Style: tag.style = value_ - elif is_style_variant_feature(feature): + elif field == Field_StyleVariant: tag.style_variant = value_ - elif is_tense_feature(feature): + elif field == Field_Tense: tag.tense = value_ - elif is_typo_feature(feature): + elif field == Field_Typo: tag.typo = value_ - elif is_verb_form_feature(feature): + elif field == Field_VerbForm: tag.verb_form = value_ - elif is_voice_feature(feature): + elif field == Field_Voice: tag.voice = value_ - elif is_verb_type_feature(feature): + elif field == Field_VerbType: tag.verb_type = value_ else: - raise ValueError("Unknown feature: %s (%d)" % (NAMES.get(feature), feature)) - -cdef int is_abbr_feature(univ_morph_t feature) nogil: - return feature >= begin_Abbr and feature <= end_Abbr - -cdef int is_adp_type_feature(univ_morph_t feature) nogil: - return feature >= begin_AdpType and feature <= end_AdpType - -cdef int is_adv_type_feature(univ_morph_t 
feature) nogil: - return feature >= begin_AdvType and feature <= end_AdvType - -cdef int is_animacy_feature(univ_morph_t feature) nogil: - return feature >= begin_Animacy and feature <= end_Animacy - -cdef int is_aspect_feature(univ_morph_t feature) nogil: - return feature >= begin_Aspect and feature <= end_Aspect - -cdef int is_case_feature(univ_morph_t feature) nogil: - return feature >= begin_Case and feature <= end_Case - -cdef int is_conj_type_feature(univ_morph_t feature) nogil: - return feature >= begin_ConjType and feature <= end_ConjType - -cdef int is_connegative_feature(univ_morph_t feature) nogil: - return feature >= begin_Connegative and feature <= end_Connegative - -cdef int is_definite_feature(univ_morph_t feature) nogil: - return feature >= begin_Definite and feature <= end_Definite - -cdef int is_degree_feature(univ_morph_t feature) nogil: - return feature >= begin_Degree and feature <= end_Degree - -cdef int is_derivation_feature(univ_morph_t feature) nogil: - return feature >= begin_Derivation and feature <= end_Derivation - -cdef int is_echo_feature(univ_morph_t feature) nogil: - return feature >= begin_Echo and feature <= end_Echo - -cdef int is_foreign_feature(univ_morph_t feature) nogil: - return feature >= begin_Foreign and feature <= end_Foreign - -cdef int is_gender_feature(univ_morph_t feature) nogil: - return feature >= begin_Gender and feature <= end_Gender - -cdef int is_hyph_feature(univ_morph_t feature) nogil: - return feature >= begin_Hyph and feature <= end_Hyph - -cdef int is_inf_form_feature(univ_morph_t feature) nogil: - return feature >= begin_InfForm and feature <= end_InfForm - -cdef int is_mood_feature(univ_morph_t feature) nogil: - return feature >= begin_Mood and feature <= end_Mood - -cdef int is_name_type_feature(univ_morph_t feature) nogil: - return feature >= begin_NameType and feature < end_NameType - -cdef int is_negative_feature(univ_morph_t feature) nogil: - return feature >= begin_Negative and feature <= end_Negative - -cdef int is_noun_type_feature(univ_morph_t feature) nogil: - return feature >= begin_NounType and feature <= end_NounType - -cdef int is_number_feature(univ_morph_t feature) nogil: - return feature >= begin_Number and feature <= end_Number - -cdef int is_num_form_feature(univ_morph_t feature) nogil: - return feature >= begin_NumForm and feature <= end_NumForm - -cdef int is_num_type_feature(univ_morph_t feature) nogil: - return feature >= begin_NumType and feature <= end_NumType - -cdef int is_num_value_feature(univ_morph_t feature) nogil: - return feature >= begin_NumValue and feature <= end_NumValue - -cdef int is_part_form_feature(univ_morph_t feature) nogil: - return feature >= begin_PartForm and feature <= end_PartForm - -cdef int is_part_type_feature(univ_morph_t feature) nogil: - return feature >= begin_PartType and feature <= end_PartType - -cdef int is_person_feature(univ_morph_t feature) nogil: - return feature >= begin_Person and feature <= end_Person - -cdef int is_polite_feature(univ_morph_t feature) nogil: - return feature >= begin_Polite and feature <= end_Polite - -cdef int is_polarity_feature(univ_morph_t feature) nogil: - return feature >= begin_Polarity and feature <= end_Polarity - -cdef int is_poss_feature(univ_morph_t feature) nogil: - return feature >= begin_Poss and feature <= end_Poss - -cdef int is_prefix_feature(univ_morph_t feature) nogil: - return feature >= begin_Prefix and feature <= end_Prefix - -cdef int is_prep_case_feature(univ_morph_t feature) nogil: - return feature >= begin_PrepCase 
and feature <= end_PrepCase - -cdef int is_pron_type_feature(univ_morph_t feature) nogil: - return feature >= begin_PronType and feature <= end_PronType - -cdef int is_punct_side_feature(univ_morph_t feature) nogil: - return feature >= begin_PunctSide and feature <= end_PunctSide - -cdef int is_punct_type_feature(univ_morph_t feature) nogil: - return feature >= begin_PunctType and feature <= end_PunctType - -cdef int is_reflex_feature(univ_morph_t feature) nogil: - return feature >= begin_Reflex and feature <= end_Reflex - -cdef int is_style_feature(univ_morph_t feature) nogil: - return feature >= begin_Style and feature <= end_Style - -cdef int is_style_variant_feature(univ_morph_t feature) nogil: - return feature >= begin_StyleVariant and feature <= end_StyleVariant - -cdef int is_tense_feature(univ_morph_t feature) nogil: - return feature >= begin_Tense and feature <= end_Tense - -cdef int is_typo_feature(univ_morph_t feature) nogil: - return feature >= begin_Typo and feature <= end_Typo - -cdef int is_verb_form_feature(univ_morph_t feature) nogil: - return feature >= begin_VerbForm and feature <= end_VerbForm - -cdef int is_voice_feature(univ_morph_t feature) nogil: - return feature >= begin_Voice and feature <= end_Voice - -cdef int is_verb_type_feature(univ_morph_t feature) nogil: - return feature >= begin_VerbType and feature <= end_VerbType + raise ValueError("Unknown feature: %s (%d)" % (FEATURE_NAMES.get(feature), feature)) FIELDS = { - 'Abbr': 0, - 'AdpType': 1, - 'AdvType': 2, - 'Animacy': 3, - 'Aspect': 4, - 'Case': 5, - 'ConjType': 6, - 'Connegative': 7, - 'Definite': 8, - 'Degree': 9, - 'Derivation': 10, - 'Echo': 11, - 'Foreign': 12, - 'Gender': 13, - 'Hyph': 14, - 'InfForm': 15, - 'Mood': 16, - 'NameType': 17, - 'Negative': 18, - 'Number': 19, - 'NumForm': 20, - 'NumType': 21, - 'NumValue': 22, - 'PartForm': 23, - 'PartType': 24, - 'Person': 25, - 'Polite': 26, - 'Polarity': 27, - 'Poss': 28, - 'Prefix': 29, - 'PrepCase': 30, - 'PronType': 31, - 'PunctSide': 32, - 'PunctType': 33, - 'Reflex': 34, - 'Style': 35, - 'StyleVariant': 36, - 'Tense': 37, - 'Typo': 38, - 'VerbForm': 39, - 'Voice': 40, - 'VerbType': 41 + 'Abbr': Field_Abbr, + 'AdpType': Field_AdpType, + 'AdvType': Field_AdvType, + 'Animacy': Field_Animacy, + 'Aspect': Field_Aspect, + 'Case': Field_Case, + 'ConjType': Field_ConjType, + 'Connegative': Field_Connegative, + 'Definite': Field_Definite, + 'Degree': Field_Degree, + 'Derivation': Field_Derivation, + 'Echo': Field_Echo, + 'Foreign': Field_Foreign, + 'Gender': Field_Gender, + 'Hyph': Field_Hyph, + 'InfForm': Field_InfForm, + 'Mood': Field_Mood, + 'NameType': Field_NameType, + 'Negative': Field_Negative, + 'NounType': Field_NounType, + 'Number': Field_Number, + 'NumForm': Field_NumForm, + 'NumType': Field_NumType, + 'NumValue': Field_NumValue, + 'PartForm': Field_PartForm, + 'PartType': Field_PartType, + 'Person': Field_Person, + 'Polite': Field_Polite, + 'Polarity': Field_Polarity, + 'Poss': Field_Poss, + 'Prefix': Field_Prefix, + 'PrepCase': Field_PrepCase, + 'PronType': Field_PronType, + 'PunctSide': Field_PunctSide, + 'PunctType': Field_PunctType, + 'Reflex': Field_Reflex, + 'Style': Field_Style, + 'StyleVariant': Field_StyleVariant, + 'Tense': Field_Tense, + 'Typo': Field_Typo, + 'VerbForm': Field_VerbForm, + 'Voice': Field_Voice, + 'VerbType': Field_VerbType } -IDS = { - "begin_Abbr": begin_Abbr, - "Abbr_yes": Abbr_yes , - "end_Abbr": end_Abbr, - "begin_AdpType": begin_AdpType, - "AdpType_circ": AdpType_circ, - "AdpType_comprep": AdpType_comprep, - 
"AdpType_prep ": AdpType_prep , - "AdpType_post": AdpType_post, - "AdpType_voc": AdpType_voc, - "end_AdpType": end_AdpType, - "begin_AdvType": begin_AdvType, - "AdvType_adadj": AdvType_adadj, - "AdvType_cau": AdvType_cau, - "AdvType_deg": AdvType_deg, - "AdvType_ex": AdvType_ex, - "AdvType_loc": AdvType_loc, - "AdvType_man": AdvType_man, - "AdvType_mod": AdvType_mod, - "AdvType_sta": AdvType_sta, - "AdvType_tim": AdvType_tim, - "end_AdvType": end_AdvType, - "begin_Animacy": begin_Animacy, - "Animacy_anim": Animacy_anim, - "Animacy_hum": Animacy_hum, - "Animacy_inan": Animacy_inan, - "Animacy_nhum": Animacy_nhum, - "end_Animacy": end_Animacy, - "begin_Aspect": begin_Aspect, - "Aspect_freq": Aspect_freq, - "Aspect_imp": Aspect_imp, - "Aspect_mod": Aspect_mod, - "Aspect_none": Aspect_none, - "Aspect_perf": Aspect_perf, - "end_Aspect": end_Aspect, - "begin_Case": begin_Case, - "Case_abe": Case_abe, - "Case_abl": Case_abl, - "Case_abs": Case_abs, - "Case_acc": Case_acc, - "Case_ade": Case_ade, - "Case_all": Case_all, - "Case_cau": Case_cau, - "Case_com": Case_com, - "Case_dat": Case_dat, - "Case_del": Case_del, - "Case_dis": Case_dis, - "Case_ela": Case_ela, - "Case_ess": Case_ess, - "Case_gen": Case_gen, - "Case_ill": Case_ill, - "Case_ine": Case_ine, - "Case_ins": Case_ins, - "Case_loc": Case_loc, - "Case_lat": Case_lat, - "Case_nom": Case_nom, - "Case_par": Case_par, - "Case_sub": Case_sub, - "Case_sup": Case_sup, - "Case_tem": Case_tem, - "Case_ter": Case_ter, - "Case_tra": Case_tra, - "Case_voc": Case_voc, - "end_Case": end_Case, - "begin_ConjType": begin_ConjType, - "ConjType_comp ": ConjType_comp , - "ConjType_oper": ConjType_oper, - "end_ConjType": end_ConjType, - "begin_Connegative": begin_Connegative, - "Connegative_yes": Connegative_yes, - "end_Connegative": end_Connegative, - "begin_Definite": begin_Definite, - "Definite_cons": Definite_cons, - "Definite_def": Definite_def, - "Definite_ind": Definite_ind, - "Definite_red": Definite_red, - "Definite_two": Definite_two, - "end_Definite": end_Definite, - "begin_Degree": begin_Degree, - "Degree_abs": Degree_abs, - "Degree_cmp": Degree_cmp, - "Degree_comp": Degree_comp, - "Degree_none": Degree_none, - "Degree_pos": Degree_pos, - "Degree_sup": Degree_sup, - "Degree_com": Degree_com, - "Degree_dim": Degree_dim, - "end_Degree": end_Degree, - "begin_Derivation": begin_Derivation, - "Derivation_minen": Derivation_minen, - "Derivation_sti": Derivation_sti, - "Derivation_inen": Derivation_inen, - "Derivation_lainen": Derivation_lainen, - "Derivation_ja": Derivation_ja, - "Derivation_ton": Derivation_ton, - "Derivation_vs": Derivation_vs, - "Derivation_ttain": Derivation_ttain, - "Derivation_ttaa": Derivation_ttaa, - "end_Derivation": end_Derivation, - "begin_Echo": begin_Echo, - "Echo_rdp": Echo_rdp, - "Echo_ech": Echo_ech, - "end_Echo": end_Echo, - "begin_Foreign": begin_Foreign, - "Foreign_foreign": Foreign_foreign, - "Foreign_fscript": Foreign_fscript, - "Foreign_tscript": Foreign_tscript, - "Foreign_yes": Foreign_yes, - "end_Foreign": end_Foreign, - "begin_Gender": begin_Gender, - "Gender_com": Gender_com, - "Gender_fem": Gender_fem, - "Gender_masc": Gender_masc, - "Gender_neut": Gender_neut, - "Gender_dat_masc": Gender_dat_masc, - "Gender_dat_fem": Gender_dat_fem, - "Gender_erg_masc": Gender_erg_masc, - "Gender_erg_fem": Gender_erg_fem, - "Gender_psor_masc": Gender_psor_masc, - "Gender_psor_fem": Gender_psor_fem, - "Gender_psor_neut": Gender_psor_neut, - "end_Gender": end_Gender, - "begin_Hyph": begin_Hyph, - "Hyph_yes": Hyph_yes, - 
"end_Hyph": end_Hyph, - "begin_InfForm": begin_InfForm, - "InfForm_one": InfForm_one, - "InfForm_two": InfForm_two, - "InfForm_three": InfForm_three, - "end_InfForm": end_InfForm, - "begin_Mood": begin_Mood, - "Mood_cnd": Mood_cnd, - "Mood_imp": Mood_imp, - "Mood_ind": Mood_ind, - "Mood_n": Mood_n, - "Mood_pot": Mood_pot, - "Mood_sub": Mood_sub, - "Mood_opt": Mood_opt, - "end_Mood": end_Mood, - "begin_NameType": begin_NameType, - "NameType_geo": NameType_geo, - "NameType_prs": NameType_prs, - "NameType_giv": NameType_giv, - "NameType_sur": NameType_sur, - "NameType_nat": NameType_nat, - "NameType_com": NameType_com, - "NameType_pro": NameType_pro, - "NameType_oth": NameType_oth, - "end_NameType": end_NameType, - "begin_Negative": begin_Negative, - "Negative_neg": Negative_neg, - "Negative_pos": Negative_pos, - "Negative_yes": Negative_yes, - "end_Negative": end_Negative, - "begin_NounType": begin_NounType, - "NounType_com": NounType_com, - "NounType_prop": NounType_prop, - "NounType_class": NounType_class, - "end_NounType": end_NounType, - "begin_Number": begin_Number, - "Number_com": Number_com, - "Number_dual": Number_dual, - "Number_none": Number_none, - "Number_plur": Number_plur, - "Number_sing": Number_sing, - "Number_ptan": Number_ptan, - "Number_count": Number_count, - "Number_abs_sing": Number_abs_sing, - "Number_abs_plur": Number_abs_plur, - "Number_dat_sing": Number_dat_sing, - "Number_dat_plur": Number_dat_plur, - "Number_erg_sing": Number_erg_sing, - "Number_erg_plur": Number_erg_plur, - "Number_psee_sing": Number_psee_sing, - "Number_psee_plur": Number_psee_plur, - "Number_psor_sing": Number_psor_sing, - "Number_psor_plur": Number_psor_plur, - "end_Number": end_Number, - "begin_NumForm": begin_NumForm, - "NumForm_digit": NumForm_digit, - "NumForm_roman": NumForm_roman, - "NumForm_word": NumForm_word, - "end_NumForm": end_NumForm, - "begin_NumType": begin_NumType, - "NumType_card": NumType_card, - "NumType_dist": NumType_dist, - "NumType_frac": NumType_frac, - "NumType_gen": NumType_gen, - "NumType_mult": NumType_mult, - "NumType_none": NumType_none, - "NumType_ord": NumType_ord, - "NumType_sets": NumType_sets, - "end_NumType": end_NumType, - "begin_NumValue": begin_NumValue, - "NumValue_one": NumValue_one, - "NumValue_two": NumValue_two, - "NumValue_three": NumValue_three, - "end_NumValue": end_NumValue, - "begin_PartForm": begin_PartForm, - "PartForm_pres": PartForm_pres, - "PartForm_past": PartForm_past, - "PartForm_agt": PartForm_agt, - "PartForm_neg": PartForm_neg, - "end_PartForm": end_PartForm, - "begin_PartType": begin_PartType, - "PartType_mod": PartType_mod, - "PartType_emp": PartType_emp, - "PartType_res": PartType_res, - "PartType_inf": PartType_inf, - "PartType_vbp": PartType_vbp, - "end_PartType": end_PartType, +FEATURES = [ + "Abbr_yes", + "AdpType_circ", + "AdpType_comprep", + "AdpType_prep ", + "AdpType_post", + "AdpType_voc", + "AdvType_adadj," + "AdvType_cau", + "AdvType_deg", + "AdvType_ex", + "AdvType_loc", + "AdvType_man", + "AdvType_mod", + "AdvType_sta", + "AdvType_tim", + "Animacy_anim", + "Animacy_hum", + "Animacy_inan", + "Animacy_nhum", + "Aspect_freq", + "Aspect_imp", + "Aspect_mod", + "Aspect_none", + "Aspect_perf", + "Case_abe", + "Case_abl", + "Case_abs", + "Case_acc", + "Case_ade", + "Case_all", + "Case_cau", + "Case_com", + "Case_dat", + "Case_del", + "Case_dis", + "Case_ela", + "Case_ess", + "Case_gen", + "Case_ill", + "Case_ine", + "Case_ins", + "Case_loc", + "Case_lat", + "Case_nom", + "Case_par", + "Case_sub", + "Case_sup", + "Case_tem", + 
"Case_ter", + "Case_tra", + "Case_voc", + "ConjType_comp", + "ConjType_oper", + "Connegative_yes", + "Definite_cons", + "Definite_def", + "Definite_ind", + "Definite_red", + "Definite_two", + "Degree_abs", + "Degree_cmp", + "Degree_comp", + "Degree_none", + "Degree_pos", + "Degree_sup", + "Degree_com", + "Degree_dim", + "Derivation_minen", + "Derivation_sti", + "Derivation_inen", + "Derivation_lainen", + "Derivation_ja", + "Derivation_ton", + "Derivation_vs", + "Derivation_ttain", + "Derivation_ttaa", + "Echo_rdp", + "Echo_ech", + "Foreign_foreign", + "Foreign_fscript", + "Foreign_tscript", + "Foreign_yes", + "Gender_com", + "Gender_fem", + "Gender_masc", + "Gender_neut", + "Gender_dat_masc", + "Gender_dat_fem", + "Gender_erg_masc", + "Gender_erg_fem", + "Gender_psor_masc", + "Gender_psor_fem", + "Gender_psor_neut", + "Hyph_yes", + "InfForm_one", + "InfForm_two", + "InfForm_three", + "Mood_cnd", + "Mood_imp", + "Mood_ind", + "Mood_n", + "Mood_pot", + "Mood_sub", + "Mood_opt", + "NameType_geo", + "NameType_prs", + "NameType_giv", + "NameType_sur", + "NameType_nat", + "NameType_com", + "NameType_pro", + "NameType_oth", + "Negative_neg", + "Negative_pos", + "Negative_yes", + "NounType_com", + "NounType_prop", + "NounType_class", + "Number_com", + "Number_dual", + "Number_none", + "Number_plur", + "Number_sing", + "Number_ptan", + "Number_count", + "Number_abs_sing", + "Number_abs_plur", + "Number_dat_sing", + "Number_dat_plur", + "Number_erg_sing", + "Number_erg_plur", + "Number_psee_sing", + "Number_psee_plur", + "Number_psor_sing", + "Number_psor_plur", + "NumForm_digit", + "NumForm_roman", + "NumForm_word", + "NumType_card", + "NumType_dist", + "NumType_frac", + "NumType_gen", + "NumType_mult", + "NumType_none", + "NumType_ord", + "NumType_sets", + "NumValue_one", + "NumValue_two", + "NumValue_three", + "PartForm_pres", + "PartForm_past", + "PartForm_agt", + "PartForm_neg", + "PartType_mod", + "PartType_emp", + "PartType_res", + "PartType_inf", + "PartType_vbp", + "Person_one", + "Person_two", + "Person_three", + "Person_none", + "Person_abs_one", + "Person_abs_two", + "Person_abs_three", + "Person_dat_one", + "Person_dat_two", + "Person_dat_three", + "Person_erg_one", + "Person_erg_two", + "Person_erg_three", + "Person_psor_one", + "Person_psor_two", + "Person_psor_three", + "Polarity_neg", + "Polarity_pos", + "Polite_inf", + "Polite_pol", + "Polite_abs_inf", + "Polite_abs_pol", + "Polite_erg_inf", + "Polite_erg_pol", + "Polite_dat_inf", + "Polite_dat_pol", + "Poss_yes", + "Prefix_yes", + "PrepCase_npr", + "PrepCase_pre", + "PronType_advPart", + "PronType_art", + "PronType_default", + "PronType_dem", + "PronType_ind", + "PronType_int", + "PronType_neg", + "PronType_prs", + "PronType_rcp", + "PronType_rel", + "PronType_tot", + "PronType_clit", + "PronType_exc", + "PunctSide_ini", + "PunctSide_fin", + "PunctType_peri", + "PunctType_qest", + "PunctType_excl", + "PunctType_quot", + "PunctType_brck", + "PunctType_comm", + "PunctType_colo", + "PunctType_semi", + "PunctType_dash", + "Reflex_yes", + "Style_arch", + "Style_rare", + "Style_poet", + "Style_norm", + "Style_coll", + "Style_vrnc", + "Style_sing", + "Style_expr", + "Style_derg", + "Style_vulg", + "Style_yes", + "StyleVariant_styleShort", + "StyleVariant_styleBound", + "Tense_fut", + "Tense_imp", + "Tense_past", + "Tense_pres", + "Typo_yes", + "VerbForm_fin", + "VerbForm_ger", + "VerbForm_inf", + "VerbForm_none", + "VerbForm_part", + "VerbForm_partFut", + "VerbForm_partPast", + "VerbForm_partPres", + "VerbForm_sup", + "VerbForm_trans", + 
"VerbForm_conv", + "VerbForm_gdv", + "VerbType_aux", + "VerbType_cop", + "VerbType_mod", + "VerbType_light", + "Voice_act", + "Voice_cau", + "Voice_pass", + "Voice_mid", + "Voice_int", +] - "begin_Person": begin_Person, - "Person_one": Person_one, - "Person_two": Person_two, - "Person_three": Person_three, - "Person_none": Person_none, - "Person_abs_one": Person_abs_one, - "Person_abs_two": Person_abs_two, - "Person_abs_three": Person_abs_three, - "Person_dat_one": Person_dat_one, - "Person_dat_two": Person_dat_two, - "Person_dat_three": Person_dat_three, - "Person_erg_one": Person_erg_one, - "Person_erg_two": Person_erg_two, - "Person_erg_three": Person_erg_three, - "Person_psor_one": Person_psor_one, - "Person_psor_two": Person_psor_two, - "Person_psor_three": Person_psor_three, - "end_Person": end_Person, - "begin_Polarity": begin_Polarity, - "Polarity_neg": Polarity_neg, - "Polarity_pos": Polarity_pos, - "end_Polarity": end_Polarity, - "begin_Polite": begin_Polite, - "Polite_inf": Polite_inf, - "Polite_pol": Polite_pol, - "Polite_abs_inf": Polite_abs_inf, - "Polite_abs_pol": Polite_abs_pol, - "Polite_erg_inf": Polite_erg_inf, - "Polite_erg_pol": Polite_erg_pol, - "Polite_dat_inf": Polite_dat_inf, - "Polite_dat_pol": Polite_dat_pol, - "end_Polite": end_Polite, - "begin_Poss": begin_Poss, - "Poss_yes": Poss_yes, - "end_Poss": end_Poss, - "begin_Prefix": begin_Prefix, - "Prefix_yes": Prefix_yes, - "end_Prefix": end_Prefix, - "begin_PrepCase": begin_PrepCase, - "PrepCase_npr": PrepCase_npr, - "PrepCase_pre": PrepCase_pre, - "end_PrepCase": end_PrepCase, - "begin_PronType": begin_PronType, - "PronType_advPart": PronType_advPart, - "PronType_art": PronType_art, - "PronType_default": PronType_default, - "PronType_dem": PronType_dem, - "PronType_ind": PronType_ind, - "PronType_int": PronType_int, - "PronType_neg": PronType_neg, - "PronType_prs": PronType_prs, - "PronType_rcp": PronType_rcp, - "PronType_rel": PronType_rel, - "PronType_tot": PronType_tot, - "PronType_clit": PronType_clit, - "PronType_exc": PronType_exc, - "end_PronType": end_PronType, - "begin_PunctSide": begin_PunctSide, - "PunctSide_ini": PunctSide_ini, - "PunctSide_fin": PunctSide_fin, - "end_PunctSide": end_PunctSide, - "begin_PunctType": begin_PunctType, - "PunctType_peri": PunctType_peri, - "PunctType_qest": PunctType_qest, - "PunctType_excl": PunctType_excl, - "PunctType_quot": PunctType_quot, - "PunctType_brck": PunctType_brck, - "PunctType_comm": PunctType_comm, - "PunctType_colo": PunctType_colo, - "PunctType_semi": PunctType_semi, - "PunctType_dash": PunctType_dash, - "end_PunctType": end_PunctType, - "begin_Reflex": begin_Reflex, - "Reflex_yes": Reflex_yes, - "end_Reflex": end_Reflex, - "begin_Style": begin_Style, - "Style_arch": Style_arch, - "Style_rare": Style_rare, - "Style_poet": Style_poet, - "Style_norm": Style_norm, - "Style_coll": Style_coll, - "Style_vrnc": Style_vrnc, - "Style_sing": Style_sing, - "Style_expr": Style_expr, - "Style_derg": Style_derg, - "Style_vulg": Style_vulg, - "Style_yes": Style_yes, - "end_Style": end_Style, - "begin_StyleVariant": begin_StyleVariant, - "StyleVariant_styleShort": StyleVariant_styleShort, - "StyleVariant_styleBound": StyleVariant_styleBound, - "end_StyleVariant": end_StyleVariant, - "begin_Tense": begin_Tense, - "Tense_fut": Tense_fut, - "Tense_imp": Tense_imp, - "Tense_past": Tense_past, - "Tense_pres": Tense_pres, - "end_Tense": end_Tense, - "begin_Typo": begin_Typo, - "Typo_yes": Typo_yes, - "end_Typo": end_Typo, - "begin_VerbForm": begin_VerbForm, - "VerbForm_fin": 
VerbForm_fin, - "VerbForm_ger": VerbForm_ger, - "VerbForm_inf": VerbForm_inf, - "VerbForm_none": VerbForm_none, - "VerbForm_part": VerbForm_part, - "VerbForm_partFut": VerbForm_partFut, - "VerbForm_partPast": VerbForm_partPast, - "VerbForm_partPres": VerbForm_partPres, - "VerbForm_sup": VerbForm_sup, - "VerbForm_trans": VerbForm_trans, - "VerbForm_conv": VerbForm_conv, - "VerbForm_gdv": VerbForm_gdv, - "end_VerbForm": end_VerbForm, - "begin_VerbType": begin_VerbType, - "VerbType_aux": VerbType_aux, - "VerbType_cop": VerbType_cop, - "VerbType_mod": VerbType_mod, - "VerbType_light": VerbType_light, - "end_VerbType": end_VerbType, - "begin_Voice": begin_Voice, - "Voice_act": Voice_act, - "Voice_cau": Voice_cau, - "Voice_pass": Voice_pass, - "Voice_mid": Voice_mid, - "Voice_int": Voice_int, - "end_Voice": end_Voice, -} +FEATURE_NAMES = {get_string_id(name): name for name in FEATURES} +FEATURE_FIELDS = {feature: FIELDS[feature.split('_', 1)[0]] for feature in FEATURES} +for feat_id, name in FEATURE_NAMES.items(): + FEATURE_FIELDS[feat_id] = FEATURE_FIELDS[name] -FIELD_SIZES = [get_field_size(field) for field in FIELDS] - -NAMES = {value: key for key, value in IDS.items()} -# Unfortunate hack here, to work around problem with long cpdef enum -# (which is generating an enormous amount of C++ in Cython 0.24+) -# We keep the enum cdef, and just make sure the names are available to Python -locals().update(IDS) +FIELD_SIZES = Counter(FEATURE_FIELDS.values()) +FEATURE_OFFSETS = {} +FIELD_OFFSETS = {} +_seen_fields = Counter() +for i, feature in enumerate(FEATURES): + field = FEATURE_FIELDS[feature] + FEATURE_OFFSETS[feature] = _seen_fields[field] + if _seen_fields == 0: + FIELD_OFFSETS[field] = i + _seen_fields[field] += 1 diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 820567e71..9f25ba357 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -16,7 +16,7 @@ from ..compat import basestring_ from ..tokens.doc cimport Doc from ..vocab cimport Vocab from ..morphology cimport Morphology -from ..morphology import parse_feature, IDS, FIELDS, FIELD_SIZES, NAMES +from ..morphology import get_field_size, get_field_offset, parse_feature, FIELDS class Morphologizer(Pipe): @@ -27,7 +27,7 @@ class Morphologizer(Pipe): if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'): raise ValueError(TempErrors.T008) if attr_nums is None: - attr_nums = list(FIELD_SIZES) + attr_nums = [get_field_size(name) for name in FIELDS] return build_morphologizer_model(attr_nums, **cfg) def __init__(self, vocab, model=True, **cfg): @@ -76,7 +76,7 @@ class Morphologizer(Pipe): cdef Doc doc cdef Vocab vocab = self.vocab field_names = list(FIELDS) - offsets = [IDS['begin_%s' % field] for field in field_names] + offsets = [get_field_offset(field) for field in field_names] for i, doc in enumerate(docs): doc_scores = batch_scores[i] doc_guesses = scores_to_guesses(doc_scores, self.model.softmax.out_sizes) diff --git a/spacy/structs.pxd b/spacy/structs.pxd index 7452123c0..a4daa9b94 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -2,7 +2,6 @@ from libc.stdint cimport uint8_t, uint32_t, int32_t, uint64_t from .typedefs cimport flags_t, attr_t, hash_t from .parts_of_speech cimport univ_pos_t -from .morphology cimport univ_morph_t cdef struct LexemeC: diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py index 4d4a70e30..b62e69f6c 100644 --- a/spacy/tests/doc/test_retokenize_merge.py +++ 
b/spacy/tests/doc/test_retokenize_merge.py @@ -69,7 +69,6 @@ def test_doc_retokenize_retokenizer_attrs(en_tokenizer): assert doc[4].ent_type_ == "ORG" -@pytest.mark.xfail def test_doc_retokenize_lex_attrs(en_tokenizer): """Test that lexical attributes can be changed (see #2390).""" doc = en_tokenizer("WKRO played beach boys songs") diff --git a/spacy/tests/morphology/test_morph_features.py b/spacy/tests/morphology/test_morph_features.py index 32cc665af..dcb0b32ff 100644 --- a/spacy/tests/morphology/test_morph_features.py +++ b/spacy/tests/morphology/test_morph_features.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals import pytest from ...morphology import Morphology -from ...strings import StringStore +from ...strings import StringStore, get_string_id from ...lemmatizer import Lemmatizer from ...morphology import * @@ -17,14 +17,14 @@ def test_add_morphology_with_string_names(morphology): morphology.add({"Case_gen", "Number_sing"}) def test_add_morphology_with_int_ids(morphology): - morphology.add({Case_gen, Number_sing}) + morphology.add({get_string_id("Case_gen"), get_string_id("Number_sing")}) def test_add_morphology_with_mix_strings_and_ints(morphology): - morphology.add({PunctSide_ini, 'VerbType_aux'}) + morphology.add({get_string_id("PunctSide_ini"), 'VerbType_aux'}) def test_morphology_tags_hash_distinctly(morphology): - tag1 = morphology.add({PunctSide_ini, 'VerbType_aux'}) + tag1 = morphology.add({"PunctSide_ini", 'VerbType_aux'}) tag2 = morphology.add({"Case_gen", 'Number_sing'}) assert tag1 != tag2 diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index df596ceb5..1b60a3271 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -22,6 +22,7 @@ from ..compat import is_config from ..errors import Errors, Warnings, user_warning, models_warning from .. 
import util from .underscore import Underscore, get_ext_args +from .morphanalysis cimport MorphAnalysis cdef class Token: @@ -176,6 +177,10 @@ cdef class Token: def __get__(self): return self.c.morph + property morph: + def __get__(self): + return MorphAnalysis.from_id(self.vocab, self.c.morph) + property lex_id: """RETURNS (int): Sequential ID of the token's lexical type.""" def __get__(self): From 0ad09b16ad7c5686bc22b02603c7bbe126947d81 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 7 Mar 2019 17:24:57 +0100 Subject: [PATCH 054/207] Add header for morphanalysis --- spacy/tokens/morphanalysis.pxd | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 spacy/tokens/morphanalysis.pxd diff --git a/spacy/tokens/morphanalysis.pxd b/spacy/tokens/morphanalysis.pxd new file mode 100644 index 000000000..22844454a --- /dev/null +++ b/spacy/tokens/morphanalysis.pxd @@ -0,0 +1,9 @@ +from ..vocab cimport Vocab +from ..typedefs cimport hash_t +from ..structs cimport MorphAnalysisC + + +cdef class MorphAnalysis: + cdef readonly Vocab vocab + cdef hash_t key + cdef MorphAnalysisC c From e585b5045832d38ffab00ed4b5a39c7d8f2b6e9f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 7 Mar 2019 18:32:09 +0100 Subject: [PATCH 055/207] Fix features in English tag map --- spacy/lang/en/tag_map.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/lang/en/tag_map.py b/spacy/lang/en/tag_map.py index eda4fa1c2..151b42c0c 100644 --- a/spacy/lang/en/tag_map.py +++ b/spacy/lang/en/tag_map.py @@ -17,7 +17,7 @@ TAG_MAP = { "$": {POS: SYM, "Other": {"SymType": "currency"}}, "#": {POS: SYM, "Other": {"SymType": "numbersign"}}, "AFX": {POS: ADJ, "Hyph": "yes"}, - "CC": {POS: CCONJ, "ConjType": "coor"}, + "CC": {POS: CCONJ, "ConjType": "comp"}, "CD": {POS: NUM, "NumType": "card"}, "DT": {POS: DET}, "EX": {POS: ADV, "AdvType": "ex"}, @@ -56,7 +56,7 @@ TAG_MAP = { "VerbForm": "fin", "Tense": "pres", "Number": "sing", - "Person": 3, + "Person": "three", }, "WDT": {POS: ADJ, "PronType": "int,rel"}, "WP": {POS: NOUN, "PronType": "int,rel"}, From 2669190b858af297ffeb28273e8bbe5eb5516194 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 7 Mar 2019 18:32:36 +0100 Subject: [PATCH 056/207] Normalize props for morph exceptions --- spacy/morphology.pyx | 101 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 98 insertions(+), 3 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 1157c2502..4e3ec1cf8 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -15,6 +15,7 @@ from .parts_of_speech import IDS as POS_IDS from .lexeme cimport Lexeme from .errors import Errors + cdef enum univ_field_t: Field_Abbr Field_AdpType @@ -138,6 +139,7 @@ cdef class Morphology: self.exc = {} if exc is not None: for (tag, orth), attrs in exc.items(): + attrs = _normalize_props(attrs) self.add_special_case( self.strings.as_string(tag), self.strings.as_string(orth), attrs) @@ -149,11 +151,13 @@ cdef class Morphology: """Insert a morphological analysis in the morphology table, if not already present. Returns the hash of the new analysis. 
""" + for f in features: + self.strings.add(f) features = intify_features(features) cdef attr_t feature for feature in features: if feature != 0 and feature not in FEATURE_NAMES: - raise KeyError("Unknown feature: %d" % feature) + raise KeyError("Unknown feature: %s" % self.strings[feature]) cdef MorphAnalysisC tag tag = create_rich_tag(features) cdef hash_t key = self.insert(tag) @@ -263,8 +267,7 @@ cdef class Morphology: token.lemma = lemma token.pos = <univ_pos_t>pos token.tag = self.strings[tag_str] - #token.morph = self.add(features) - token.morph = 0 + token.morph = self.add(features) if (self.tag_names[tag_id], token.lex.orth) in self.exc: self._assign_tag_from_exceptions(token, tag_id) @@ -412,9 +415,101 @@ cdef tag_to_json(MorphAnalysisC tag): features.append(FEATURE_NAMES[tag.verb_type]) return features + cdef MorphAnalysisC tag_from_json(json_tag): cdef MorphAnalysisC tag return tag + + +cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil: + if tag.abbr == feature: + return 1 + elif tag.adp_type == feature: + return 1 + elif tag.adv_type == feature: + return 1 + elif tag.animacy == feature: + return 1 + elif tag.aspect == feature: + return 1 + elif tag.case == feature: + return 1 + elif tag.conj_type == feature: + return 1 + elif tag.connegative == feature: + return 1 + elif tag.definite == feature: + return 1 + elif tag.degree == feature: + return 1 + elif tag.derivation == feature: + return 1 + elif tag.echo == feature: + return 1 + elif tag.foreign == feature: + return 1 + elif tag.gender == feature: + return 1 + elif tag.hyph == feature: + return 1 + elif tag.inf_form == feature: + return 1 + elif tag.mood == feature: + return 1 + elif tag.negative == feature: + return 1 + elif tag.number == feature: + return 1 + elif tag.name_type == feature: + return 1 + elif tag.noun_type == feature: + return 1 + elif tag.num_form == feature: + return 1 + elif tag.num_type == feature: + return 1 + elif tag.num_value == feature: + return 1 + elif tag.part_form == feature: + return 1 + elif tag.part_type == feature: + return 1 + elif tag.person == feature: + return 1 + elif tag.polite == feature: + return 1 + elif tag.polarity == feature: + return 1 + elif tag.poss == feature: + return 1 + elif tag.prefix == feature: + return 1 + elif tag.prep_case == feature: + return 1 + elif tag.pron_type == feature: + return 1 + elif tag.punct_side == feature: + return 1 + elif tag.punct_type == feature: + return 1 + elif tag.reflex == feature: + return 1 + elif tag.style == feature: + return 1 + elif tag.style_variant == feature: + return 1 + elif tag.tense == feature: + return 1 + elif tag.typo == feature: + return 1 + elif tag.verb_form == feature: + return 1 + elif tag.voice == feature: + return 1 + elif tag.verb_type == feature: + return 1 + else: + return 0 cdef int set_feature(MorphAnalysisC* tag, univ_field_t field, attr_t feature, int value) except -1: From 357066ee2f40d00369b53e793f2e36c4b5df5041 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 7 Mar 2019 18:32:51 +0100 Subject: [PATCH 057/207] Work on morphanalysis class --- spacy/tokens/morphanalysis.pyx | 140 ++++++++++++++++++--------------- 1 file changed, 78 insertions(+), 62 deletions(-) diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx index 01ecf458b..11e65c19f 100644 --- a/spacy/tokens/morphanalysis.pyx +++ b/spacy/tokens/morphanalysis.pyx @@ -1,5 +1,10 @@ +from libc.string cimport memset + from ..vocab cimport Vocab -from ..typedefs cimport hash_t +from 
..typedefs cimport hash_t, attr_t +from ..morphology cimport check_feature, tag_to_json + +from ..strings import get_string_id cdef class MorphAnalysis: @@ -8,223 +13,234 @@ cdef class MorphAnalysis: self.vocab = vocab self.key = self.vocab.morphology.add(features) analysis = <const MorphAnalysisC*>self.vocab.morphology.tags.get(self.key) - self.c = analysis[0] + if analysis is not NULL: + self.c = analysis[0] + else: + memset(&self.c, 0, sizeof(self.c)) @classmethod - def from_id(self, Vocab vocab, hash_t key): - pass + def from_id(cls, Vocab vocab, hash_t key): + cdef MorphAnalysis morph = MorphAnalysis.__new__(MorphAnalysis, vocab) + morph.key = key + analysis = <const MorphAnalysisC*>vocab.morphology.tags.get(key) + if analysis is not NULL: + morph.c = analysis[0] + else: + memset(&morph.c, 0, sizeof(morph.c)) + return morph def __contains__(self, feature): - pass + cdef attr_t feat_id = get_string_id(feature) + return check_feature(&self.c, feat_id) def __iter__(self): - pass + raise NotImplementedError def __len__(self): - pass + raise NotImplementedError def __str__(self): - pass + raise NotImplementedError def __repr__(self): - pass + raise NotImplementedError def __hash__(self): - pass + raise NotImplementedError - def get(self, name): - pass + def get(self, field): + raise NotImplementedError def to_json(self): - pass + return tag_to_json(self.c) @property def is_base_form(self): - pass + raise NotImplementedError @property def pos(self): - pass + return self.c.pos @property def pos_(self): - pass + return self.vocab.strings[self.c.pos] - @property - def id(self): - pass + property id: + def __get__(self): + return self.key property abbr: def __get__(self): - pass + return self.c.abbr property adp_type: def __get__(self): - pass + return self.c.adp_type property adv_type: def __get__(self): - pass + return self.c.adv_type property animacy: def __get__(self): - pass + return self.c.animacy property aspect: def __get__(self): - pass + return self.c.aspect property case: def __get__(self): - pass + return self.c.case property conj_type: def __get__(self): - pass + return self.c.conj_type property connegative: def __get__(self): - pass + return self.c.connegative property definite: def __get__(self): - pass + return self.c.definite property degree: def __get__(self): - pass + return self.c.degree property derivation: def __get__(self): - pass + return self.c.derivation property echo: def __get__(self): - pass + return self.c.echo property foreign: def __get__(self): - pass + return self.c.foreign property gender: def __get__(self): - pass + return self.c.gender property hyph: def __get__(self): - pass + return self.c.hyph property inf_form: def __get__(self): - pass + return self.c.inf_form property mood: def __get__(self): - pass + return self.c.mood property name_type: def __get__(self): - pass + return self.c.name_type property negative: def __get__(self): - pass + return self.c.negative property noun_type: def __get__(self): - pass + return self.c.noun_type property number: def __get__(self): - pass + return self.c.number property num_form: def __get__(self): - pass + return self.c.num_form property num_type: def __get__(self): - pass + return self.c.num_type property num_value: def __get__(self): - pass + return self.c.num_value property part_form: def __get__(self): - pass + return self.c.part_form property part_type: def __get__(self): - pass + return self.c.part_type property person: def __get__(self): - pass + return self.c.person property polite: def __get__(self): - pass + 
return self.c.polite property polarity: def __get__(self): - pass + return self.c.polarity property poss: def __get__(self): - pass + return self.c.poss property prefix: def __get__(self): - pass + return self.c.prefix property prep_case: def __get__(self): - pass + return self.c.prep_case property pron_type: def __get__(self): - pass + return self.c.pron_type property punct_side: def __get__(self): - pass + return self.c.punct_side property punct_type: def __get__(self): - pass + return self.c.punct_type property reflex: def __get__(self): - pass + return self.c.reflex property style: def __get__(self): - pass + return self.c.style property style_variant: def __get__(self): - pass + return self.c.style_variant property tense: def __get__(self): - pass + return self.c.tense property typo: def __get__(self): - pass + return self.c.typo property verb_form: def __get__(self): - pass + return self.c.verb_form property voice: def __get__(self): - pass + return self.c.voice property verb_type: def __get__(self): - pass + return self.c.verb_type property abbr_: def __get__(self): From c1888b05d2c57430c5cce9a197f408cc3c250b33 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 7 Mar 2019 18:33:06 +0100 Subject: [PATCH 058/207] Export helper functions for morphology --- spacy/morphology.pxd | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index a057e8ed8..30c29c1c7 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -31,3 +31,8 @@ cdef class Morphology: cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1 cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1 + + +cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil + +cdef tag_to_json(MorphAnalysisC tag) From 1a10bf29bcafa8676a84c8178a16c8db87e8a6fb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 7 Mar 2019 18:33:17 +0100 Subject: [PATCH 059/207] Remove morph_key from token api --- spacy/tokens/token.pyx | 4 ---- 1 file changed, 4 deletions(-) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 1b60a3271..7249a2b60 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -173,10 +173,6 @@ cdef class Token: return (numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)) - property morph_key: - def __get__(self): - return self.c.morph - property morph: def __get__(self): return MorphAnalysis.from_id(self.vocab, self.c.morph) From 3a667833d1a37ffa826ebb3ca76b057c1ce712e3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 7 Mar 2019 21:57:43 +0100 Subject: [PATCH 060/207] Fix morphological features in de tag_map --- spacy/lang/de/tag_map.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/lang/de/tag_map.py b/spacy/lang/de/tag_map.py index 3bb6247c4..394478145 100644 --- a/spacy/lang/de/tag_map.py +++ b/spacy/lang/de/tag_map.py @@ -10,7 +10,7 @@ TAG_MAP = { "$,": {POS: PUNCT, "PunctType": "comm"}, "$.": {POS: PUNCT, "PunctType": "peri"}, "ADJA": {POS: ADJ}, - "ADJD": {POS: ADJ, "Variant": "short"}, + "ADJD": {POS: ADJ}, "ADV": {POS: ADV}, "APPO": {POS: ADP, "AdpType": "post"}, "APPR": {POS: ADP, "AdpType": "prep"}, @@ -32,7 +32,7 @@ TAG_MAP = { "PDAT": {POS: DET, "PronType": "dem"}, "PDS": {POS: PRON, "PronType": "dem"}, "PIAT": {POS: DET, "PronType": "ind|neg|tot"}, - "PIDAT": {POS: DET, "AdjType": "pdt", "PronType": "ind|neg|tot"}, + "PIDAT": {POS: DET, 
"PronType": "ind|neg|tot"}, "PIS": {POS: PRON, "PronType": "ind|neg|tot"}, "PPER": {POS: PRON, "PronType": "prs"}, "PPOSAT": {POS: DET, "Poss": "yes", "PronType": "prs"}, @@ -42,7 +42,7 @@ TAG_MAP = { "PRF": {POS: PRON, "PronType": "prs", "Reflex": "yes"}, "PTKA": {POS: PART}, "PTKANT": {POS: PART, "PartType": "res"}, - "PTKNEG": {POS: PART, "Polarity": "Neg"}, + "PTKNEG": {POS: PART, "Polarity": "neg"}, "PTKVZ": {POS: PART, "PartType": "vbp"}, "PTKZU": {POS: PART, "PartType": "inf"}, "PWAT": {POS: DET, "PronType": "int"}, From 7afe56a3602e4a7876f5c87ac5b4f3531fd60487 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 7 Mar 2019 21:57:56 +0100 Subject: [PATCH 061/207] Fix morphological features in en tag_map --- spacy/lang/en/tag_map.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/lang/en/tag_map.py b/spacy/lang/en/tag_map.py index 151b42c0c..88fc2db3e 100644 --- a/spacy/lang/en/tag_map.py +++ b/spacy/lang/en/tag_map.py @@ -58,10 +58,10 @@ TAG_MAP = { "Number": "sing", "Person": "three", }, - "WDT": {POS: ADJ, "PronType": "int,rel"}, - "WP": {POS: NOUN, "PronType": "int,rel"}, - "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int,rel"}, - "WRB": {POS: ADV, "PronType": "int,rel"}, + "WDT": {POS: ADJ, "PronType": "rel"}, + "WP": {POS: NOUN, "PronType": "rel"}, + "WP$": {POS: ADJ, "Poss": "yes", "PronType": "rel"}, + "WRB": {POS: ADV, "PronType": "rel"}, "ADD": {POS: X}, "NFP": {POS: PUNCT}, "GW": {POS: X}, From 00cfadbf63ca59354c2f57a5b28e85d00097a190 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 7 Mar 2019 21:58:16 +0100 Subject: [PATCH 062/207] Fix obsolete data in English tokenizer exceptions --- spacy/lang/en/tokenizer_exceptions.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py index 5063319a6..3c750d627 100644 --- a/spacy/lang/en/tokenizer_exceptions.py +++ b/spacy/lang/en/tokenizer_exceptions.py @@ -35,8 +35,6 @@ for pron in ["i"]: LEMMA: "be", NORM: "am", TAG: "VBP", - "tenspect": 1, - "number": 1, }, ] From 987ee6e884c21d129d8cc66b04c167fa243ca77b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 7 Mar 2019 21:58:43 +0100 Subject: [PATCH 063/207] Fix data reading in morphology --- spacy/morphology.pyx | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 4e3ec1cf8..d169c6d31 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -6,6 +6,7 @@ from libc.string cimport memset import srsly from collections import Counter +from .compat import basestring_ from .strings import get_string_id from . 
import symbols from .attrs cimport POS, IS_SPACE @@ -68,7 +69,8 @@ def _normalize_props(props): props = dict(props) for key in FIELDS: if key in props: - attr = '%s_%s' % (key, props[key]) + value = str(props[key]).lower() + attr = '%s_%s' % (key, value) if attr in FEATURES: props.pop(key) props[attr] = True @@ -81,9 +83,11 @@ def _normalize_props(props): out[key] = value elif isinstance(key, int): out[key] = value + elif value is True: + out[key] = value elif key.lower() == 'pos': out[POS] = POS_IDS[value.upper()] - else: + elif key.lower() != 'morph': out[key] = value return out @@ -132,6 +136,7 @@ cdef class Morphology: self.reverse_index = {} for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): attrs = _normalize_props(attrs) + self.add({FEATURE_NAMES[feat] for feat in attrs if feat in FEATURE_NAMES}) self.tag_map[tag_str] = dict(attrs) self.reverse_index[self.strings.add(tag_str)] = i @@ -152,7 +157,8 @@ cdef class Morphology: present. Returns the hash of the new analysis. """ for f in features: - self.strings.add(f) + if isinstance(f, basestring_): + self.strings.add(f) features = intify_features(features) cdef attr_t feature for feature in features: @@ -213,6 +219,7 @@ cdef class Morphology: """ attrs = dict(attrs) attrs = _normalize_props(attrs) + self.add({FEATURE_NAMES[feat] for feat in attrs if feat in FEATURE_NAMES}) attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) self.exc[(tag_str, self.strings.add(orth_str))] = attrs @@ -659,7 +666,7 @@ FEATURES = [ "Abbr_yes", "AdpType_circ", "AdpType_comprep", - "AdpType_prep ", + "AdpType_prep", "AdpType_post", "AdpType_voc", "AdvType_adadj," From dd9ea478c5f4661d2e058e01a91b3471c701de41 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 7 Mar 2019 21:59:03 +0100 Subject: [PATCH 064/207] Fix intify_attrs function for obsolete data --- spacy/attrs.pyx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index ed1f39a3f..f441007fe 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -143,8 +143,12 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): for name, value in stringy_attrs.items(): if isinstance(name, int): int_key = name - else: + elif name in IDS: + int_key = IDS[name] + elif name.upper() in IDS: int_key = IDS[name.upper()] + else: + continue if strings_map is not None and isinstance(value, basestring): if hasattr(strings_map, 'add'): value = strings_map.add(value) From a40d73cb2ad2320b3ef06ded55f997bca31877be Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 7 Mar 2019 21:59:25 +0100 Subject: [PATCH 065/207] Build out morphological analysis API --- spacy/tokens/morphanalysis.pyx | 84 +++++++++++++++++----------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx index 11e65c19f..59ce6daa0 100644 --- a/spacy/tokens/morphanalysis.pyx +++ b/spacy/tokens/morphanalysis.pyx @@ -244,168 +244,168 @@ cdef class MorphAnalysis: property abbr_: def __get__(self): - pass + return self.vocab.strings[self.c.abbr_] property adp_type_: def __get__(self): - pass + return self.vocab.strings[self.c.adp_type_] property adv_type_: def __get__(self): - pass + return self.vocab.strings[self.c.adv_type_] property animacy_: def __get__(self): - pass + return self.vocab.strings[self.c.animacy_] property aspect_: def __get__(self): - pass + return self.vocab.strings[self.c.aspect_] property case_: def 
__get__(self): - pass + return self.vocab.strings[self.c.case_] property conj_type_: def __get__(self): - pass + return self.vocab.strings[self.c.conj_type_] property connegative_: def __get__(self): - pass + return self.vocab.strings[self.c.connegative_] property definite_: def __get__(self): - pass + return self.vocab.strings[self.c.definite_] property degree_: def __get__(self): - pass + return self.vocab.strings[self.c.degree_] property derivation_: def __get__(self): - pass + return self.vocab.strings[self.c.derivation_] property echo_: def __get__(self): - pass + return self.vocab.strings[self.c.echo_] property foreign_: def __get__(self): - pass + return self.vocab.strings[self.c.foreign_] property gender_: def __get__(self): - pass + return self.vocab.strings[self.c.gender_] property hyph_: def __get__(self): - pass + return self.vocab.strings[self.c.hyph_] property inf_form_: def __get__(self): - pass + return self.vocab.strings[self.c.inf_form_] property name_type_: def __get__(self): - pass + return self.vocab.strings[self.c.name_type_] property negative_: def __get__(self): - pass + return self.vocab.strings[self.c.negative_] property mood_: def __get__(self): - pass + return self.vocab.strings[self.c.mood_] property number_: def __get__(self): - pass + return self.vocab.strings[self.c.number_] property num_form_: def __get__(self): - pass + return self.vocab.strings[self.c.num_form_] property num_type_: def __get__(self): - pass + return self.vocab.strings[self.c.num_type_] property num_value_: def __get__(self): - pass + return self.vocab.strings[self.c.num_value_] property part_form_: def __get__(self): - pass + return self.vocab.strings[self.c.part_form_] property part_type_: def __get__(self): - pass + return self.vocab.strings[self.c.part_type_] property person_: def __get__(self): - pass + return self.vocab.strings[self.c.person_] property polite_: def __get__(self): - pass + return self.vocab.strings[self.c.polite_] property polarity_: def __get__(self): - pass + return self.vocab.strings[self.c.polarity_] property poss_: def __get__(self): - pass + return self.vocab.strings[self.c.poss_] property prefix_: def __get__(self): - pass + return self.vocab.strings[self.c.prefix_] property prep_case_: def __get__(self): - pass + return self.vocab.strings[self.c.prep_case_] property pron_type_: def __get__(self): - pass + return self.vocab.strings[self.c.pron_type_] property punct_side_: def __get__(self): - pass + return self.vocab.strings[self.c.punct_side_] property punct_type_: def __get__(self): - pass + return self.vocab.strings[self.c.punct_type_] property reflex_: def __get__(self): - pass + return self.vocab.strings[self.c.reflex_] property style_: def __get__(self): - pass + return self.vocab.strings[self.c.style_] property style_variant_: def __get__(self): - pass + return self.vocab.strings[self.c.style_variant_] property tense_: def __get__(self): - pass + return self.vocab.strings[self.c.tense_] property typo_: def __get__(self): - pass + return self.vocab.strings[self.c.typo_] property verb_form_: def __get__(self): - pass + return self.vocab.strings[self.c.verb_form_] property voice_: def __get__(self): - pass + return self.vocab.strings[self.c.voice_] property verb_type_: def __get__(self): - pass + return self.vocab.strings[self.c.verb_type_] From b5f2b7b454604cf1fe6bdaf11f977cb57a8a9e11 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Fri, 8 Mar 2019 00:08:35 +0100 Subject: [PATCH 066/207] Add list_features() helper, clean up --- 
spacy/morphology.pxd | 4 +- spacy/morphology.pyx | 276 +++++++++++++++++++++++++++++-------------- 2 files changed, 188 insertions(+), 92 deletions(-) diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 30c29c1c7..0001b9eb9 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -34,5 +34,7 @@ cdef class Morphology: cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil +cdef attr_t get_field(const MorphAnalysisC* tag, int field) nogil +cdef list list_features(const MorphAnalysisC* tag) -cdef tag_to_json(MorphAnalysisC tag) +cdef tag_to_json(const MorphAnalysisC* tag) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index d169c6d31..fa8245f47 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -174,7 +174,7 @@ cdef class Morphology: if tag == NULL: return [] else: - return tag_to_json(tag[0]) + return tag_to_json(tag) cpdef update(self, hash_t morph, features): """Update a morphological analysis with new feature values.""" @@ -296,7 +296,7 @@ cdef class Morphology: for key in self.tags: tag_ptr = <MorphAnalysisC*>self.tags.get(key) if tag_ptr != NULL: - json_tags.append(tag_to_json(tag_ptr[0])) + json_tags.append(tag_to_json(tag_ptr)) return srsly.json_dumps(json_tags) def from_bytes(self, byte_string): @@ -334,98 +334,186 @@ cdef MorphAnalysisC create_rich_tag(features) except *: return tag -cdef tag_to_json(MorphAnalysisC tag): - features = [] - if tag.abbr != 0: - features.append(FEATURE_NAMES[tag.abbr]) - if tag.adp_type != 0: - features.append(FEATURE_NAMES[tag.adp_type]) - if tag.adv_type != 0: - features.append(FEATURE_NAMES[tag.adv_type]) - if tag.animacy != 0: - features.append(FEATURE_NAMES[tag.animacy]) - if tag.aspect != 0: - features.append(FEATURE_NAMES[tag.aspect]) - if tag.case != 0: - features.append(FEATURE_NAMES[tag.case]) - if tag.conj_type != 0: - features.append(FEATURE_NAMES[tag.conj_type]) - if tag.connegative != 0: - features.append(FEATURE_NAMES[tag.connegative]) - if tag.definite != 0: - features.append(FEATURE_NAMES[tag.definite]) - if tag.degree != 0: - features.append(FEATURE_NAMES[tag.degree]) - if tag.derivation != 0: - features.append(FEATURE_NAMES[tag.derivation]) - if tag.echo != 0: - features.append(FEATURE_NAMES[tag.echo]) - if tag.foreign != 0: - features.append(FEATURE_NAMES[tag.foreign]) - if tag.gender != 0: - features.append(FEATURE_NAMES[tag.gender]) - if tag.hyph != 0: - features.append(FEATURE_NAMES[tag.hyph]) - if tag.inf_form != 0: - features.append(FEATURE_NAMES[tag.inf_form]) - if tag.mood != 0: - features.append(FEATURE_NAMES[tag.mood]) - if tag.negative != 0: - features.append(FEATURE_NAMES[tag.negative]) - if tag.number != 0: - features.append(FEATURE_NAMES[tag.number]) - if tag.name_type != 0: - features.append(FEATURE_NAMES[tag.name_type]) - if tag.noun_type != 0: - features.append(FEATURE_NAMES[tag.noun_type]) - if tag.num_form != 0: - features.append(FEATURE_NAMES[tag.num_form]) - if tag.num_type != 0: - features.append(FEATURE_NAMES[tag.num_type]) - if tag.num_value != 0: - features.append(FEATURE_NAMES[tag.num_value]) - if tag.part_form != 0: - features.append(FEATURE_NAMES[tag.part_form]) - if tag.part_type != 0: - features.append(FEATURE_NAMES[tag.part_type]) - if tag.person != 0: - features.append(FEATURE_NAMES[tag.person]) - if tag.polite != 0: - features.append(FEATURE_NAMES[tag.polite]) - if tag.polarity != 0: - features.append(FEATURE_NAMES[tag.polarity]) - if tag.poss != 0: - features.append(FEATURE_NAMES[tag.poss]) - if tag.prefix != 0: - 
features.append(FEATURE_NAMES[tag.prefix]) - if tag.prep_case != 0: - features.append(FEATURE_NAMES[tag.prep_case]) - if tag.pron_type != 0: - features.append(FEATURE_NAMES[tag.pron_type]) - if tag.punct_side != 0: - features.append(FEATURE_NAMES[tag.punct_side]) - if tag.punct_type != 0: - features.append(FEATURE_NAMES[tag.punct_type]) - if tag.reflex != 0: - features.append(FEATURE_NAMES[tag.reflex]) - if tag.style != 0: - features.append(FEATURE_NAMES[tag.style]) - if tag.style_variant != 0: - features.append(FEATURE_NAMES[tag.style_variant]) - if tag.tense != 0: - features.append(FEATURE_NAMES[tag.tense]) - if tag.verb_form != 0: - features.append(FEATURE_NAMES[tag.verb_form]) - if tag.voice != 0: - features.append(FEATURE_NAMES[tag.voice]) - if tag.verb_type != 0: - features.append(FEATURE_NAMES[tag.verb_type]) - return features +cdef tag_to_json(const MorphAnalysisC* tag): + return [FEATURE_NAMES[f] for f in list_features(tag)] cdef MorphAnalysisC tag_from_json(json_tag): - cdef MorphAnalysisC tag - return tag + raise NotImplementedError + + +cdef list list_features(const MorphAnalysisC* tag): + output = [] + if tag.abbr != 0: + output.append(tag.abbr) + if tag.adp_type != 0: + output.append(tag.adp_type) + if tag.adv_type != 0: + output.append(tag.adv_type) + if tag.animacy != 0: + output.append(tag.animacy) + if tag.aspect != 0: + output.append(tag.aspect) + if tag.case != 0: + output.append(tag.case) + if tag.conj_type != 0: + output.append(tag.conj_type) + if tag.connegative != 0: + output.append(tag.connegative) + if tag.definite != 0: + output.append(tag.definite) + if tag.degree != 0: + output.append(tag.degree) + if tag.derivation != 0: + output.append(tag.derivation) + if tag.echo != 0: + output.append(tag.echo) + if tag.foreign != 0: + output.append(tag.foreign) + if tag.gender != 0: + output.append(tag.gender) + if tag.hyph != 0: + output.append(tag.hyph) + if tag.inf_form != 0: + output.append(tag.inf_form) + if tag.mood != 0: + output.append(tag.mood) + if tag.negative != 0: + output.append(tag.negative) + if tag.number != 0: + output.append(tag.number) + if tag.name_type != 0: + output.append(tag.name_type) + if tag.noun_type != 0: + output.append(tag.noun_type) + if tag.part_form != 0: + output.append(tag.part_form) + if tag.part_type != 0: + output.append(tag.part_type) + if tag.person != 0: + output.append(tag.person) + if tag.polite != 0: + output.append(tag.polite) + if tag.polarity != 0: + output.append(tag.polarity) + if tag.poss != 0: + output.append(tag.poss) + if tag.prefix != 0: + output.append(tag.prefix) + if tag.prep_case != 0: + output.append(tag.prep_case) + if tag.pron_type != 0: + output.append(tag.pron_type) + if tag.punct_type != 0: + output.append(tag.punct_type) + if tag.reflex != 0: + output.append(tag.reflex) + if tag.style != 0: + output.append(tag.style) + if tag.style_variant != 0: + output.append(tag.style_variant) + if tag.typo != 0: + output.append(tag.typo) + if tag.verb_form != 0: + output.append(tag.verb_form) + if tag.voice != 0: + output.append(tag.voice) + if tag.verb_type != 0: + output.append(tag.verb_type) + return output + + +cdef attr_t get_field(const MorphAnalysisC* tag, int field_id) nogil: + field = <univ_field_t>field_id + if field == Field_Abbr: + return tag.abbr + elif field == Field_AdpType: + return tag.adp_type + elif field == Field_AdvType: + return tag.adv_type + elif field == Field_Animacy: + return tag.animacy + elif field == Field_Aspect: + return tag.aspect + elif field == Field_Case: + return tag.case + elif 
field == Field_ConjType: + return tag.conj_type + elif field == Field_Connegative: + return tag.connegative + elif field == Field_Definite: + return tag.definite + elif field == Field_Degree: + return tag.degree + elif field == Field_Derivation: + return tag.derivation + elif field == Field_Echo: + return tag.echo + elif field == Field_Foreign: + return tag.foreign + elif field == Field_Gender: + return tag.gender + elif field == Field_Hyph: + return tag.hyph + elif field == Field_InfForm: + return tag.inf_form + elif field == Field_Mood: + return tag.mood + elif field == Field_Negative: + return tag.negative + elif field == Field_Number: + return tag.number + elif field == Field_NameType: + return tag.name_type + elif field == Field_NounType: + return tag.noun_type + elif field == Field_NumForm: + return tag.num_form + elif field == Field_NumType: + return tag.num_type + elif field == Field_NumValue: + return tag.num_value + elif field == Field_PartForm: + return tag.part_form + elif field == Field_PartType: + return tag.part_type + elif field == Field_Person: + return tag.person + elif field == Field_Polite: + return tag.polite + elif field == Field_Polarity: + return tag.polarity + elif field == Field_Poss: + return tag.poss + elif field == Field_Prefix: + return tag.prefix + elif field == Field_PrepCase: + return tag.prep_case + elif field == Field_PronType: + return tag.pron_type + elif field == Field_PunctSide: + return tag.punct_side + elif field == Field_PunctType: + return tag.punct_type + elif field == Field_Reflex: + return tag.reflex + elif field == Field_Style: + return tag.style + elif field == Field_StyleVariant: + return tag.style_variant + elif field == Field_Tense: + return tag.tense + elif field == Field_Typo: + return tag.typo + elif field == Field_VerbForm: + return tag.verb_form + elif field == Field_Voice: + return tag.voice + elif field == Field_VerbType: + return tag.verb_type + else: + raise ValueError("Unknown feature: %s (%d)" % (FEATURE_NAMES.get(feature), feature)) + cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil: @@ -524,6 +612,11 @@ cdef int set_feature(MorphAnalysisC* tag, value_ = feature else: value_ = 0 + prev_value = get_field(tag, field) + if prev_value != 0 and value_ == 0: + tag.length -= 1 + elif prev_value == 0 and value_ != 0: + tag.length += 1 if feature == 0: pass elif field == Field_Abbr: @@ -616,6 +709,7 @@ cdef int set_feature(MorphAnalysisC* tag, raise ValueError("Unknown feature: %s (%d)" % (FEATURE_NAMES.get(feature), feature)) + FIELDS = { 'Abbr': Field_Abbr, 'AdpType': Field_AdpType, From 9a2d1cc6e05f9d6fbd431b99d9f46d786db4b153 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Fri, 8 Mar 2019 00:08:57 +0100 Subject: [PATCH 067/207] Add length attribute to MorphAnalysisC --- spacy/structs.pxd | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/structs.pxd b/spacy/structs.pxd index a4daa9b94..bf7dc0d7a 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -75,6 +75,7 @@ cdef struct TokenC: cdef struct MorphAnalysisC: univ_pos_t pos + int length attr_t abbr attr_t adp_type From 3300e3d7abbd22f09f55104aba46fa47dbd52054 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Fri, 8 Mar 2019 00:09:16 +0100 Subject: [PATCH 068/207] Implement more MorphAnalysis API --- spacy/tokens/morphanalysis.pyx | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx index 
59ce6daa0..c9d915599 100644 --- a/spacy/tokens/morphanalysis.pyx +++ b/spacy/tokens/morphanalysis.pyx @@ -2,7 +2,7 @@ from libc.string cimport memset from ..vocab cimport Vocab from ..typedefs cimport hash_t, attr_t -from ..morphology cimport check_feature, tag_to_json +from ..morphology cimport list_features, check_feature, tag_to_json from ..strings import get_string_id @@ -21,6 +21,7 @@ cdef class MorphAnalysis: @classmethod def from_id(cls, Vocab vocab, hash_t key): cdef MorphAnalysis morph = MorphAnalysis.__new__(MorphAnalysis, vocab) + morph.vocab = vocab morph.key = key analysis = <const MorphAnalysisC*>vocab.morphology.tags.get(key) if analysis is not NULL: @@ -34,25 +35,27 @@ cdef class MorphAnalysis: return check_feature(&self.c, feat_id) def __iter__(self): - raise NotImplementedError + cdef attr_t feature + for feature in list_features(&self.c): + yield self.vocab.strings[feature] def __len__(self): - raise NotImplementedError + return self.c.length def __str__(self): - raise NotImplementedError + return self.to_json() def __repr__(self): - raise NotImplementedError + return self.to_json() def __hash__(self): - raise NotImplementedError + return self.key def get(self, field): raise NotImplementedError def to_json(self): - return tag_to_json(self.c) + return tag_to_json(&self.c) @property def is_base_form(self): From 3c3259024310e3ed28bc71c8bb6fa60d608a049c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Fri, 8 Mar 2019 00:10:07 +0100 Subject: [PATCH 069/207] Add test for morph analysis --- spacy/tests/doc/test_morphanalysis.py | 30 +++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 spacy/tests/doc/test_morphanalysis.py diff --git a/spacy/tests/doc/test_morphanalysis.py b/spacy/tests/doc/test_morphanalysis.py new file mode 100644 index 000000000..31c765e32 --- /dev/null +++ b/spacy/tests/doc/test_morphanalysis.py @@ -0,0 +1,30 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest +import numpy +from spacy.attrs import IS_ALPHA, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_TITLE, IS_STOP +from spacy.symbols import VERB +from spacy.vocab import Vocab +from spacy.tokens import Doc + +@pytest.fixture +def i_has(en_tokenizer): + doc = en_tokenizer("I has") + doc[0].tag_ = "PRP" + doc[1].tag_ = "VBZ" + return doc + +def test_token_morph_id(i_has): + assert i_has[0].morph.id + assert i_has[1].morph.id != 0 + assert i_has[0].morph.id != i_has[1].morph.id + +def test_morph_props(i_has): + assert i_has[0].morph.pron_type == i_has.vocab.strings["PronType_prs"] + assert i_has[1].morph.pron_type == 0 + + +def test_morph_iter(i_has): + assert list(i_has[0].morph) == ["PronType_prs"] + assert list(i_has[1].morph) == ["Number_sing", "Person_three", "VerbForm_fin"] From 322b64dca08964f97c1d618012921b89dcc1f355 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Fri, 8 Mar 2019 01:38:15 +0100 Subject: [PATCH 070/207] Allow lookup of morphology by attribute name --- spacy/morphology.pxd | 1 + spacy/morphology.pyx | 52 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 0001b9eb9..cb708166c 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -36,5 +36,6 @@ cdef class Morphology: cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil cdef attr_t get_field(const MorphAnalysisC* tag, int field) nogil cdef list list_features(const MorphAnalysisC* tag) +cdef int 
attribute_to_field(unicode attribute) cdef tag_to_json(const MorphAnalysisC* tag) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index fa8245f47..63d0291ff 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -98,6 +98,10 @@ def parse_feature(feature): return (field, offset) +cdef int attribute_to_field(unicode attribute_name): + return LOWER_FIELDS[attribute_name] + + def get_field_id(feature): return FEATURE_FIELDS[feature] @@ -709,7 +713,6 @@ cdef int set_feature(MorphAnalysisC* tag, raise ValueError("Unknown feature: %s (%d)" % (FEATURE_NAMES.get(feature), feature)) - FIELDS = { 'Abbr': Field_Abbr, 'AdpType': Field_AdpType, @@ -756,6 +759,53 @@ FIELDS = { 'VerbType': Field_VerbType } +LOWER_FIELDS = { + 'abbr': Field_Abbr, + 'adp_type': Field_AdpType, + 'adv_type': Field_AdvType, + 'animacy': Field_Animacy, + 'aspect': Field_Aspect, + 'case': Field_Case, + 'conj_type': Field_ConjType, + 'connegative': Field_Connegative, + 'definite': Field_Definite, + 'degree': Field_Degree, + 'derivation': Field_Derivation, + 'echo': Field_Echo, + 'foreign': Field_Foreign, + 'gender': Field_Gender, + 'hyph': Field_Hyph, + 'inf_form': Field_InfForm, + 'mood': Field_Mood, + 'name_type': Field_NameType, + 'negative': Field_Negative, + 'noun_type': Field_NounType, + 'number': Field_Number, + 'num_form': Field_NumForm, + 'num_type': Field_NumType, + 'num_value': Field_NumValue, + 'part_form': Field_PartForm, + 'part_type': Field_PartType, + 'person': Field_Person, + 'polite': Field_Polite, + 'polarity': Field_Polarity, + 'poss': Field_Poss, + 'prefix': Field_Prefix, + 'prep_case': Field_PrepCase, + 'pron_type': Field_PronType, + 'punct_side': Field_PunctSide, + 'punct_type': Field_PunctType, + 'reflex': Field_Reflex, + 'style': Field_Style, + 'style_variant': Field_StyleVariant, + 'tense': Field_Tense, + 'typo': Field_Typo, + 'verb_form': Field_VerbForm, + 'voice': Field_Voice, + 'verb_type': Field_VerbType +} + + FEATURES = [ "Abbr_yes", "AdpType_circ", From 9dceb97570812e5e82d380f5948a472f53e04699 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Fri, 8 Mar 2019 01:38:34 +0100 Subject: [PATCH 071/207] Extend morphanalysis API --- spacy/tokens/morphanalysis.pyx | 92 +++++++++++++++++----------------- 1 file changed, 47 insertions(+), 45 deletions(-) diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx index c9d915599..b727e2c3f 100644 --- a/spacy/tokens/morphanalysis.pyx +++ b/spacy/tokens/morphanalysis.pyx @@ -2,7 +2,8 @@ from libc.string cimport memset from ..vocab cimport Vocab from ..typedefs cimport hash_t, attr_t -from ..morphology cimport list_features, check_feature, tag_to_json +from ..morphology cimport list_features, check_feature, get_field, tag_to_json +from ..morphology cimport attribute_to_field from ..strings import get_string_id @@ -51,8 +52,9 @@ cdef class MorphAnalysis: def __hash__(self): return self.key - def get(self, field): - raise NotImplementedError + def get(self, unicode field): + cdef int field_id = attribute_to_field(field) + return self.vocab.strings[get_field(&self.c, field_id)] def to_json(self): return tag_to_json(&self.c) @@ -247,168 +249,168 @@ cdef class MorphAnalysis: property abbr_: def __get__(self): - return self.vocab.strings[self.c.abbr_] + return self.vocab.strings[self.c.abbr] property adp_type_: def __get__(self): - return self.vocab.strings[self.c.adp_type_] + return self.vocab.strings[self.c.adp_type] property adv_type_: def __get__(self): - return self.vocab.strings[self.c.adv_type_] + 
return self.vocab.strings[self.c.adv_type] property animacy_: def __get__(self): - return self.vocab.strings[self.c.animacy_] + return self.vocab.strings[self.c.animacy] property aspect_: def __get__(self): - return self.vocab.strings[self.c.aspect_] + return self.vocab.strings[self.c.aspect] property case_: def __get__(self): - return self.vocab.strings[self.c.case_] + return self.vocab.strings[self.c.case] property conj_type_: def __get__(self): - return self.vocab.strings[self.c.conj_type_] + return self.vocab.strings[self.c.conj_type] property connegative_: def __get__(self): - return self.vocab.strings[self.c.connegative_] + return self.vocab.strings[self.c.connegative] property definite_: def __get__(self): - return self.vocab.strings[self.c.definite_] + return self.vocab.strings[self.c.definite] property degree_: def __get__(self): - return self.vocab.strings[self.c.degree_] + return self.vocab.strings[self.c.degree] property derivation_: def __get__(self): - return self.vocab.strings[self.c.derivation_] + return self.vocab.strings[self.c.derivation] property echo_: def __get__(self): - return self.vocab.strings[self.c.echo_] + return self.vocab.strings[self.c.echo] property foreign_: def __get__(self): - return self.vocab.strings[self.c.foreign_] + return self.vocab.strings[self.c.foreign] property gender_: def __get__(self): - return self.vocab.strings[self.c.gender_] + return self.vocab.strings[self.c.gender] property hyph_: def __get__(self): - return self.vocab.strings[self.c.hyph_] + return self.vocab.strings[self.c.hyph] property inf_form_: def __get__(self): - return self.vocab.strings[self.c.inf_form_] + return self.vocab.strings[self.c.inf_form] property name_type_: def __get__(self): - return self.vocab.strings[self.c.name_type_] + return self.vocab.strings[self.c.name_type] property negative_: def __get__(self): - return self.vocab.strings[self.c.negative_] + return self.vocab.strings[self.c.negative] property mood_: def __get__(self): - return self.vocab.strings[self.c.mood_] + return self.vocab.strings[self.c.mood] property number_: def __get__(self): - return self.vocab.strings[self.c.number_] + return self.vocab.strings[self.c.number] property num_form_: def __get__(self): - return self.vocab.strings[self.c.num_form_] + return self.vocab.strings[self.c.num_form] property num_type_: def __get__(self): - return self.vocab.strings[self.c.num_type_] + return self.vocab.strings[self.c.num_type] property num_value_: def __get__(self): - return self.vocab.strings[self.c.num_value_] + return self.vocab.strings[self.c.num_value] property part_form_: def __get__(self): - return self.vocab.strings[self.c.part_form_] + return self.vocab.strings[self.c.part_form] property part_type_: def __get__(self): - return self.vocab.strings[self.c.part_type_] + return self.vocab.strings[self.c.part_type] property person_: def __get__(self): - return self.vocab.strings[self.c.person_] + return self.vocab.strings[self.c.person] property polite_: def __get__(self): - return self.vocab.strings[self.c.polite_] + return self.vocab.strings[self.c.polite] property polarity_: def __get__(self): - return self.vocab.strings[self.c.polarity_] + return self.vocab.strings[self.c.polarity] property poss_: def __get__(self): - return self.vocab.strings[self.c.poss_] + return self.vocab.strings[self.c.poss] property prefix_: def __get__(self): - return self.vocab.strings[self.c.prefix_] + return self.vocab.strings[self.c.prefix] property prep_case_: def __get__(self): - return 
self.vocab.strings[self.c.prep_case_] + return self.vocab.strings[self.c.prep_case] property pron_type_: def __get__(self): - return self.vocab.strings[self.c.pron_type_] + return self.vocab.strings[self.c.pron_type] property punct_side_: def __get__(self): - return self.vocab.strings[self.c.punct_side_] + return self.vocab.strings[self.c.punct_side] property punct_type_: def __get__(self): - return self.vocab.strings[self.c.punct_type_] + return self.vocab.strings[self.c.punct_type] property reflex_: def __get__(self): - return self.vocab.strings[self.c.reflex_] + return self.vocab.strings[self.c.reflex] property style_: def __get__(self): - return self.vocab.strings[self.c.style_] + return self.vocab.strings[self.c.style] property style_variant_: def __get__(self): - return self.vocab.strings[self.c.style_variant_] + return self.vocab.strings[self.c.style_variant] property tense_: def __get__(self): - return self.vocab.strings[self.c.tense_] + return self.vocab.strings[self.c.tense] property typo_: def __get__(self): - return self.vocab.strings[self.c.typo_] + return self.vocab.strings[self.c.typo] property verb_form_: def __get__(self): - return self.vocab.strings[self.c.verb_form_] + return self.vocab.strings[self.c.verb_form] property voice_: def __get__(self): - return self.vocab.strings[self.c.voice_] + return self.vocab.strings[self.c.voice] property verb_type_: def __get__(self): - return self.vocab.strings[self.c.verb_type_] + return self.vocab.strings[self.c.verb_type] From 19e6b39786964d5c4c401fe9bf8c03dacda11dd5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Fri, 8 Mar 2019 01:38:54 +0100 Subject: [PATCH 072/207] Test morphological features --- spacy/tests/doc/test_morphanalysis.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/spacy/tests/doc/test_morphanalysis.py b/spacy/tests/doc/test_morphanalysis.py index 31c765e32..ffee5694a 100644 --- a/spacy/tests/doc/test_morphanalysis.py +++ b/spacy/tests/doc/test_morphanalysis.py @@ -22,9 +22,14 @@ def test_token_morph_id(i_has): def test_morph_props(i_has): assert i_has[0].morph.pron_type == i_has.vocab.strings["PronType_prs"] + assert i_has[0].morph.pron_type_ == "PronType_prs" assert i_has[1].morph.pron_type == 0 def test_morph_iter(i_has): assert list(i_has[0].morph) == ["PronType_prs"] assert list(i_has[1].morph) == ["Number_sing", "Person_three", "VerbForm_fin"] + + +def test_morph_get(i_has): + assert i_has[0].morph.get("pron_type") == "PronType_prs" From ad834be494f8074e93a6cb9fb76e6215b4e35307 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Fri, 8 Mar 2019 13:28:53 +0100 Subject: [PATCH 073/207] Tidy up and auto-format --- spacy/_ml.py | 81 +++++++++---------- spacy/errors.py | 1 - spacy/tests/doc/test_morphanalysis.py | 8 +- spacy/tests/morphology/test_morph_features.py | 29 ++++--- 4 files changed, 61 insertions(+), 58 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 422bbe66a..68dedc0b3 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -465,17 +465,16 @@ def getitem(i): @describe.attributes( - W=Synapses("Weights matrix", - lambda obj: (obj.nO, obj.nI), - lambda W, ops: None) + W=Synapses("Weights matrix", lambda obj: (obj.nO, obj.nI), lambda W, ops: None) ) class MultiSoftmax(Affine): - '''Neural network layer that predicts several multi-class attributes at once. + """Neural network layer that predicts several multi-class attributes at once. For instance, we might predict one class with 6 variables, and another with 5. 
We predict the 11 neurons required for this, and then softmax them such that columns 0-6 make a probability distribution and coumns 6-11 make another. - ''' - name = 'multisoftmax' + """ + + name = "multisoftmax" def __init__(self, out_sizes, nI=None, **kwargs): Model.__init__(self, **kwargs) @@ -487,12 +486,13 @@ class MultiSoftmax(Affine): output__BO = self.ops.affine(self.W, self.b, input__BI) i = 0 for out_size in self.out_sizes: - self.ops.softmax(output__BO[:, i : i+out_size], inplace=True) + self.ops.softmax(output__BO[:, i : i + out_size], inplace=True) i += out_size return output__BO - def begin_update(self, input__BI, drop=0.): + def begin_update(self, input__BI, drop=0.0): output__BO = self.predict(input__BI) + def finish_update(grad__BO, sgd=None): self.d_W += self.ops.gemm(grad__BO, input__BI, trans1=True) self.d_b += grad__BO.sum(axis=0) @@ -500,6 +500,7 @@ class MultiSoftmax(Affine): if sgd is not None: sgd(self._mem.weights, self._mem.gradient, key=self.id) return grad__BI + return output__BO, finish_update @@ -515,41 +516,41 @@ def build_tagger_model(nr_class, **cfg): if "tok2vec" in cfg: tok2vec = cfg["tok2vec"] else: - tok2vec = Tok2Vec(token_vector_width, embed_size, - subword_features=subword_features, - pretrained_vectors=pretrained_vectors) - softmax = with_flatten( - Softmax(nr_class, token_vector_width)) - model = ( - tok2vec - >> softmax - ) + tok2vec = Tok2Vec( + token_vector_width, + embed_size, + subword_features=subword_features, + pretrained_vectors=pretrained_vectors, + ) + softmax = with_flatten(Softmax(nr_class, token_vector_width)) + model = tok2vec >> softmax model.nI = None model.tok2vec = tok2vec model.softmax = softmax return model + def build_morphologizer_model(class_nums, **cfg): - embed_size = util.env_opt('embed_size', 7000) - if 'token_vector_width' in cfg: - token_vector_width = cfg['token_vector_width'] + embed_size = util.env_opt("embed_size", 7000) + if "token_vector_width" in cfg: + token_vector_width = cfg["token_vector_width"] else: - token_vector_width = util.env_opt('token_vector_width', 128) - pretrained_vectors = cfg.get('pretrained_vectors') - subword_features = cfg.get('subword_features', True) - with Model.define_operators({'>>': chain, '+': add}): - if 'tok2vec' in cfg: - tok2vec = cfg['tok2vec'] + token_vector_width = util.env_opt("token_vector_width", 128) + pretrained_vectors = cfg.get("pretrained_vectors") + subword_features = cfg.get("subword_features", True) + with Model.define_operators({">>": chain, "+": add}): + if "tok2vec" in cfg: + tok2vec = cfg["tok2vec"] else: - tok2vec = Tok2Vec(token_vector_width, embed_size, - subword_features=subword_features, - pretrained_vectors=pretrained_vectors) + tok2vec = Tok2Vec( + token_vector_width, + embed_size, + subword_features=subword_features, + pretrained_vectors=pretrained_vectors, + ) softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width)) softmax.out_sizes = class_nums - model = ( - tok2vec - >> softmax - ) + model = tok2vec >> softmax model.nI = None model.tok2vec = tok2vec model.softmax = softmax @@ -630,17 +631,13 @@ def build_text_classifier(nr_class, width=64, **cfg): ) linear_model = _preprocess_doc >> LinearModel(nr_class) - if cfg.get('exclusive_classes'): + if cfg.get("exclusive_classes"): output_layer = Softmax(nr_class, nr_class * 2) else: output_layer = ( - zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0)) - >> logistic + zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0)) >> logistic ) - model = ( - (linear_model | cnn_model) - >> 
output_layer - ) + model = (linear_model | cnn_model) >> output_layer model.tok2vec = chain(tok2vec, flatten) model.nO = nr_class model.lsuv = False @@ -658,7 +655,9 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, if exclusive_classes: output_layer = Softmax(nr_class, tok2vec.nO) else: - output_layer = zero_init(Affine(nr_class, tok2vec.nO, drop_factor=0.0)) >> logistic + output_layer = ( + zero_init(Affine(nr_class, tok2vec.nO, drop_factor=0.0)) >> logistic + ) model = tok2vec >> flatten_add_lengths >> Pooling(mean_pool) >> output_layer model.tok2vec = chain(tok2vec, flatten) model.nO = nr_class diff --git a/spacy/errors.py b/spacy/errors.py index 13382d146..f9dd8535e 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -350,7 +350,6 @@ class Errors(object): "is likely a bug in spaCy.") - @add_codes class TempErrors(object): T003 = ("Resizing pre-trained Tagger models is not currently supported.") diff --git a/spacy/tests/doc/test_morphanalysis.py b/spacy/tests/doc/test_morphanalysis.py index ffee5694a..5d570af53 100644 --- a/spacy/tests/doc/test_morphanalysis.py +++ b/spacy/tests/doc/test_morphanalysis.py @@ -2,11 +2,7 @@ from __future__ import unicode_literals import pytest -import numpy -from spacy.attrs import IS_ALPHA, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_TITLE, IS_STOP -from spacy.symbols import VERB -from spacy.vocab import Vocab -from spacy.tokens import Doc + @pytest.fixture def i_has(en_tokenizer): @@ -15,11 +11,13 @@ def i_has(en_tokenizer): doc[1].tag_ = "VBZ" return doc + def test_token_morph_id(i_has): assert i_has[0].morph.id assert i_has[1].morph.id != 0 assert i_has[0].morph.id != i_has[1].morph.id + def test_morph_props(i_has): assert i_has[0].morph.pron_type == i_has.vocab.strings["PronType_prs"] assert i_has[0].morph.pron_type_ == "PronType_prs" diff --git a/spacy/tests/morphology/test_morph_features.py b/spacy/tests/morphology/test_morph_features.py index dcb0b32ff..4b8f0d754 100644 --- a/spacy/tests/morphology/test_morph_features.py +++ b/spacy/tests/morphology/test_morph_features.py @@ -1,41 +1,48 @@ +# coding: utf-8 from __future__ import unicode_literals -import pytest -from ...morphology import Morphology -from ...strings import StringStore, get_string_id -from ...lemmatizer import Lemmatizer -from ...morphology import * +import pytest +from spacy.morphology import Morphology +from spacy.strings import StringStore, get_string_id +from spacy.lemmatizer import Lemmatizer + @pytest.fixture def morphology(): return Morphology(StringStore(), {}, Lemmatizer()) + def test_init(morphology): pass + def test_add_morphology_with_string_names(morphology): morphology.add({"Case_gen", "Number_sing"}) + def test_add_morphology_with_int_ids(morphology): morphology.add({get_string_id("Case_gen"), get_string_id("Number_sing")}) + def test_add_morphology_with_mix_strings_and_ints(morphology): - morphology.add({get_string_id("PunctSide_ini"), 'VerbType_aux'}) + morphology.add({get_string_id("PunctSide_ini"), "VerbType_aux"}) def test_morphology_tags_hash_distinctly(morphology): - tag1 = morphology.add({"PunctSide_ini", 'VerbType_aux'}) - tag2 = morphology.add({"Case_gen", 'Number_sing'}) + tag1 = morphology.add({"PunctSide_ini", "VerbType_aux"}) + tag2 = morphology.add({"Case_gen", "Number_sing"}) assert tag1 != tag2 + def test_morphology_tags_hash_independent_of_order(morphology): - tag1 = morphology.add({"Case_gen", 'Number_sing'}) - tag2 = morphology.add({"Number_sing", "Case_gen"}) + tag1 = morphology.add({"Case_gen", "Number_sing"}) + tag2 = 
morphology.add({"Number_sing", "Case_gen"}) assert tag1 == tag2 + def test_update_morphology_tag(morphology): tag1 = morphology.add({"Case_gen"}) tag2 = morphology.update(tag1, {"Number_sing"}) assert tag1 != tag2 - tag3 = morphology.add({"Number_sing", "Case_gen"}) + tag3 = morphology.add({"Number_sing", "Case_gen"}) assert tag2 == tag3 From 3908911da477a85073eb8ac607bc9ef7f60ccd7e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Fri, 8 Mar 2019 17:04:14 +0100 Subject: [PATCH 074/207] Fix import --- spacy/pipeline/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py index 686743a6a..83c0c65d0 100644 --- a/spacy/pipeline/__init__.py +++ b/spacy/pipeline/__init__.py @@ -1,8 +1,9 @@ # coding: utf8 from __future__ import unicode_literals -from .pipes import Tagger, DependencyParser, EntityRecognizer, Morphologizer +from .pipes import Tagger, DependencyParser, EntityRecognizer from .pipes import TextCategorizer, Tensorizer, Pipe +from .morphologizer import Morphologizer from .entityruler import EntityRuler from .hooks import SentenceSegmenter, SimilarityHook from .functions import merge_entities, merge_noun_chunks, merge_subtokens From d7ec1d62cb6c711d94c440492aa329047f6068d4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Fri, 8 Mar 2019 18:54:25 +0100 Subject: [PATCH 075/207] Fix Morphologizer --- spacy/cli/ud/ud_train.py | 2 +- spacy/morphology.pyx | 8 +++++--- spacy/pipeline/morphologizer.pyx | 14 +++++++++++--- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/spacy/cli/ud/ud_train.py b/spacy/cli/ud/ud_train.py index afef6c073..44ecababe 100644 --- a/spacy/cli/ud/ud_train.py +++ b/spacy/cli/ud/ud_train.py @@ -299,7 +299,7 @@ def get_token_conllu(token, i): head = 0 else: head = i + (token.head.i - token.i) + 1 - features = token.vocab.morphology.get(token.morph_key) + features = list(token.morph) feat_str = [] replacements = {"one": "1", "two": "2", "three": "3"} for feat in features: diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 63d0291ff..2f3e8d1fa 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -107,11 +107,11 @@ def get_field_id(feature): def get_field_size(field): - return FIELD_SIZES[field] + return FIELD_SIZES[FIELDS[field]] def get_field_offset(field): - return FIELD_OFFSETS[field] + return FIELD_OFFSETS[FIELDS[]]]]field] cdef class Morphology: @@ -831,6 +831,8 @@ FEATURES = [ "Aspect_mod", "Aspect_none", "Aspect_perf", + "Aspect_prof", + "Aspect_prosp", "Case_abe", "Case_abl", "Case_abs", @@ -1074,6 +1076,6 @@ _seen_fields = Counter() for i, feature in enumerate(FEATURES): field = FEATURE_FIELDS[feature] FEATURE_OFFSETS[feature] = _seen_fields[field] - if _seen_fields == 0: + if _seen_fields[field] == 0: FIELD_OFFSETS[field] = i _seen_fields[field] += 1 diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 9f25ba357..223bb6ec5 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -81,16 +81,18 @@ class Morphologizer(Pipe): doc_scores = batch_scores[i] doc_guesses = scores_to_guesses(doc_scores, self.model.softmax.out_sizes) # Convert the neuron indices into feature IDs. 
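(To make the offset arithmetic in this set_annotations hunk easier to follow, here is a minimal standalone sketch of the idea — plain NumPy with made-up field sizes and variable names, not the actual spaCy helpers — showing how per-field argmax guesses become columns of one concatenated feature table:)

import numpy

# Hypothetical field sizes (each block includes its own NIL column) and the
# column where each field's block starts in the concatenated output.
field_sizes = [3, 4]
field_offsets = [0, 3]

scores = numpy.random.rand(2, sum(field_sizes))   # 2 tokens x 7 softmaxed columns

guesses = numpy.zeros((2, len(field_sizes)), dtype="i")
for k, (offset, size) in enumerate(zip(field_offsets, field_sizes)):
    # argmax within each field's block, as scores_to_guesses does per field
    guesses[:, k] = scores[:, offset : offset + size].argmax(axis=1)

# Turn the within-field guesses into columns of the whole table; 0 means the
# field's NIL class, so it is left as "no feature for this field".
feat_ids = numpy.where(guesses == 0, 0, guesses + numpy.array(field_offsets))
print(guesses, feat_ids, sep="\n")

(In the real code the offsets come from get_field_offset and the per-field argmaxes from scores_to_guesses; the sketch only illustrates the mapping.)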
- doc_feat_ids = self.model.ops.allocate((len(doc), len(field_names)), dtype='i') + doc_feat_ids = numpy.zeros((len(doc), len(field_names)), dtype='i') for j in range(len(doc)): for k, offset in enumerate(offsets): if doc_guesses[j, k] == 0: doc_feat_ids[j, k] = 0 else: doc_feat_ids[j, k] = offset + doc_guesses[j, k] + # Get the set of feature names. + feats = {FEATURES[f] for f in doc_feat_ids[j] if f != 0} # Now add the analysis, and set the hash. try: - doc.c[j].morph = self.vocab.morphology.add(doc_feat_ids[j]) + doc.c[j].morph = self.vocab.morphology.add(feats) except: print(offsets) print(doc_guesses[j]) @@ -114,7 +116,12 @@ class Morphologizer(Pipe): guesses.append(scores_to_guesses(doc_scores, self.model.softmax.out_sizes)) guesses = self.model.ops.xp.vstack(guesses) scores = self.model.ops.xp.vstack(scores) + if not isinstance(scores, numpy.ndarray): + scores = scores.get() + if not isinstance(guesses, numpy.ndarray): + guesses = guesses.get() cdef int idx = 0 + # Do this on CPU, as we can't vectorize easily. target = numpy.zeros(scores.shape, dtype='f') field_sizes = self.model.softmax.out_sizes for gold in golds: @@ -134,7 +141,8 @@ class Morphologizer(Pipe): target[idx, col_offset] = 1. col_offset += field_size idx += 1 - target = self.model.ops.xp.array(target, dtype='f') + target = self.model.ops.asarray(target, dtype='f') + scores = self.model.ops.asarray(scores, dtype='f') d_scores = scores - target loss = (d_scores**2).sum() d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) From 09b26f5e2e72923fbc4b8d14dbdb0a51d0793931 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Fri, 8 Mar 2019 18:58:26 +0100 Subject: [PATCH 076/207] Fix compile error --- spacy/morphology.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 2f3e8d1fa..cead0756b 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -111,7 +111,7 @@ def get_field_size(field): def get_field_offset(field): - return FIELD_OFFSETS[FIELDS[]]]]field] + return FIELD_OFFSETS[FIELDS[field]] cdef class Morphology: From 49cf002ac440cd07c3d41ba5f318a5441fa60394 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Fri, 8 Mar 2019 18:59:25 +0100 Subject: [PATCH 077/207] Add missing import --- spacy/pipeline/morphologizer.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 223bb6ec5..7d0ad42cf 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -17,6 +17,7 @@ from ..tokens.doc cimport Doc from ..vocab cimport Vocab from ..morphology cimport Morphology from ..morphology import get_field_size, get_field_offset, parse_feature, FIELDS +from ..morphology import FEATURES class Morphologizer(Pipe): From c91577db028e3343e7280f0614f7bd89451f93f0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Fri, 8 Mar 2019 19:03:17 +0100 Subject: [PATCH 078/207] Add set_morphology cfg option for Tagger --- spacy/pipeline/pipes.pyx | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index b3c3db04d..237d36a12 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -357,6 +357,14 @@ class Tagger(Pipe): self.cfg = OrderedDict(sorted(cfg.items())) self.cfg.setdefault("cnn_maxout_pieces", 2) + @property + def set_morphology(self): + return self.cfg.get("set_morphology", 
True) + + @property.setter + def set_morphology(self, value): + return self.cfg["set_morphology"] = True + @property def labels(self): return tuple(self.vocab.morphology.tag_names) @@ -410,12 +418,13 @@ class Tagger(Pipe): doc_tag_ids = doc_tag_ids.get() for j, tag_id in enumerate(doc_tag_ids): # Don't clobber preset POS tags - if doc.c[j].tag == 0 and doc.c[j].pos == 0: - # Don't clobber preset lemmas - lemma = doc.c[j].lemma - vocab.morphology.assign_tag_id(&doc.c[j], tag_id) - if lemma != 0 and lemma != doc.c[j].lex.orth: - doc.c[j].lemma = lemma + if doc.c[j].tag == 0: + if doc.c[j].pos == 0 and self.set_morphology: + # Don't clobber preset lemmas + lemma = doc.c[j].lemma + vocab.morphology.assign_tag_id(&doc.c[j], tag_id) + if lemma != 0 and lemma != doc.c[j].lex.orth: + doc.c[j].lemma = lemma idx += 1 if tensors is not None and len(tensors): if isinstance(doc.tensor, numpy.ndarray) \ From 27886d626f37eddca95a84b3f05138bfa3a56c8f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Fri, 8 Mar 2019 19:03:31 +0100 Subject: [PATCH 079/207] Dont set morphology in Tagger for ud_train --- spacy/cli/ud/ud_train.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/cli/ud/ud_train.py b/spacy/cli/ud/ud_train.py index 44ecababe..f172c8d78 100644 --- a/spacy/cli/ud/ud_train.py +++ b/spacy/cli/ud/ud_train.py @@ -342,9 +342,10 @@ def load_nlp(corpus, config, vectors=None): def initialize_pipeline(nlp, docs, golds, config, device): - nlp.add_pipe(nlp.create_pipe("tagger")) + nlp.add_pipe(nlp.create_pipe("tagger", set_morphology=False)) nlp.add_pipe(nlp.create_pipe("morphologizer")) nlp.add_pipe(nlp.create_pipe("parser")) + assert not nlp.get_pipe("tagger").set_morphology if config.multitask_tag: nlp.parser.add_multitask_objective("tag") if config.multitask_sent: From b27bd42613aa98a9a9625f6b44b98fce94f630f2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Fri, 8 Mar 2019 19:06:02 +0100 Subject: [PATCH 080/207] Fix compile error --- spacy/pipeline/pipes.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 237d36a12..1d428731e 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -363,7 +363,7 @@ class Tagger(Pipe): @property.setter def set_morphology(self, value): - return self.cfg["set_morphology"] = True + self.cfg["set_morphology"] = True @property def labels(self): From afa227e25b64213de7ee465c2a76b5fedc2985ef Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Fri, 8 Mar 2019 19:10:01 +0100 Subject: [PATCH 081/207] Fix setter --- spacy/pipeline/pipes.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 1d428731e..52cfe0b44 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -361,9 +361,9 @@ class Tagger(Pipe): def set_morphology(self): return self.cfg.get("set_morphology", True) - @property.setter + @set_morphology.setter def set_morphology(self, value): - self.cfg["set_morphology"] = True + self.cfg["set_morphology"] = value @property def labels(self): From cc2b2dba146d1bb51d6d555041dc89ff172f6485 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Fri, 8 Mar 2019 19:16:02 +0100 Subject: [PATCH 082/207] Neaten set_morphology option on Tagger --- spacy/cli/ud/ud_train.py | 1 - spacy/pipeline/pipes.pyx | 11 ++--------- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git 
a/spacy/cli/ud/ud_train.py b/spacy/cli/ud/ud_train.py index f172c8d78..b10b50403 100644 --- a/spacy/cli/ud/ud_train.py +++ b/spacy/cli/ud/ud_train.py @@ -345,7 +345,6 @@ def initialize_pipeline(nlp, docs, golds, config, device): nlp.add_pipe(nlp.create_pipe("tagger", set_morphology=False)) nlp.add_pipe(nlp.create_pipe("morphologizer")) nlp.add_pipe(nlp.create_pipe("parser")) - assert not nlp.get_pipe("tagger").set_morphology if config.multitask_tag: nlp.parser.add_multitask_objective("tag") if config.multitask_sent: diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 52cfe0b44..fa90603bc 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -357,14 +357,6 @@ class Tagger(Pipe): self.cfg = OrderedDict(sorted(cfg.items())) self.cfg.setdefault("cnn_maxout_pieces", 2) - @property - def set_morphology(self): - return self.cfg.get("set_morphology", True) - - @set_morphology.setter - def set_morphology(self, value): - self.cfg["set_morphology"] = value - @property def labels(self): return tuple(self.vocab.morphology.tag_names) @@ -412,6 +404,7 @@ class Tagger(Pipe): cdef Doc doc cdef int idx = 0 cdef Vocab vocab = self.vocab + assign_morphology = self.cfg.get("set_morphology", True) for i, doc in enumerate(docs): doc_tag_ids = batch_tag_ids[i] if hasattr(doc_tag_ids, "get"): @@ -419,7 +412,7 @@ class Tagger(Pipe): for j, tag_id in enumerate(doc_tag_ids): # Don't clobber preset POS tags if doc.c[j].tag == 0: - if doc.c[j].pos == 0 and self.set_morphology: + if doc.c[j].pos == 0 and assign_morphology: # Don't clobber preset lemmas lemma = doc.c[j].lemma vocab.morphology.assign_tag_id(&doc.c[j], tag_id) From c4df89ab908d0b46e7b5c735d854a54011e5cf30 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Sat, 9 Mar 2019 00:20:11 +0000 Subject: [PATCH 083/207] Fixes for morphologizer --- spacy/cli/ud/ud_train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/ud/ud_train.py b/spacy/cli/ud/ud_train.py index f172c8d78..d94d05755 100644 --- a/spacy/cli/ud/ud_train.py +++ b/spacy/cli/ud/ud_train.py @@ -304,7 +304,7 @@ def get_token_conllu(token, i): replacements = {"one": "1", "two": "2", "three": "3"} for feat in features: if not feat.startswith("begin") and not feat.startswith("end"): - key, value = feat.split("_") + key, value = feat.split("_", 1) value = replacements.get(value, value) feat_str.append("%s=%s" % (key, value.title())) if not feat_str: @@ -342,7 +342,7 @@ def load_nlp(corpus, config, vectors=None): def initialize_pipeline(nlp, docs, golds, config, device): - nlp.add_pipe(nlp.create_pipe("tagger", set_morphology=False)) + nlp.add_pipe(nlp.create_pipe("tagger", config={"set_morphology": False})) nlp.add_pipe(nlp.create_pipe("morphologizer")) nlp.add_pipe(nlp.create_pipe("parser")) assert not nlp.get_pipe("tagger").set_morphology From 42bc3ad73bbf4434b799e341903dcea7d9cd8401 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Sat, 9 Mar 2019 00:20:29 +0000 Subject: [PATCH 084/207] Fix class mapping for morphologizer --- spacy/morphology.pyx | 10 +++++++--- spacy/pipeline/morphologizer.pyx | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index cead0756b..eab7c9e31 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -1066,16 +1066,20 @@ FEATURES = [ FEATURE_NAMES = {get_string_id(name): name for name in FEATURES} FEATURE_FIELDS = {feature: FIELDS[feature.split('_', 1)[0]] for feature in FEATURES} 
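(For readers skimming the diff, the following is a condensed, runnable restatement of the bookkeeping this hunk sets up, on a toy feature list; the point is that every field reserves an extra NIL column, so feature offsets within a field start at 1:)

from collections import Counter

# Toy feature list in "Field_value" form; the real FEATURES list is much longer.
FEATURES = ["Case_nom", "Case_gen", "Number_sing", "Number_plur", "Number_ptan"]
FEATURE_FIELDS = {f: f.split("_", 1)[0] for f in FEATURES}

# One extra slot per field for the NIL ("no value for this field") class.
FIELD_SIZES = Counter(FEATURE_FIELDS.values())
for field in FIELD_SIZES:
    FIELD_SIZES[field] += 1

FEATURE_OFFSETS = {}  # column of each feature inside its field (NIL takes column 0)
FIELD_OFFSETS = {}    # index in FEATURES of each field's first feature
seen = Counter()
for i, feature in enumerate(FEATURES):
    field = FEATURE_FIELDS[feature]
    FEATURE_OFFSETS[feature] = seen[field] + 1
    if seen[field] == 0:
        FIELD_OFFSETS[field] = i
    seen[field] += 1

print(FIELD_SIZES)      # Counter({'Number': 4, 'Case': 3})
print(FEATURE_OFFSETS)  # Case_nom 1, Case_gen 2, Number_sing 1, ...
print(FIELD_OFFSETS)    # {'Case': 0, 'Number': 2}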
+FIELD_SIZES = Counter(FEATURE_FIELDS.values()) +for field in FIELD_SIZES: + FIELD_SIZES[field] += 1 for feat_id, name in FEATURE_NAMES.items(): FEATURE_FIELDS[feat_id] = FEATURE_FIELDS[name] - -FIELD_SIZES = Counter(FEATURE_FIELDS.values()) +# Mapping of feature names to their position in total vector FEATURE_OFFSETS = {} +# Mapping of field names to their first position in total vector. FIELD_OFFSETS = {} _seen_fields = Counter() for i, feature in enumerate(FEATURES): field = FEATURE_FIELDS[feature] - FEATURE_OFFSETS[feature] = _seen_fields[field] + # Add 1 for the NIL class, on each field + FEATURE_OFFSETS[feature] = _seen_fields[field] + 1 if _seen_fields[field] == 0: FIELD_OFFSETS[field] = i _seen_fields[field] += 1 diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 7d0ad42cf..589373f80 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -88,7 +88,7 @@ class Morphologizer(Pipe): if doc_guesses[j, k] == 0: doc_feat_ids[j, k] = 0 else: - doc_feat_ids[j, k] = offset + doc_guesses[j, k] + doc_feat_ids[j, k] = offset + (doc_guesses[j, k]-1) # Get the set of feature names. feats = {FEATURES[f] for f in doc_feat_ids[j] if f != 0} # Now add the analysis, and set the hash. From 4c8730526bd3d538db350b2f913c2510bf583b10 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Sat, 9 Mar 2019 00:41:34 +0000 Subject: [PATCH 085/207] Filter bad retokenizations --- spacy/cli/ud/ud_train.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/spacy/cli/ud/ud_train.py b/spacy/cli/ud/ud_train.py index d94d05755..fb2003d31 100644 --- a/spacy/cli/ud/ud_train.py +++ b/spacy/cli/ud/ud_train.py @@ -231,9 +231,14 @@ def write_conllu(docs, file_): for i, doc in enumerate(docs): matches = merger(doc) spans = [doc[start : end + 1] for _, start, end in matches] + seen_tokens = set() with doc.retokenize() as retokenizer: for span in spans: - retokenizer.merge(span) + span_tokens = set(range(span.start, span.end)) + if not span_tokens.intersection(seen_tokens): + retokenizer.merge(span) + seen_tokens.update(span_tokens) + file_.write("# newdoc id = {i}\n".format(i=i)) for j, sent in enumerate(doc.sents): file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j)) From eae384ebb2ac6c1b32f97b09a9118ae286659da2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Sat, 9 Mar 2019 11:49:44 +0000 Subject: [PATCH 086/207] Add POS to morphological fields --- spacy/morphology.pyx | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index eab7c9e31..6b1b7fc27 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -18,6 +18,7 @@ from .errors import Errors cdef enum univ_field_t: + Field_POS Field_Abbr Field_AdpType Field_AdvType @@ -429,6 +430,8 @@ cdef list list_features(const MorphAnalysisC* tag): cdef attr_t get_field(const MorphAnalysisC* tag, int field_id) nogil: field = <univ_field_t>field_id + if field == Field_POS: + return tag.pos if field == Field_Abbr: return tag.abbr elif field == Field_AdpType: @@ -617,12 +620,14 @@ cdef int set_feature(MorphAnalysisC* tag, else: value_ = 0 prev_value = get_field(tag, field) - if prev_value != 0 and value_ == 0: + if prev_value != 0 and value_ == 0 and field != Field_POS: tag.length -= 1 - elif prev_value == 0 and value_ != 0: + elif prev_value == 0 and value_ != 0 and field != Field_POS: tag.length += 1 if feature == 0: pass + elif field == Field_POS: 
+ tag.pos = get_string_id(FEATURE_NAMES[value_].split('_')[1]) elif field == Field_Abbr: tag.abbr = value_ elif field == Field_AdpType: @@ -714,6 +719,7 @@ cdef int set_feature(MorphAnalysisC* tag, FIELDS = { + 'POS': Field_POS, 'Abbr': Field_Abbr, 'AdpType': Field_AdpType, 'AdvType': Field_AdvType, @@ -760,6 +766,7 @@ FIELDS = { } LOWER_FIELDS = { + 'pos': Field_POS, 'abbr': Field_Abbr, 'adp_type': Field_AdpType, 'adv_type': Field_AdvType, @@ -807,6 +814,26 @@ LOWER_FIELDS = { FEATURES = [ + "POS_ADJ", + "POS_ADP", + "POS_ADV", + "POS_AUX", + "POS_CONJ", + "POS_CCONJ", + "POS_DET", + "POS_INTJ", + "POS_NOUN", + "POS_NUM", + "POS_PART", + "POS_PRON", + "POS_PROPN", + "POS_PUNCT", + "POS_SCONJ", + "POS_SYM", + "POS_VERB", + "POS_X", + "POS_EOL", + "POS_SPACE", "Abbr_yes", "AdpType_circ", "AdpType_comprep", @@ -1064,7 +1091,6 @@ FEATURES = [ ] FEATURE_NAMES = {get_string_id(name): name for name in FEATURES} - FEATURE_FIELDS = {feature: FIELDS[feature.split('_', 1)[0]] for feature in FEATURES} FIELD_SIZES = Counter(FEATURE_FIELDS.values()) for field in FIELD_SIZES: From e1a83d15ed53a0bc9779182bdf1732cd6f722918 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Sat, 9 Mar 2019 11:50:08 +0000 Subject: [PATCH 087/207] Add support for character features to Tok2Vec --- spacy/_ml.py | 103 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 94 insertions(+), 9 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 68dedc0b3..85d80c3f1 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -15,7 +15,7 @@ from thinc.api import uniqued, wrap, noop from thinc.api import with_square_sequences from thinc.linear.linear import LinearModel from thinc.neural.ops import NumpyOps, CupyOps -from thinc.neural.util import get_array_module +from thinc.neural.util import get_array_module, copy_array from thinc.neural.optimizers import Adam from thinc import describe @@ -273,6 +273,9 @@ def Tok2Vec(width, embed_size, **kwargs): pretrained_vectors = kwargs.get("pretrained_vectors", None) cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3) subword_features = kwargs.get("subword_features", True) + char_embed = kwargs.get("char_embed", False) + if char_embed: + subword_features = False conv_depth = kwargs.get("conv_depth", 4) bilstm_depth = kwargs.get("bilstm_depth", 0) cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] @@ -295,7 +298,7 @@ def Tok2Vec(width, embed_size, **kwargs): if pretrained_vectors is not None: glove = StaticVectors(pretrained_vectors, width, column=cols.index(ID)) - if subword_features: + if subword_features: embed = uniqued( (glove | norm | prefix | suffix | shape) >> LN(Maxout(width, width * 5, pieces=3)), @@ -310,8 +313,14 @@ def Tok2Vec(width, embed_size, **kwargs): embed = uniqued( (norm | prefix | suffix | shape) >> LN(Maxout(width, width * 4, pieces=3)), - column=cols.index(ORTH), + column=cols.index(ORTH) ) + elif char_embed: + embed = concatenate_lists( + CharacterEmbed(nM=64, nC=8), + FeatureExtracter(cols) >> with_flatten(norm) + ) + reduce_dimensions = LN(Maxout(width, 64*8+width, pieces=cnn_maxout_pieces)) else: embed = norm @@ -319,9 +328,23 @@ def Tok2Vec(width, embed_size, **kwargs): ExtractWindow(nW=1) >> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces)) ) - tok2vec = FeatureExtracter(cols) >> with_flatten( - embed >> convolution ** conv_depth, pad=conv_depth - ) + if char_embed: + tok2vec = ( + embed + >> with_flatten( + reduce_dimensions + >> convolution ** conv_depth, pad=conv_depth + ) + ) + else: + tok2vec = ( + 
FeatureExtracter(cols) + >> with_flatten( + embed + >> convolution ** conv_depth, pad=conv_depth + ) + ) + if bilstm_depth >= 1: tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth) # Work around thinc API limitations :(. TODO: Revise in Thinc 7 @@ -537,7 +560,7 @@ def build_morphologizer_model(class_nums, **cfg): else: token_vector_width = util.env_opt("token_vector_width", 128) pretrained_vectors = cfg.get("pretrained_vectors") - subword_features = cfg.get("subword_features", True) + char_embed = cfg.get("char_embed", True) with Model.define_operators({">>": chain, "+": add}): if "tok2vec" in cfg: tok2vec = cfg["tok2vec"] @@ -545,7 +568,7 @@ def build_morphologizer_model(class_nums, **cfg): tok2vec = Tok2Vec( token_vector_width, embed_size, - subword_features=subword_features, + char_embed=char_embed, pretrained_vectors=pretrained_vectors, ) softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width)) @@ -688,7 +711,8 @@ def concatenate_lists(*layers, **kwargs): # pragma: no cover concat = concatenate(*layers) def concatenate_lists_fwd(Xs, drop=0.0): - drop *= drop_factor + if drop is not None: + drop *= drop_factor lengths = ops.asarray([len(X) for X in Xs], dtype="i") flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop) ys = ops.unflatten(flat_y, lengths) @@ -776,3 +800,64 @@ def _replace_word(word, random_words, mask="[MASK]"): return random_words.next() else: return word + + +def _uniform_init(lo, hi): + def wrapped(W, ops): + copy_array(W, ops.xp.random.uniform(lo, hi, W.shape)) + return wrapped + + +@describe.attributes( + nM=Dimension("Vector dimensions"), + nC=Dimension("Number of characters per word"), + vectors=Synapses("Embed matrix", + lambda obj: (obj.nC, obj.nV, obj.nM), + _uniform_init(-0.1, 0.1)), + d_vectors=Gradient("vectors") +) +class CharacterEmbed(Model): + def __init__(self, nM=None, nC=None, **kwargs): + Model.__init__(self, **kwargs) + self.nM = nM + self.nC = nC + + @property + def nO(self): + return self.nM * self.nC + + @property + def nV(self): + return 256 + + def begin_update(self, docs, drop=0.): + if not docs: + return [] + ids = [] + output = [] + weights = self.vectors + # This assists in indexing; it's like looping over this dimension. + # Still consider this weird witch craft...But thanks to Mark Neumann + # for the tip. + nCv = self.ops.xp.arange(self.nC) + for doc in docs: + doc_ids = doc.to_utf8_array(nr_char=self.nC) + doc_vectors = self.ops.allocate((len(doc), self.nC, self.nM)) + # Let's say I have a 2d array of indices, and a 3d table of data. What numpy + # incantation do I chant to get + # output[i, j, k] == data[j, ids[i, j], k]? 
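(The comment above asks which NumPy incantation yields output[i, j, k] == data[j, ids[i, j], k]; the next line answers it by indexing with an arange over the character axis. A tiny self-contained check, with made-up shapes, that the fancy-indexed result matches the naive loop:)

import numpy

N, nC, nV, nM = 3, 4, 7, 2                    # tokens, chars/word, "vocab", vector width
data = numpy.random.rand(nC, nV, nM)          # plays the role of the embedding table
ids = numpy.random.randint(0, nV, (N, nC))    # plays the role of the utf8 char ids
nCv = numpy.arange(nC)

fancy = data[nCv, ids[:, nCv]]                # shape (N, nC, nM)
naive = numpy.array([[data[j, ids[i, j]] for j in range(nC)] for i in range(N)])
assert numpy.allclose(fancy, naive)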
+ doc_vectors[:, nCv] = weights[nCv, doc_ids[:, nCv]] + output.append(doc_vectors.reshape((len(doc), self.nO))) + ids.append(doc_ids) + + def backprop_character_embed(d_vectors, sgd=None): + gradient = self.d_vectors + for doc_ids, d_doc_vectors in zip(ids, d_vectors): + d_doc_vectors = d_doc_vectors.reshape((len(doc_ids), self.nC, self.nM)) + gradient[nCv, doc_ids[:, nCv]] += d_doc_vectors[:, nCv] + if sgd is not None: + sgd(self._mem.weights, self._mem.gradient, key=self.id) + return None + return output, backprop_character_embed + + From bba5f57f91635a254d60c6f517fc7933d21dad6e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Sat, 9 Mar 2019 11:50:27 +0000 Subject: [PATCH 088/207] Add method to export utf8 array to Doc --- spacy/tokens/doc.pyx | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 1dfcd1687..378921f3c 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1022,6 +1022,37 @@ cdef class Doc: data["_"][attr] = value return data + def to_utf8_array(self, int nr_char=-1): + """Encode word strings to utf8, and export to a fixed-width array + of characters. Characters are placed into the array in the order: + 0, -1, 1, -2, etc + For example, if the array is sliced array[:, :8], the array will + contain the first 4 characters and last 4 characters of each word --- + with the middle characters clipped out. The value 255 is used as a pad + value. + """ + byte_strings = [token.orth_.encode('utf8') for token in self] + if nr_char == -1: + nr_char = max(len(bs) for bs in byte_strings) + cdef np.ndarray output = numpy.zeros((len(byte_strings), nr_char), dtype='uint8') + output.fill(255) + cdef int i, j, start_idx, end_idx + cdef bytes byte_string + cdef unsigned char utf8_char + for i, byte_string in enumerate(byte_strings): + j = 0 + start_idx = 0 + end_idx = len(byte_string) - 1 + while j < nr_char and start_idx <= end_idx: + output[i, j] = <unsigned char>byte_string[start_idx] + start_idx += 1 + j += 1 + if j < nr_char and start_idx <= end_idx: + output[i, j] = <unsigned char>byte_string[end_idx] + end_idx -= 1 + j += 1 + return output + cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2: cdef int i From a6d153b0a0a9dcbefeca906cda37329ca15d7ce2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Sat, 9 Mar 2019 11:50:50 +0000 Subject: [PATCH 089/207] Add UPOS as morphological field in ud_train --- spacy/cli/ud/ud_train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/cli/ud/ud_train.py b/spacy/cli/ud/ud_train.py index 23f5a6820..5dcaa1684 100644 --- a/spacy/cli/ud/ud_train.py +++ b/spacy/cli/ud/ud_train.py @@ -84,6 +84,7 @@ def read_data( sent["words"].append(word) sent["tags"].append(tag) sent["morphology"].append(_parse_morph_string(morph)) + sent["morphology"][-1].add("POS_%s" % pos) sent["heads"].append(head) sent["deps"].append("ROOT" if dep == "root" else dep) sent["spaces"].append(space_after == "_") From f742900f83296362bbca371b9b08785141752809 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Sat, 9 Mar 2019 11:51:11 +0000 Subject: [PATCH 090/207] Set pos attribute in morphologizer --- spacy/pipeline/morphologizer.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 589373f80..9cb384a03 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -94,6 +94,8 
@@ class Morphologizer(Pipe): # Now add the analysis, and set the hash. try: doc.c[j].morph = self.vocab.morphology.add(feats) + if doc[j].morph.pos != 0: + doc.c[j].pos = doc[j].morph.pos except: print(offsets) print(doc_guesses[j]) From 41a3016019782d66620fe830bcd13a9f76fbf013 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Sat, 9 Mar 2019 20:55:33 +0100 Subject: [PATCH 091/207] Refactor morphologizer class map --- spacy/morphology.pxd | 2 +- spacy/morphology.pyx | 90 +++++++++++++++----------------- spacy/pipeline/morphologizer.pyx | 30 +++++------ spacy/tokens/morphanalysis.pyx | 3 +- 4 files changed, 58 insertions(+), 67 deletions(-) diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index cb708166c..1a3cedf97 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -20,6 +20,7 @@ cdef class Morphology: cdef readonly object tag_names cdef readonly object reverse_index cdef readonly object exc + cdef readonly object _feat_map cdef readonly PreshMapArray _cache cdef readonly int n_tags @@ -36,6 +37,5 @@ cdef class Morphology: cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil cdef attr_t get_field(const MorphAnalysisC* tag, int field) nogil cdef list list_features(const MorphAnalysisC* tag) -cdef int attribute_to_field(unicode attribute) cdef tag_to_json(const MorphAnalysisC* tag) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 6b1b7fc27..fdaa44813 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -93,26 +93,34 @@ def _normalize_props(props): return out -def parse_feature(feature): - field = FEATURE_FIELDS[feature] - offset = FEATURE_OFFSETS[feature] - return (field, offset) +class MorphologyClassMap(object): + def __init__(self, features, fields): + self.features = tuple(features) + self.fields = tuple(fields) + self.id2feat = {get_string_id(name): name for name in features} + self.feat2field = {feature: fields[feature.split('_', 1)[0]] for feature in features} + self.field2feats = {} + self.col2info = [] + self.attr2field = dict(LOWER_FIELDS.items()) + for feature in features: + field = self.feat2field[feature] + if field not in self.field2feats: + self.col2info.append((field, 0, 'NIL')) + self.field2feats.setdefault(field, []).append(feature) + self.col2info.append((field, len(self.field2feats[field]), feature)) + @property + def field_sizes(self): + return [len(self.field2feats[field]) for field in self.fields] -cdef int attribute_to_field(unicode attribute_name): - return LOWER_FIELDS[attribute_name] - - -def get_field_id(feature): - return FEATURE_FIELDS[feature] - - -def get_field_size(field): - return FIELD_SIZES[FIELDS[field]] - - -def get_field_offset(field): - return FIELD_OFFSETS[FIELDS[field]] + def get_field_offset(self, field): + n = 0 + for f in self.fields: + if f == field: + return n + n += len(self.field2feats[f]) + else: + return -1 cdef class Morphology: @@ -139,9 +147,11 @@ cdef class Morphology: self.lemmatizer = lemmatizer self.n_tags = len(tag_map) self.reverse_index = {} + self._feat_map = MorphologyClassMap(FEATURES, FIELDS) for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): attrs = _normalize_props(attrs) - self.add({FEATURE_NAMES[feat] for feat in attrs if feat in FEATURE_NAMES}) + self.add({self._feat_map.id2feat[feat] for feat in attrs + if feat in self._feat_map.id2feat}) self.tag_map[tag_str] = dict(attrs) self.reverse_index[self.strings.add(tag_str)] = i @@ -167,7 +177,7 @@ cdef class Morphology: features = intify_features(features) cdef attr_t 
feature for feature in features: - if feature != 0 and feature not in FEATURE_NAMES: + if feature != 0 and feature not in self._feat_map.id2feat: raise KeyError("Unknown feature: %s" % self.strings[feature]) cdef MorphAnalysisC tag tag = create_rich_tag(features) @@ -187,7 +197,7 @@ cdef class Morphology: features = intify_features(features) cdef attr_t feature for feature in features: - field = get_field_id(feature) + field = FEATURE_FIELDS[FEATURE_NAMES[feature]] set_feature(&tag, field, feature, 1) morph = self.insert(tag) return morph @@ -224,7 +234,8 @@ cdef class Morphology: """ attrs = dict(attrs) attrs = _normalize_props(attrs) - self.add({FEATURE_NAMES[feat] for feat in attrs if feat in FEATURE_NAMES}) + self.add({self._feat_map.id2feat[feat] for feat in attrs + if feat in self._feat_map.id2feat}) attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) self.exc[(tag_str, self.strings.add(orth_str))] = attrs @@ -313,6 +324,10 @@ cdef class Morphology: def from_disk(self, path): raise NotImplementedError + @classmethod + def create_class_map(cls): + return MorphologyClassMap(FEATURES, FIELDS) + cpdef univ_pos_t get_int_tag(pos_): return <univ_pos_t>0 @@ -324,17 +339,12 @@ cdef hash_t hash_tag(MorphAnalysisC tag) nogil: return mrmr.hash64(&tag, sizeof(tag), 0) -def get_feature_field(feature): - cdef attr_t key = get_string_id(feature) - return FEATURE_FIELDS[feature] - - cdef MorphAnalysisC create_rich_tag(features) except *: cdef MorphAnalysisC tag cdef attr_t feature memset(&tag, 0, sizeof(tag)) for feature in features: - field = get_field_id(feature) + field = FEATURE_FIELDS[FEATURE_NAMES[feature]] set_feature(&tag, field, feature, 1) return tag @@ -519,8 +529,7 @@ cdef attr_t get_field(const MorphAnalysisC* tag, int field_id) nogil: elif field == Field_VerbType: return tag.verb_type else: - raise ValueError("Unknown feature: %s (%d)" % (FEATURE_NAMES.get(feature), feature)) - + raise ValueError("Unknown field: (%d)" % field_id) cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil: @@ -1090,22 +1099,5 @@ FEATURES = [ "Voice_int", ] -FEATURE_NAMES = {get_string_id(name): name for name in FEATURES} -FEATURE_FIELDS = {feature: FIELDS[feature.split('_', 1)[0]] for feature in FEATURES} -FIELD_SIZES = Counter(FEATURE_FIELDS.values()) -for field in FIELD_SIZES: - FIELD_SIZES[field] += 1 -for feat_id, name in FEATURE_NAMES.items(): - FEATURE_FIELDS[feat_id] = FEATURE_FIELDS[name] -# Mapping of feature names to their position in total vector -FEATURE_OFFSETS = {} -# Mapping of field names to their first position in total vector. 
-FIELD_OFFSETS = {} -_seen_fields = Counter() -for i, feature in enumerate(FEATURES): - field = FEATURE_FIELDS[feature] - # Add 1 for the NIL class, on each field - FEATURE_OFFSETS[feature] = _seen_fields[field] + 1 - if _seen_fields[field] == 0: - FIELD_OFFSETS[field] = i - _seen_fields[field] += 1 +FEATURE_NAMES = {get_string_id(f): f for f in FEATURES} +FEATURE_FIELDS = {f: FIELDS[f.split('_', 1)[0]] for f in FEATURES} diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 9cb384a03..d3d850da0 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -16,26 +16,24 @@ from ..compat import basestring_ from ..tokens.doc cimport Doc from ..vocab cimport Vocab from ..morphology cimport Morphology -from ..morphology import get_field_size, get_field_offset, parse_feature, FIELDS -from ..morphology import FEATURES class Morphologizer(Pipe): name = 'morphologizer' @classmethod - def Model(cls, attr_nums=None, **cfg): + def Model(cls, **cfg): if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'): raise ValueError(TempErrors.T008) - if attr_nums is None: - attr_nums = [get_field_size(name) for name in FIELDS] - return build_morphologizer_model(attr_nums, **cfg) + class_map = Morphology.create_class_map() + return build_morphologizer_model(class_map.field_sizes, **cfg) def __init__(self, vocab, model=True, **cfg): self.vocab = vocab self.model = model self.cfg = OrderedDict(sorted(cfg.items())) self.cfg.setdefault('cnn_maxout_pieces', 2) + self._class_map = self.vocab.morphology.create_class_map() @property def labels(self): @@ -76,13 +74,13 @@ class Morphologizer(Pipe): docs = [docs] cdef Doc doc cdef Vocab vocab = self.vocab - field_names = list(FIELDS) - offsets = [get_field_offset(field) for field in field_names] + offsets = [self._class_map.get_field_offset(field) + for field in self._class_map.fields] for i, doc in enumerate(docs): doc_scores = batch_scores[i] doc_guesses = scores_to_guesses(doc_scores, self.model.softmax.out_sizes) # Convert the neuron indices into feature IDs. - doc_feat_ids = numpy.zeros((len(doc), len(field_names)), dtype='i') + doc_feat_ids = numpy.zeros((len(doc), len(self._class_map.fields)), dtype='i') for j in range(len(doc)): for k, offset in enumerate(offsets): if doc_guesses[j, k] == 0: @@ -90,7 +88,8 @@ class Morphologizer(Pipe): else: doc_feat_ids[j, k] = offset + (doc_guesses[j, k]-1) # Get the set of feature names. - feats = {FEATURES[f] for f in doc_feat_ids[j] if f != 0} + feats = {self._class_map.col2info[f][2] for f in doc_feat_ids[j] + if f != 0} # Now add the analysis, and set the hash. try: doc.c[j].morph = self.vocab.morphology.add(feats) @@ -132,14 +131,15 @@ class Morphologizer(Pipe): if features is None: target[idx] = scores[idx] else: - by_field = {} + gold_fields = {} for feature in features: - field, column = parse_feature(feature) - by_field[field] = column + field = self.get_field(feature) + column = self.get_column(feature) + gold_fields[field] = column col_offset = 0 for field, field_size in enumerate(field_sizes): - if field in by_field: - target[idx, col_offset + by_field[field]] = 1. + if field in gold_fields: + target[idx, col_offset + gold_fields[field]] = 1. else: target[idx, col_offset] = 1. 
col_offset += field_size diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx index b727e2c3f..17b11d84f 100644 --- a/spacy/tokens/morphanalysis.pyx +++ b/spacy/tokens/morphanalysis.pyx @@ -3,7 +3,6 @@ from libc.string cimport memset from ..vocab cimport Vocab from ..typedefs cimport hash_t, attr_t from ..morphology cimport list_features, check_feature, get_field, tag_to_json -from ..morphology cimport attribute_to_field from ..strings import get_string_id @@ -53,7 +52,7 @@ cdef class MorphAnalysis: return self.key def get(self, unicode field): - cdef int field_id = attribute_to_field(field) + cdef int field_id = self.vocab.morphology._feat_map.attr2field[field] return self.vocab.strings[get_field(&self.c, field_id)] def to_json(self): From 0f120824657349f6c5b580d362912eb9cc971bd8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Sat, 9 Mar 2019 22:54:59 +0000 Subject: [PATCH 092/207] Refactor morphologizer --- spacy/_ml.py | 6 ++-- spacy/morphology.pyx | 50 +++++++++++++++++++++----------- spacy/pipeline/morphologizer.pyx | 41 ++++++++++++-------------- spacy/pipeline/pipes.pyx | 2 ++ 4 files changed, 58 insertions(+), 41 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 85d80c3f1..2e4df843c 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -561,7 +561,7 @@ def build_morphologizer_model(class_nums, **cfg): token_vector_width = util.env_opt("token_vector_width", 128) pretrained_vectors = cfg.get("pretrained_vectors") char_embed = cfg.get("char_embed", True) - with Model.define_operators({">>": chain, "+": add}): + with Model.define_operators({">>": chain, "+": add, "**": clone}): if "tok2vec" in cfg: tok2vec = cfg["tok2vec"] else: @@ -571,7 +571,9 @@ def build_morphologizer_model(class_nums, **cfg): char_embed=char_embed, pretrained_vectors=pretrained_vectors, ) - softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width)) + softmax = with_flatten( + MultiSoftmax(class_nums, token_vector_width) + ) softmax.out_sizes = class_nums model = tok2vec >> softmax model.nI = None diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index fdaa44813..6c9ecebdc 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -46,8 +46,8 @@ cdef enum univ_field_t: Field_PartForm Field_PartType Field_Person - Field_Polite Field_Polarity + Field_Polite Field_Poss Field_Prefix Field_PrepCase @@ -60,8 +60,8 @@ cdef enum univ_field_t: Field_Tense Field_Typo Field_VerbForm - Field_Voice Field_VerbType + Field_Voice def _normalize_props(props): @@ -94,20 +94,36 @@ def _normalize_props(props): class MorphologyClassMap(object): - def __init__(self, features, fields): + def __init__(self, features): self.features = tuple(features) - self.fields = tuple(fields) + self.fields = [] + self.feat2field = {} + seen_fields = set() + for feature in features: + field = feature.split("_", 1)[0] + if field not in seen_fields: + self.fields.append(field) + seen_fields.add(field) + self.feat2field[feature] = FIELDS[field] self.id2feat = {get_string_id(name): name for name in features} - self.feat2field = {feature: fields[feature.split('_', 1)[0]] for feature in features} - self.field2feats = {} + self.field2feats = {"POS": []} self.col2info = [] self.attr2field = dict(LOWER_FIELDS.items()) + self.feat2offset = {} + self.field2col = {} + self.field2id = dict(FIELDS.items()) + self.fieldid2field = {field_id: field for field, field_id in FIELDS.items()} for feature in features: - field = self.feat2field[feature] - if field not in self.field2feats: - 
self.col2info.append((field, 0, 'NIL')) - self.field2feats.setdefault(field, []).append(feature) - self.col2info.append((field, len(self.field2feats[field]), feature)) + field = self.fields[self.feat2field[feature]] + if field not in self.field2col: + self.field2col[field] = len(self.col2info) + if field != "POS" and field not in self.field2feats: + self.col2info.append((field, 0, "NIL")) + self.field2feats.setdefault(field, ["NIL"]) + offset = len(self.field2feats[field]) + self.field2feats[field].append(feature) + self.col2info.append((field, offset, feature)) + self.feat2offset[feature] = offset @property def field_sizes(self): @@ -147,7 +163,7 @@ cdef class Morphology: self.lemmatizer = lemmatizer self.n_tags = len(tag_map) self.reverse_index = {} - self._feat_map = MorphologyClassMap(FEATURES, FIELDS) + self._feat_map = MorphologyClassMap(FEATURES) for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): attrs = _normalize_props(attrs) self.add({self._feat_map.id2feat[feat] for feat in attrs @@ -326,7 +342,7 @@ cdef class Morphology: @classmethod def create_class_map(cls): - return MorphologyClassMap(FEATURES, FIELDS) + return MorphologyClassMap(FEATURES) cpdef univ_pos_t get_int_tag(pos_): @@ -770,8 +786,8 @@ FIELDS = { 'Tense': Field_Tense, 'Typo': Field_Typo, 'VerbForm': Field_VerbForm, + 'VerbType': Field_VerbType, 'Voice': Field_Voice, - 'VerbType': Field_VerbType } LOWER_FIELDS = { @@ -803,8 +819,8 @@ LOWER_FIELDS = { 'part_form': Field_PartForm, 'part_type': Field_PartType, 'person': Field_Person, - 'polite': Field_Polite, 'polarity': Field_Polarity, + 'polite': Field_Polite, 'poss': Field_Poss, 'prefix': Field_Prefix, 'prep_case': Field_PrepCase, @@ -817,8 +833,8 @@ LOWER_FIELDS = { 'tense': Field_Tense, 'typo': Field_Typo, 'verb_form': Field_VerbForm, + 'verb_type': Field_VerbType, 'voice': Field_Voice, - 'verb_type': Field_VerbType } @@ -849,7 +865,7 @@ FEATURES = [ "AdpType_prep", "AdpType_post", "AdpType_voc", - "AdvType_adadj," + "AdvType_adadj", "AdvType_cau", "AdvType_deg", "AdvType_ex", diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index d3d850da0..b14e2bec7 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -86,20 +86,15 @@ class Morphologizer(Pipe): if doc_guesses[j, k] == 0: doc_feat_ids[j, k] = 0 else: - doc_feat_ids[j, k] = offset + (doc_guesses[j, k]-1) + doc_feat_ids[j, k] = offset + doc_guesses[j, k] # Get the set of feature names. - feats = {self._class_map.col2info[f][2] for f in doc_feat_ids[j] - if f != 0} + feats = {self._class_map.col2info[f][2] for f in doc_feat_ids[j]} + if "NIL" in feats: + feats.remove("NIL") # Now add the analysis, and set the hash. - try: - doc.c[j].morph = self.vocab.morphology.add(feats) - if doc[j].morph.pos != 0: - doc.c[j].pos = doc[j].morph.pos - except: - print(offsets) - print(doc_guesses[j]) - print(doc_feat_ids[j]) - raise + doc.c[j].morph = self.vocab.morphology.add(feats) + if doc[j].morph.pos != 0: + doc.c[j].pos = doc[j].morph.pos def update(self, docs, golds, drop=0., sgd=None, losses=None): if losses is not None and self.name not in losses: @@ -126,23 +121,25 @@ class Morphologizer(Pipe): # Do this on CPU, as we can't vectorize easily. 
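(A hedged sketch of what the target-building loop below does for a single token, with invented field and feature names rather than the real class map: each field contributes a one-hot block, and a field with no gold feature puts its 1 on that field's NIL column:)

import numpy

# Two hypothetical fields: "Case" (NIL + 2 values) and "Number" (NIL + 3 values).
field_sizes = [3, 4]
field_cols = {"Case": 0, "Number": 3}            # first column of each field
feat_col = {"Case_nom": 1, "Case_gen": 2,        # column offset within the field
            "Number_sing": 1, "Number_plur": 2, "Number_ptan": 3}

def one_hot_target(token_feats):
    """Build the training target row for one token."""
    row = numpy.zeros(sum(field_sizes), dtype="f")
    for field, start in field_cols.items():
        hit = [f for f in token_feats if f.startswith(field + "_")]
        if hit:
            row[start + feat_col[hit[0]]] = 1.0   # gold feature value
        else:
            row[start] = 1.0                      # NIL: field not annotated
    return row

print(one_hot_target({"Case_gen"}))   # -> [0. 0. 1. 1. 0. 0. 0.]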
target = numpy.zeros(scores.shape, dtype='f') field_sizes = self.model.softmax.out_sizes - for gold in golds: - for features in gold.morphology: + for doc, gold in zip(docs, golds): + for t, features in enumerate(gold.morphology): if features is None: target[idx] = scores[idx] else: gold_fields = {} for feature in features: - field = self.get_field(feature) - column = self.get_column(feature) - gold_fields[field] = column - col_offset = 0 - for field, field_size in enumerate(field_sizes): - if field in gold_fields: - target[idx, col_offset + gold_fields[field]] = 1. + field = self._class_map.feat2field[feature] + gold_fields[field] = self._class_map.feat2offset[feature] + for field in self._class_map.fields: + field_id = self._class_map.field2id[field] + col_offset = self._class_map.field2col[field] + if field_id in gold_fields: + target[idx, col_offset + gold_fields[field_id]] = 1. else: target[idx, col_offset] = 1. - col_offset += field_size + #print(doc[t]) + #for col, info in enumerate(self._class_map.col2info): + # print(col, info, scores[idx, col], target[idx, col]) idx += 1 target = self.model.ops.asarray(target, dtype='f') scores = self.model.ops.asarray(scores, dtype='f') diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index fa90603bc..450497f3b 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -418,6 +418,8 @@ class Tagger(Pipe): vocab.morphology.assign_tag_id(&doc.c[j], tag_id) if lemma != 0 and lemma != doc.c[j].lex.orth: doc.c[j].lemma = lemma + else: + doc.c[j].tag = self.vocab.strings[self.labels[tag_id]] idx += 1 if tensors is not None and len(tensors): if isinstance(doc.tensor, numpy.ndarray) \ From 5431c47b915d821e231d9f5d1df4041fcd57fadb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Sun, 10 Mar 2019 00:59:51 +0000 Subject: [PATCH 093/207] Refactor morphology slightly --- spacy/morphology.pyx | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 6c9ecebdc..c592ee674 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -130,13 +130,7 @@ class MorphologyClassMap(object): return [len(self.field2feats[field]) for field in self.fields] def get_field_offset(self, field): - n = 0 - for f in self.fields: - if f == field: - return n - n += len(self.field2feats[f]) - else: - return -1 + return self.field2col[field] cdef class Morphology: From 08e8267a595aea045a2c8bd0eaa9ab16ffb03e12 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Sun, 25 Aug 2019 13:50:00 +0200 Subject: [PATCH 094/207] Set version to 2.2.0.dev0 --- spacy/about.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/about.py b/spacy/about.py index 9587c9071..bd500ed6c 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -4,13 +4,13 @@ # fmt: off __title__ = "spacy" -__version__ = "2.1.8" +__version__ = "2.2.0.dev0" __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython" __uri__ = "https://spacy.io" __author__ = "Explosion AI" __email__ = "contact@explosion.ai" __license__ = "MIT" -__release__ = True +__release__ = False __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From f9075a6fd1dcb46611d5b25f6cd569ddd1bd9256 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Sun, 25 Aug 2019 13:50:47 +0200 Subject: [PATCH 095/207] 
Update to blis 0.4 and thinc 7.1 --- requirements.txt | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index a6d721e96..865288a86 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=2.0.1,<2.1.0 -thinc>=7.0.8,<7.1.0 -blis>=0.2.2,<0.3.0 +thinc>=7.1.0,<7.2.0 +blis>=0.4.0,<0.5.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.2.0,<1.1.0 srsly>=0.1.0,<1.1.0 diff --git a/setup.py b/setup.py index 1d2aa084b..783433611 100755 --- a/setup.py +++ b/setup.py @@ -247,7 +247,7 @@ def setup_package(): "cymem>=2.0.2,<2.1.0", "preshed>=2.0.1,<2.1.0", "thinc>=7.0.8,<7.1.0", - "blis>=0.2.2,<0.3.0", + "blis>=0.4.0,<0.5.0", "plac<1.0.0,>=0.9.6", "requests>=2.13.0,<3.0.0", "wasabi>=0.2.0,<1.1.0", From b8edc8dffb7fb5651973181a46ced5da5ec1a7cf Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Sun, 25 Aug 2019 14:54:09 +0200 Subject: [PATCH 096/207] Require thinc 7.1 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 783433611..1d13bfd10 100755 --- a/setup.py +++ b/setup.py @@ -246,7 +246,7 @@ def setup_package(): "murmurhash>=0.28.0,<1.1.0", "cymem>=2.0.2,<2.1.0", "preshed>=2.0.1,<2.1.0", - "thinc>=7.0.8,<7.1.0", + "thinc>=7.1.0,<7.2.0", "blis>=0.4.0,<0.5.0", "plac<1.0.0,>=0.9.6", "requests>=2.13.0,<3.0.0", From 7bc68913e38c54c49d867df8df57738987e32769 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Sun, 25 Aug 2019 14:54:19 +0200 Subject: [PATCH 097/207] Improve pex building in Makefile --- Makefile | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 2834096b7..0f5c31ca6 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,17 @@ SHELL := /bin/bash sha = $(shell "git" "rev-parse" "--short" "HEAD") +version = $(shell "bin/get-version.sh") +wheel = spacy-$(version)-cp36-cp36m-linux_x86_64.whl -dist/spacy.pex : spacy/*.py* spacy/*/*.py* +dist/spacy.pex : dist/spacy-$(sha).pex + cp dist/spacy-$(sha).pex dist/spacy.pex + chmod a+rx dist/spacy.pex + +dist/spacy-$(sha).pex : dist/$(wheel) + env3.6/bin/python -m pip install pex==1.5.3 + env3.6/bin/pex pytest dist/$(wheel) -e spacy -o dist/spacy-$(sha).pex + +dist/$(wheel) : setup.py spacy/*.py* spacy/*/*.py* python3.6 -m venv env3.6 source env3.6/bin/activate env3.6/bin/pip install wheel @@ -9,10 +19,6 @@ dist/spacy.pex : spacy/*.py* spacy/*/*.py* env3.6/bin/python setup.py build_ext --inplace env3.6/bin/python setup.py sdist env3.6/bin/python setup.py bdist_wheel - env3.6/bin/python -m pip install pex==1.5.3 - env3.6/bin/pex pytest dist/*.whl -e spacy -o dist/spacy-$(sha).pex - cp dist/spacy-$(sha).pex dist/spacy.pex - chmod a+rx dist/spacy.pex .PHONY : clean From 9b5c94fed9fecfb9fd078a9a4616ca1972b0a405 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Sun, 25 Aug 2019 15:12:36 +0200 Subject: [PATCH 098/207] Add get-version script --- bin/get-version.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100755 bin/get-version.sh diff --git a/bin/get-version.sh b/bin/get-version.sh new file mode 100755 index 000000000..5a12ddd7a --- /dev/null +++ b/bin/get-version.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +set -e + +version=$(grep "__version__ = " spacy/about.py) +version=${version/__version__ = } +version=${version/\'/} +version=${version/\'/} +version=${version/\"/} +version=${version/\"/} + +echo $version From 
22250cf6b7cd3a92290f54256078606c1e5db5e5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Sun, 25 Aug 2019 21:54:26 +0200 Subject: [PATCH 099/207] Make regression test less sensitive to tag-map stuff --- spacy/tests/regression/test_issue3001-3500.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index 3b0c2f1ed..c430678d3 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -30,20 +30,20 @@ def test_issue3002(): def test_issue3009(en_vocab): """Test problem with matcher quantifiers""" patterns = [ - [{"LEMMA": "have"}, {"LOWER": "to"}, {"LOWER": "do"}, {"POS": "ADP"}], + [{"LEMMA": "have"}, {"LOWER": "to"}, {"LOWER": "do"}, {"TAG": "IN"}], [ {"LEMMA": "have"}, {"IS_ASCII": True, "IS_PUNCT": False, "OP": "*"}, {"LOWER": "to"}, {"LOWER": "do"}, - {"POS": "ADP"}, + {"TAG": "IN"}, ], [ {"LEMMA": "have"}, {"IS_ASCII": True, "IS_PUNCT": False, "OP": "?"}, {"LOWER": "to"}, {"LOWER": "do"}, - {"POS": "ADP"}, + {"TAG": "IN"}, ], ] words = ["also", "has", "to", "do", "with"] From 095c63c6b8e3fc8d1da2c914a996ddecba19864f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Sun, 25 Aug 2019 21:56:47 +0200 Subject: [PATCH 100/207] Avoid making prepositions get the tag SCONJ --- spacy/lang/en/morph_rules.py | 48 +++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/spacy/lang/en/morph_rules.py b/spacy/lang/en/morph_rules.py index 198182ff0..b00534cc5 100644 --- a/spacy/lang/en/morph_rules.py +++ b/spacy/lang/en/morph_rules.py @@ -3,55 +3,59 @@ from __future__ import unicode_literals from ...symbols import LEMMA, PRON_LEMMA +# Several entries here look pretty suspicious. These will get the POS SCONJ +# given the tag IN, when an adpositional reading seems much more likely for +# a lot of these prepositions. I'm not sure what I was running in 04395ffa4 +# when I did this? It doesn't seem right. 
_subordinating_conjunctions = [ "that", "if", "as", "because", - "of", - "for", - "before", - "in", + #"of", + #"for", + #"before", + #"in", "while", - "after", + #"after", "since", "like", - "with", + #"with", "so", - "to", - "by", - "on", - "about", + #"to", + #"by", + #"on", + #"about", "than", "whether", "although", - "from", + #"from", "though", - "until", + #"until", "unless", "once", - "without", - "at", - "into", + #"without", + #"at", + #"into", "cause", - "over", + #"over", "upon", "till", "whereas", - "beyond", + #"beyond", "whilst", "except", "despite", "wether", - "then", + #"then", "but", "becuse", "whie", - "below", - "against", + #"below", + #"against", "it", "w/out", - "toward", + #"toward", "albeit", "save", "besides", @@ -63,7 +67,7 @@ _subordinating_conjunctions = [ "out", "near", "seince", - "towards", + #"towards", "tho", "sice", "will", From 188a1cf297860197dc0253165d0969c7520acf66 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Sun, 25 Aug 2019 21:57:02 +0200 Subject: [PATCH 101/207] Fix morphology for | features --- spacy/morphology.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 2fd81bf8e..ccfb214bc 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -71,6 +71,10 @@ def _normalize_props(props): for key in FIELDS: if key in props: value = str(props[key]).lower() + # We don't have support for disjunctive int|rel features, so + # just take the first one :( + if "|" in value: + value = value.split("|")[0] attr = '%s_%s' % (key, value) if attr in FEATURES: props.pop(key) From 71c0321ecf03ecbd0566295980472f7f70b037bc Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Sun, 25 Aug 2019 22:03:37 +0200 Subject: [PATCH 102/207] Fix test --- spacy/tests/test_displacy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index 5e99d261a..2d1f1bd8f 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -32,7 +32,7 @@ def test_displacy_parse_deps(en_vocab): assert isinstance(deps, dict) assert deps["words"] == [ {"text": "This", "tag": "DET"}, - {"text": "is", "tag": "VERB"}, + {"text": "is", "tag": "AUX"}, {"text": "a", "tag": "DET"}, {"text": "sentence", "tag": "NOUN"}, ] From af7fad2c6d75ea3ce755058c1acaee3a827d77d2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Sun, 25 Aug 2019 22:05:47 +0200 Subject: [PATCH 103/207] Set version to v2.2.0.dev1 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index bd500ed6c..6a9784c54 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -4,7 +4,7 @@ # fmt: off __title__ = "spacy" -__version__ = "2.2.0.dev0" +__version__ = "2.2.0.dev1" __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython" __uri__ = "https://spacy.io" __author__ = "Explosion AI" From aae05ff16bcf12e4f60bb5936c3bf5728a96b78c Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Wed, 28 Aug 2019 09:14:20 +0200 Subject: [PATCH 104/207] Add train_docs() option to add orth variants Filtering by orth and tag, create variants of training docs with alternate orth variants, e.g., unicode quotes, dashes, and ellipses. The variants can be single tokens (dashes) or paired tokens (quotes) with left and right versions. 
Currently restricted to only add variants to training documents without raw text provided, where only gold.words needs to be modified. --- spacy/gold.pyx | 61 +++++++++++++++++++++++++++++++++++++++++------ spacy/language.py | 2 ++ 2 files changed, 56 insertions(+), 7 deletions(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index f6ec8d3fa..1cd49814a 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -7,6 +7,7 @@ import random import numpy import tempfile import shutil +import itertools from pathlib import Path import srsly @@ -206,13 +207,14 @@ class GoldCorpus(object): return n def train_docs(self, nlp, gold_preproc=False, max_length=None, - noise_level=0.0): + noise_level=0.0, orth_variant_level=0.0): locs = list((self.tmp_dir / 'train').iterdir()) random.shuffle(locs) train_tuples = self.read_tuples(locs, limit=self.limit) gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc, max_length=max_length, noise_level=noise_level, + orth_variant_level=orth_variant_level, make_projective=True) yield from gold_docs @@ -226,27 +228,31 @@ class GoldCorpus(object): @classmethod def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None, - noise_level=0.0, make_projective=False): + noise_level=0.0, orth_variant_level=0.0, make_projective=False): for raw_text, paragraph_tuples in tuples: if gold_preproc: raw_text = None else: paragraph_tuples = merge_sents(paragraph_tuples) - docs = cls._make_docs(nlp, raw_text, paragraph_tuples, gold_preproc, - noise_level=noise_level) + docs, paragraph_tuples = cls._make_docs(nlp, raw_text, + paragraph_tuples, gold_preproc, noise_level=noise_level, + orth_variant_level=orth_variant_level) golds = cls._make_golds(docs, paragraph_tuples, make_projective) for doc, gold in zip(docs, golds): if (not max_length) or len(doc) < max_length: yield doc, gold @classmethod - def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=0.0): + def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=0.0, orth_variant_level=0.0): if raw_text is not None: raw_text = add_noise(raw_text, noise_level) - return [nlp.make_doc(raw_text)] + return [nlp.make_doc(raw_text)], paragraph_tuples else: + docs = [] + raw_text, paragraph_tuples = make_orth_variants(nlp, None, paragraph_tuples, orth_variant_level) return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level)) - for (sent_tuples, brackets) in paragraph_tuples] + for (sent_tuples, brackets) in paragraph_tuples], paragraph_tuples + @classmethod def _make_golds(cls, docs, paragraph_tuples, make_projective): @@ -263,6 +269,47 @@ class GoldCorpus(object): in zip(docs, paragraph_tuples)] +def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0): + if random.random() >= orth_variant_level: + return raw, paragraph_tuples + variant_paragraph_tuples = [] + for sent_tuples, brackets in paragraph_tuples: + ids, words, tags, heads, labels, ner = sent_tuples + # single variants + ndsv = nlp.Defaults.single_orth_variants + punct_choices = [random.choice(x["variants"]) for x in ndsv] + for word_idx in range(len(words)): + for punct_idx in range(len(ndsv)): + if tags[word_idx] in ndsv[punct_idx]["tags"] \ + and words[word_idx] in ndsv[punct_idx]["variants"]: + words[word_idx] = punct_choices[punct_idx] + # paired variants + ndpv = nlp.Defaults.paired_orth_variants + punct_choices = [random.choice(x["variants"]) for x in ndpv] + for word_idx in range(len(words)): + for punct_idx in range(len(ndpv)): + if tags[word_idx] in ndpv[punct_idx]["tags"] \ + and 
words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]): + # backup option: random left vs. right from pair + pair_idx = random.choice([0, 1]) + # best option: rely on paired POS tags like `` / '' + if len(ndpv[punct_idx]["tags"]) == 2: + pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx]) + # next best option: rely on position in variants + # (may not be unambiguous, so order of variants matters) + else: + for pair in ndpv[punct_idx]["variants"]: + if words[word_idx] in pair: + pair_idx = pair.index(words[word_idx]) + words[word_idx] = punct_choices[punct_idx][pair_idx] + + variant_paragraph_tuples.append(((ids, words, tags, heads, labels, ner), brackets)) + if raw is not None: + # TODO: modify raw text accordingly + return raw, paragraph_tuples + return raw, variant_paragraph_tuples + + def add_noise(orig, noise_level): if random.random() >= noise_level: return orig diff --git a/spacy/language.py b/spacy/language.py index 86acf0257..0cf3528a2 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -108,6 +108,8 @@ class BaseDefaults(object): syntax_iterators = {} resources = {} writing_system = {"direction": "ltr", "has_case": True, "has_letters": True} + single_orth_variants = [] + paired_orth_variants = [] class Language(object): From 56c38484a1a3ca1625a455985a3272057823abbd Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Wed, 28 Aug 2019 09:16:40 +0200 Subject: [PATCH 105/207] Single and paired orth variants for English --- spacy/lang/en/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index 7d00c749c..2f391de0b 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -38,6 +38,10 @@ class EnglishDefaults(Language.Defaults): "lemma_index": "lemmatizer/lemma_index.json", "lemma_exc": "lemmatizer/lemma_exc.json", } + single_orth_variants = [{"tags": ["NFP"], "variants": ["…", "..."]}, + {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]}] + paired_orth_variants = [{"tags": ["``", "''"], "variants": [("'", "'"), ("‘", "’")]}, + {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]}] class English(Language): From 47af3f676e54138bd83a971be0679f3cf93ff9a7 Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Wed, 28 Aug 2019 09:16:54 +0200 Subject: [PATCH 106/207] Single and paired orth variants for German --- spacy/lang/de/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py index 1b5aee6a8..ae972072f 100644 --- a/spacy/lang/de/__init__.py +++ b/spacy/lang/de/__init__.py @@ -27,6 +27,10 @@ class GermanDefaults(Language.Defaults): stop_words = STOP_WORDS syntax_iterators = SYNTAX_ITERATORS resources = {"lemma_lookup": "lemma_lookup.json"} + single_orth_variants = [{"tags": ["$("], "variants": ["…", "..."]}, + {"tags": ["$("], "variants": ["-", "—", "–", "--", "---", "——"]}] + paired_orth_variants = [{"tags": ["$("], "variants": [("'", "'"), (",", "'"), ("‚", "‘")]}, + {"tags": ["$("], "variants": [("``", "''"), ('"', '"'), ("„", "“")]}] class German(Language): From 0a26e94d02d0417c512d6d151c81b138f1a22484 Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Wed, 28 Aug 2019 13:38:54 +0200 Subject: [PATCH 107/207] Modify raw to match orth variant annotation tuples If raw is available, attempt to modify raw to match the orth variants. If raw/words can't be aligned, abort and return unmodified raw/annotation. 
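For illustration, a minimal sketch (not part of this patch) of the alignment walk described above: whitespace is copied through, a gold word found verbatim at the current offset is copied as-is, otherwise a known variant string is consumed and the already-swapped gold word is written in its place, and any other mismatch aborts back to the unmodified text. The raw string, word list, and variant list here are invented, and the real implementation additionally tries longer variants before shorter ones:

    raw = 'He said "yes" .'
    words = ["He", "said", "“", "yes", "”", "."]  # annotation after the variant swap
    variants = ["“", "”", '"']                    # strings treated as interchangeable

    out, idx = "", 0
    for word in words:
        while idx < len(raw) and raw[idx].isspace():
            out += raw[idx]
            idx += 1
        if raw[idx:].startswith(word):
            out += word
            idx += len(word)
        else:
            match = next((v for v in variants if raw[idx:].startswith(v)), None)
            if match is None:
                out = raw  # raw and words can't be aligned: keep the original
                break
            out += word
            idx += len(match)

    print(out)  # 'He said “yes” .'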
--- spacy/gold.pyx | 56 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 7 deletions(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 1cd49814a..1028d831d 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -245,11 +245,12 @@ class GoldCorpus(object): @classmethod def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=0.0, orth_variant_level=0.0): if raw_text is not None: + raw_text, paragraph_tuples = make_orth_variants(nlp, raw_text, paragraph_tuples, orth_variant_level=orth_variant_level) raw_text = add_noise(raw_text, noise_level) return [nlp.make_doc(raw_text)], paragraph_tuples else: docs = [] - raw_text, paragraph_tuples = make_orth_variants(nlp, None, paragraph_tuples, orth_variant_level) + raw_text, paragraph_tuples = make_orth_variants(nlp, None, paragraph_tuples, orth_variant_level=orth_variant_level) return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level)) for (sent_tuples, brackets) in paragraph_tuples], paragraph_tuples @@ -272,11 +273,13 @@ class GoldCorpus(object): def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0): if random.random() >= orth_variant_level: return raw, paragraph_tuples + ndsv = nlp.Defaults.single_orth_variants + ndpv = nlp.Defaults.paired_orth_variants + # modify words in paragraph_tuples variant_paragraph_tuples = [] for sent_tuples, brackets in paragraph_tuples: ids, words, tags, heads, labels, ner = sent_tuples # single variants - ndsv = nlp.Defaults.single_orth_variants punct_choices = [random.choice(x["variants"]) for x in ndsv] for word_idx in range(len(words)): for punct_idx in range(len(ndsv)): @@ -284,7 +287,6 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0): and words[word_idx] in ndsv[punct_idx]["variants"]: words[word_idx] = punct_choices[punct_idx] # paired variants - ndpv = nlp.Defaults.paired_orth_variants punct_choices = [random.choice(x["variants"]) for x in ndpv] for word_idx in range(len(words)): for punct_idx in range(len(ndpv)): @@ -304,10 +306,50 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0): words[word_idx] = punct_choices[punct_idx][pair_idx] variant_paragraph_tuples.append(((ids, words, tags, heads, labels, ner), brackets)) - if raw is not None: - # TODO: modify raw text accordingly - return raw, paragraph_tuples - return raw, variant_paragraph_tuples + # modify raw to match variant_paragraph_tuples + if raw is not None: + variants = [] + for single_variants in ndsv: + variants.extend(single_variants["variants"]) + for paired_variants in ndpv: + variants.extend(list(itertools.chain.from_iterable(paired_variants["variants"]))) + # store variants in reverse length order to be able to prioritize + # longer matches (e.g., "---" before "--") + variants = sorted(variants, key=lambda x: len(x)) + variants.reverse() + variant_raw = "" + raw_idx = 0 + # add initial whitespace + while raw_idx < len(raw) and re.match("\s", raw[raw_idx]): + variant_raw += raw[raw_idx] + raw_idx += 1 + for sent_tuples, brackets in variant_paragraph_tuples: + ids, words, tags, heads, labels, ner = sent_tuples + for word in words: + match_found = False + # add identical word + if word not in variants and raw[raw_idx:].startswith(word): + variant_raw += word + raw_idx += len(word) + match_found = True + # add variant word + else: + for variant in variants: + if not match_found and \ + raw[raw_idx:].startswith(variant): + raw_idx += len(variant) + variant_raw += word + match_found = True + # something went 
wrong, abort + # (add a warning message?) + if not match_found: + return raw, paragraph_tuples + # add following whitespace + while raw_idx < len(raw) and re.match("\s", raw[raw_idx]): + variant_raw += raw[raw_idx] + raw_idx += 1 + return variant_raw, variant_paragraph_tuples + return raw, variant_paragraph_tuples def add_noise(orig, noise_level): From 782056d11722a5b3eb352cfdf29a4b5deabd49a8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Wed, 28 Aug 2019 16:59:45 +0200 Subject: [PATCH 108/207] Fix morph rules --- spacy/lang/en/morph_rules.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/lang/en/morph_rules.py b/spacy/lang/en/morph_rules.py index b00534cc5..f910e42b8 100644 --- a/spacy/lang/en/morph_rules.py +++ b/spacy/lang/en/morph_rules.py @@ -73,10 +73,11 @@ _subordinating_conjunctions = [ "will", ] -_relative_pronouns = ["this", "that", "those", "these"] +# This seems kind of wrong too? +#_relative_pronouns = ["this", "that", "those", "these"] MORPH_RULES = { - "DT": {word: {"POS": "PRON"} for word in _relative_pronouns}, + #"DT": {word: {"POS": "PRON"} for word in _relative_pronouns}, "IN": {word: {"POS": "SCONJ"} for word in _subordinating_conjunctions}, "NN": { "something": {"POS": "PRON"}, From bc5ce498593a4f583225ddbf672d3703f1865928 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Wed, 28 Aug 2019 17:55:38 +0200 Subject: [PATCH 109/207] Fix 'noise_level' in train cmd --- spacy/cli/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index c4355f1a1..04e734068 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -240,7 +240,7 @@ def train( best_score = 0.0 for i in range(n_iter): train_docs = corpus.train_docs( - nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0 + nlp, orth_variant_level=noise_level, gold_preproc=gold_preproc, max_length=0 ) if raw_text: random.shuffle(raw_text) From 7d6d4385663222b0f517d859db6be330c747132a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Wed, 28 Aug 2019 18:30:43 +0200 Subject: [PATCH 110/207] Set version to v2.2.0.dev2 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 6a9784c54..2ea1c5c9d 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -4,7 +4,7 @@ # fmt: off __title__ = "spacy" -__version__ = "2.2.0.dev1" +__version__ = "2.2.0.dev2" __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython" __uri__ = "https://spacy.io" __author__ = "Explosion AI" From f3906950d3f17681bff3b1bede1b81d8ebfb1bec Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Thu, 29 Aug 2019 09:10:35 +0200 Subject: [PATCH 111/207] Add separate noise vs orth level to train CLI --- spacy/cli/train.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 04e734068..46f4b8900 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -65,6 +65,7 @@ from .. import about str, ), noise_level=("Amount of corruption for data augmentation", "option", "nl", float), + orth_variant_level=("Amount of orthography variation for data augmentation", "option", "ovl", float), eval_beam_widths=("Beam widths to evaluate, e.g. 
4,8", "option", "bw", str), gold_preproc=("Use gold preprocessing", "flag", "G", bool), learn_tokens=("Make parser learn gold-standard tokenization", "flag", "T", bool), @@ -90,6 +91,7 @@ def train( parser_multitasks="", entity_multitasks="", noise_level=0.0, + orth_variant_level=0.0, eval_beam_widths="", gold_preproc=False, learn_tokens=False, @@ -240,7 +242,7 @@ def train( best_score = 0.0 for i in range(n_iter): train_docs = corpus.train_docs( - nlp, orth_variant_level=noise_level, gold_preproc=gold_preproc, max_length=0 + nlp, noise_level=noise_level, orth_variant_level=orth_variant_level, gold_preproc=gold_preproc, max_length=0 ) if raw_text: random.shuffle(raw_text) From 6511e1d8d328e5e50dc3bf103a8c12434468fd57 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 29 Aug 2019 14:33:07 +0200 Subject: [PATCH 112/207] Fix NER gold-standard around whitespace --- spacy/gold.pyx | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 6d784d1bd..b8ae2e505 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -635,7 +635,7 @@ cdef class GoldParse: self.tags[i] = "_SP" self.heads[i] = None self.labels[i] = None - self.ner[i] = "O" + self.ner[i] = None self.morphology[i] = set() if gold_i is None: if i in i2j_multi: @@ -686,9 +686,20 @@ cdef class GoldParse: self.labels[i] = deps[gold_i] self.ner[i] = entities[gold_i] + # Prevent whitespace that isn't within entities from being tagged as + # an entity. + for i in range(len(self.ner)): + if self.tags[i] == "_SP": + prev_ner = self.ner[i-1] if i >= 1 else None + next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None + if prev_ner == "O" or next_ner == "O": + self.ner[i] = "O" + cycle = nonproj.contains_cycle(self.heads) if cycle is not None: - raise ValueError(Errors.E069.format(cycle=cycle, cycle_tokens=" ".join(["'{}'".format(self.words[tok_id]) for tok_id in cycle]), doc_tokens=" ".join(words[:50]))) + raise ValueError(Errors.E069.format(cycle=cycle, + cycle_tokens=" ".join(["'{}'".format(self.words[tok_id]) for tok_id in cycle]), + doc_tokens=" ".join(words[:50]))) def __len__(self): """Get the number of gold-standard tokens. 
From 3c1c0ec18ec702afbe03e24519cdb0c3a513c945 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 29 Aug 2019 14:33:39 +0200 Subject: [PATCH 113/207] Add tests for NER oracle with whitespace --- spacy/tests/parser/test_ner.py | 66 ++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 43c00a963..c39491ecf 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -91,3 +91,69 @@ def test_doc_add_entities_set_ents_iob(en_vocab): assert [w.ent_iob_ for w in doc] == ["", "", "", "B"] doc.ents = [(doc.vocab.strings["WORD"], 0, 2)] assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""] + + +def test_oracle_moves_missing_B(en_vocab): + words = ["B", "52", "Bomber"] + biluo_tags = [None, None, "L-PRODUCT"] + + doc = Doc(en_vocab, words=words) + gold = GoldParse(doc, words=words, entities=biluo_tags) + + moves = BiluoPushDown(en_vocab.strings) + move_types = ("M", "B", "I", "L", "U", "O") + for tag in biluo_tags: + if tag is None: + continue + elif tag == "O": + moves.add_action(move_types.index("O"), "") + else: + action, label = tag.split("-") + moves.add_action(move_types.index("B"), label) + moves.add_action(move_types.index("I"), label) + moves.add_action(move_types.index("L"), label) + moves.add_action(move_types.index("U"), label) + moves.preprocess_gold(gold) + seq = moves.get_oracle_sequence(doc, gold) + print(seq) + + +def test_oracle_moves_whitespace(en_vocab): + words = [ + "production", + "\n", + "of", + "Northrop", + "\n", + "Corp.", + "\n", + "'s", + "radar", + ] + biluo_tags = [ + "O", + "O", + "O", + "B-ORG", + None, + "I-ORG", + "L-ORG", + "O", + "O", + ] + + doc = Doc(en_vocab, words=words) + gold = GoldParse(doc, words=words, entities=biluo_tags) + + moves = BiluoPushDown(en_vocab.strings) + move_types = ("M", "B", "I", "L", "U", "O") + for tag in biluo_tags: + if tag is None: + continue + elif tag == "O": + moves.add_action(move_types.index("O"), "") + else: + action, label = tag.split("-") + moves.add_action(move_types.index(action), label) + moves.preprocess_gold(gold) + seq = moves.get_oracle_sequence(doc, gold) From 32842a3cd45850201e067bc5d212e169aa14c045 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 29 Aug 2019 15:01:58 +0200 Subject: [PATCH 114/207] Disable whitespace corruption --- spacy/gold.pyx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index b8ae2e505..b40bdb8a8 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -356,19 +356,19 @@ def add_noise(orig, noise_level): if random.random() >= noise_level: return orig elif type(orig) == list: - corrupted = [_corrupt(word, noise_level) for word in orig] + corrupted = [_corrupt(word, noise_level, replace_space=False) for word in orig] corrupted = [w for w in corrupted if w] return corrupted else: - return "".join(_corrupt(c, noise_level) for c in orig) + return "".join(_corrupt(c, noise_level, replace_space=False) for c in orig) -def _corrupt(c, noise_level): +def _corrupt(c, noise_level, replace_space=False): if random.random() >= noise_level: return c - elif c == " ": + elif replace_space and c == " ": return "\n" - elif c == "\n": + elif replace_space and c == "\n": return " " elif c in [".", "'", "!", "?", ","]: return "" From c94fc9edb9baf53b0e4a2f63bd561528045f6a4d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 29 Aug 2019 15:39:32 
+0200 Subject: [PATCH 115/207] Fix noise addition --- spacy/gold.pyx | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index b40bdb8a8..af0588349 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -356,22 +356,18 @@ def add_noise(orig, noise_level): if random.random() >= noise_level: return orig elif type(orig) == list: - corrupted = [_corrupt(word, noise_level, replace_space=False) for word in orig] + corrupted = [_corrupt(word, noise_level) for word in orig] corrupted = [w for w in corrupted if w] return corrupted else: - return "".join(_corrupt(c, noise_level, replace_space=False) for c in orig) + return "".join(_corrupt(c, noise_level) for c in orig) -def _corrupt(c, noise_level, replace_space=False): +def _corrupt(c, noise_level): if random.random() >= noise_level: return c - elif replace_space and c == " ": - return "\n" - elif replace_space and c == "\n": - return " " elif c in [".", "'", "!", "?", ","]: - return "" + return "\n" else: return c.lower() From fc0a3c8c3877694c19ec5c4c5bab969e7ae2c93b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 29 Aug 2019 21:17:34 +0200 Subject: [PATCH 116/207] Add morphology serialization --- spacy/morphology.pyx | 45 ++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index ccfb214bc..a7a1bee57 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -15,6 +15,7 @@ from .parts_of_speech cimport SPACE from .parts_of_speech import IDS as POS_IDS from .lexeme cimport Lexeme from .errors import Errors +from .util import ensure_path cdef enum univ_field_t: @@ -162,12 +163,7 @@ cdef class Morphology: self.n_tags = len(tag_map) self.reverse_index = {} self._feat_map = MorphologyClassMap(FEATURES) - for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): - attrs = _normalize_props(attrs) - self.add({self._feat_map.id2feat[feat] for feat in attrs - if feat in self._feat_map.id2feat}) - self.tag_map[tag_str] = dict(attrs) - self.reverse_index[self.strings.add(tag_str)] = i + self._load_from_tag_map(tag_map) self._cache = PreshMapArray(self.n_tags) self.exc = {} @@ -177,6 +173,14 @@ cdef class Morphology: self.add_special_case( self.strings.as_string(tag), self.strings.as_string(orth), attrs) + def _load_from_tag_map(self, tag_map): + for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): + attrs = _normalize_props(attrs) + self.add({self._feat_map.id2feat[feat] for feat in attrs + if feat in self._feat_map.id2feat}) + self.tag_map[tag_str] = dict(attrs) + self.reverse_index[self.strings.add(tag_str)] = i + def __reduce__(self): return (Morphology, (self.strings, self.tag_map, self.lemmatizer, self.exc), None, None) @@ -188,6 +192,7 @@ cdef class Morphology: for f in features: if isinstance(f, basestring_): self.strings.add(f) + string_features = features features = intify_features(features) cdef attr_t feature for feature in features: @@ -321,22 +326,34 @@ cdef class Morphology: for form_str, attrs in entries.items(): self.add_special_case(tag_str, form_str, attrs) - def to_bytes(self): - json_tags = [] + def to_bytes(self, exclude=tuple(), **kwargs): + tag_map = {} for key in self.tags: tag_ptr = <MorphAnalysisC*>self.tags.get(key) if tag_ptr != NULL: - json_tags.append(tag_to_json(tag_ptr)) - return srsly.json_dumps(json_tags) + tag_map[key] = tag_to_json(tag_ptr) + exceptions = {} + for (tag_str, orth_int), attrs in 
sorted(self.exc.items()): + exceptions.setdefault(tag_str, {}) + exceptions[tag_str][self.strings[orth_int]] = attrs + data = {"tag_map": tag_map, "exceptions": exceptions} + return srsly.msgpack_dumps(data) def from_bytes(self, byte_string): - raise NotImplementedError + msg = srsly.msgpack_loads(byte_string) + self._load_from_tag_map(msg["tag_map"]) + self.load_morph_exceptions(msg["exceptions"]) + return self - def to_disk(self, path): - raise NotImplementedError + def to_disk(self, path, exclude=tuple(), **kwargs): + path = ensure_path(path) + with path.open("wb") as file_: + file_.write(self.to_bytes()) def from_disk(self, path): - raise NotImplementedError + with path.open("rb") as file_: + byte_string = file_.read() + return self.from_bytes(byte_string) @classmethod def create_class_map(cls): From f3c3ce7f1ec5fec87cb5965efe1c937f4666afea Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 29 Aug 2019 21:19:54 +0200 Subject: [PATCH 117/207] Update vocab --- spacy/vocab.pyx | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 02d5cbcff..b649fdded 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -433,6 +433,8 @@ cdef class Vocab: file_.write(self.lexemes_to_bytes()) if "vectors" not in "exclude" and self.vectors is not None: self.vectors.to_disk(path) + if "morphology" not in exclude: + self.morphology.to_disk(path / "morphology.bin") def from_disk(self, path, exclude=tuple(), **kwargs): """Loads state from a directory. Modifies the object in place and @@ -457,6 +459,8 @@ cdef class Vocab: self.vectors.from_disk(path, exclude=["strings"]) if self.vectors.name is not None: link_vectors_to_models(self) + if "morphology" not in exclude: + self.morphology.from_disk(path / "morphology.bin") return self def to_bytes(self, exclude=tuple(), **kwargs): @@ -476,7 +480,8 @@ cdef class Vocab: getters = OrderedDict(( ("strings", lambda: self.strings.to_bytes()), ("lexemes", lambda: self.lexemes_to_bytes()), - ("vectors", deserialize_vectors) + ("vectors", deserialize_vectors), + ("morphology", lambda: self.morphology.to_bytes()) )) exclude = util.get_serialization_exclude(getters, exclude, kwargs) return util.to_bytes(getters, exclude) @@ -499,7 +504,8 @@ cdef class Vocab: setters = OrderedDict(( ("strings", lambda b: self.strings.from_bytes(b)), ("lexemes", lambda b: self.lexemes_from_bytes(b)), - ("vectors", lambda b: serialize_vectors(b)) + ("vectors", lambda b: serialize_vectors(b)), + ("morphology", lambda b: self.morphology.from_bytes(b)) )) exclude = util.get_serialization_exclude(setters, exclude, kwargs) util.from_bytes(bytes_data, setters, exclude) From 02babf931793f4e2d372a6c89ef0ba3df9573140 Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Fri, 30 Aug 2019 11:29:19 +0200 Subject: [PATCH 118/207] English tag map without unsupported features/values --- spacy/lang/en/tag_map.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/spacy/lang/en/tag_map.py b/spacy/lang/en/tag_map.py index 5c9a97786..9bd884a3a 100644 --- a/spacy/lang/en/tag_map.py +++ b/spacy/lang/en/tag_map.py @@ -14,8 +14,8 @@ TAG_MAP = { '""': {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"}, "''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"}, ":": {POS: PUNCT}, - "$": {POS: SYM, "Other": {"SymType": "currency"}}, - "#": {POS: SYM, "Other": {"SymType": "numbersign"}}, + "$": {POS: SYM}, + "#": {POS: SYM}, "AFX": {POS: ADJ, "Hyph": "yes"}, "CC": {POS: CCONJ, 
"ConjType": "comp"}, "CD": {POS: NUM, "NumType": "card"}, @@ -34,7 +34,7 @@ TAG_MAP = { "NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"}, "NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"}, "NNS": {POS: NOUN, "Number": "plur"}, - "PDT": {POS: DET, "AdjType": "pdt", "PronType": "prn"}, + "PDT": {POS: DET}, "POS": {POS: PART, "Poss": "yes"}, "PRP": {POS: PRON, "PronType": "prs"}, "PRP$": {POS: PRON, "PronType": "prs", "Poss": "yes"}, @@ -58,10 +58,10 @@ TAG_MAP = { "Number": "sing", "Person": "three", }, - "WDT": {POS: PRON, "PronType": "int|rel"}, - "WP": {POS: PRON, "PronType": "int|rel"}, - "WP$": {POS: PRON, "Poss": "yes", "PronType": "int|rel"}, - "WRB": {POS: ADV, "PronType": "int|rel"}, + "WDT": {POS: PRON}, + "WP": {POS: PRON}, + "WP$": {POS: PRON, "Poss": "yes"}, + "WRB": {POS: ADV}, "ADD": {POS: X}, "NFP": {POS: PUNCT}, "GW": {POS: X}, From 893f11a9e38d4a29c608602e61798a29d4800d99 Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Fri, 30 Aug 2019 11:30:03 +0200 Subject: [PATCH 119/207] Serialize tag_map directly Fix Aspect_prof typo --- spacy/morphology.pyx | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index a7a1bee57..f706fec7f 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -327,16 +327,11 @@ cdef class Morphology: self.add_special_case(tag_str, form_str, attrs) def to_bytes(self, exclude=tuple(), **kwargs): - tag_map = {} - for key in self.tags: - tag_ptr = <MorphAnalysisC*>self.tags.get(key) - if tag_ptr != NULL: - tag_map[key] = tag_to_json(tag_ptr) exceptions = {} for (tag_str, orth_int), attrs in sorted(self.exc.items()): exceptions.setdefault(tag_str, {}) exceptions[tag_str][self.strings[orth_int]] = attrs - data = {"tag_map": tag_map, "exceptions": exceptions} + data = {"tag_map": self.tag_map, "exceptions": exceptions} return srsly.msgpack_dumps(data) def from_bytes(self, byte_string): @@ -898,7 +893,7 @@ FEATURES = [ "Aspect_mod", "Aspect_none", "Aspect_perf", - "Aspect_prof", + "Aspect_prog", "Aspect_prosp", "Case_abe", "Case_abl", From 67c3d039055fd9a82bca67eda99c77284593f12f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Fri, 30 Aug 2019 13:13:07 +0200 Subject: [PATCH 120/207] Revert morphology serialisation --- spacy/morphology.pyx | 24 ------------------------ spacy/vocab.pyx | 6 ------ 2 files changed, 30 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index f706fec7f..2d58b8f27 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -326,30 +326,6 @@ cdef class Morphology: for form_str, attrs in entries.items(): self.add_special_case(tag_str, form_str, attrs) - def to_bytes(self, exclude=tuple(), **kwargs): - exceptions = {} - for (tag_str, orth_int), attrs in sorted(self.exc.items()): - exceptions.setdefault(tag_str, {}) - exceptions[tag_str][self.strings[orth_int]] = attrs - data = {"tag_map": self.tag_map, "exceptions": exceptions} - return srsly.msgpack_dumps(data) - - def from_bytes(self, byte_string): - msg = srsly.msgpack_loads(byte_string) - self._load_from_tag_map(msg["tag_map"]) - self.load_morph_exceptions(msg["exceptions"]) - return self - - def to_disk(self, path, exclude=tuple(), **kwargs): - path = ensure_path(path) - with path.open("wb") as file_: - file_.write(self.to_bytes()) - - def from_disk(self, path): - with path.open("rb") as file_: - byte_string = file_.read() - return self.from_bytes(byte_string) - @classmethod def create_class_map(cls): return 
MorphologyClassMap(FEATURES) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index b649fdded..35d9374d0 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -433,8 +433,6 @@ cdef class Vocab: file_.write(self.lexemes_to_bytes()) if "vectors" not in "exclude" and self.vectors is not None: self.vectors.to_disk(path) - if "morphology" not in exclude: - self.morphology.to_disk(path / "morphology.bin") def from_disk(self, path, exclude=tuple(), **kwargs): """Loads state from a directory. Modifies the object in place and @@ -459,8 +457,6 @@ cdef class Vocab: self.vectors.from_disk(path, exclude=["strings"]) if self.vectors.name is not None: link_vectors_to_models(self) - if "morphology" not in exclude: - self.morphology.from_disk(path / "morphology.bin") return self def to_bytes(self, exclude=tuple(), **kwargs): @@ -481,7 +477,6 @@ cdef class Vocab: ("strings", lambda: self.strings.to_bytes()), ("lexemes", lambda: self.lexemes_to_bytes()), ("vectors", deserialize_vectors), - ("morphology", lambda: self.morphology.to_bytes()) )) exclude = util.get_serialization_exclude(getters, exclude, kwargs) return util.to_bytes(getters, exclude) @@ -505,7 +500,6 @@ cdef class Vocab: ("strings", lambda b: self.strings.from_bytes(b)), ("lexemes", lambda b: self.lexemes_from_bytes(b)), ("vectors", lambda b: serialize_vectors(b)), - ("morphology", lambda b: self.morphology.from_bytes(b)) )) exclude = util.get_serialization_exclude(setters, exclude, kwargs) util.from_bytes(bytes_data, setters, exclude) From c39c13f26b119ad3fcac3b3d3669c3f28a7d5504 Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Wed, 4 Sep 2019 20:05:08 +0200 Subject: [PATCH 121/207] Add guillemets/chevrons to German orth variants Add guillemets/chevrons to German orth variants for both German/Austrian and Swiss conventions. --- spacy/lang/de/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py index ae972072f..1ddee54b3 100644 --- a/spacy/lang/de/__init__.py +++ b/spacy/lang/de/__init__.py @@ -29,8 +29,8 @@ class GermanDefaults(Language.Defaults): resources = {"lemma_lookup": "lemma_lookup.json"} single_orth_variants = [{"tags": ["$("], "variants": ["…", "..."]}, {"tags": ["$("], "variants": ["-", "—", "–", "--", "---", "——"]}] - paired_orth_variants = [{"tags": ["$("], "variants": [("'", "'"), (",", "'"), ("‚", "‘")]}, - {"tags": ["$("], "variants": [("``", "''"), ('"', '"'), ("„", "“")]}] + paired_orth_variants = [{"tags": ["$("], "variants": [("'", "'"), (",", "'"), ("‚", "‘"), ("›", "‹"), ("‹", "›")]}, + {"tags": ["$("], "variants": [("``", "''"), ('"', '"'), ("„", "“"), ("»", "«"), ("«", "»")]}] class German(Language): From fde4f8ac8e5f9678b8fe39efac2d94c5304dca54 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Sun, 8 Sep 2019 18:08:09 +0200 Subject: [PATCH 122/207] Create lookups if not passed in --- spacy/language.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/language.py b/spacy/language.py index c10718e80..6a02a1fdd 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -39,6 +39,8 @@ from . 
import about class BaseDefaults(object): @classmethod def create_lemmatizer(cls, nlp=None, lookups=None): + if lookups is None: + lookups = cls.create_lookups(nlp=nlp) lemma_rules, lemma_index, lemma_exc, lemma_lookup = util.get_lemma_tables(lookups) return Lemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup) From aec6174ae6c032ca85d0ca10f2703f1fffa85cd1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Sun, 8 Sep 2019 18:09:53 +0200 Subject: [PATCH 123/207] Fix lemmatizer --- spacy/lemmatizer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index f7a58aa9f..c9ccbcd0d 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -55,6 +55,8 @@ class Lemmatizer(object): Check whether we're dealing with an uninflected paradigm, so we can avoid lemmatization entirely. """ + if morphology is None: + morphology = {} if univ_pos == "noun" and morphology.get("Number") == "sing": return True elif univ_pos == "verb" and morphology.get("VerbForm") == "inf": From da8830d909e56344fc9a02d2548d66a886fbb70f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Sun, 8 Sep 2019 18:22:03 +0200 Subject: [PATCH 124/207] Set version to v2.2.0.dev3 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 2ea1c5c9d..ed4781af1 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -4,7 +4,7 @@ # fmt: off __title__ = "spacy" -__version__ = "2.2.0.dev2" +__version__ = "2.2.0.dev3" __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython" __uri__ = "https://spacy.io" __author__ = "Explosion AI" From 1653b818c5e8a7da0efb61ae500481661a141241 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Sun, 8 Sep 2019 20:57:58 +0200 Subject: [PATCH 125/207] Update Lithuanian tag map --- spacy/lang/lt/tag_map.py | 236 +++++++++++++++++++-------------------- 1 file changed, 118 insertions(+), 118 deletions(-) diff --git a/spacy/lang/lt/tag_map.py b/spacy/lang/lt/tag_map.py index eab231b2c..6ea4f8ae0 100644 --- a/spacy/lang/lt/tag_map.py +++ b/spacy/lang/lt/tag_map.py @@ -1605,7 +1605,7 @@ TAG_MAP = { POS: VERB, "Mood": "Imp", "Number": "Plur", - "Person": "1", + "Person": "one", "Polarity": "Pos", "VerbForm": "Fin", }, @@ -1613,7 +1613,7 @@ TAG_MAP = { POS: VERB, "Mood": "Cnd", "Number": "Plur", - "Person": "1", + "Person": "one", "Polarity": "Pos", "VerbForm": "Fin", }, @@ -1621,7 +1621,7 @@ TAG_MAP = { POS: VERB, "Mood": "Imp", "Number": "Plur", - "Person": "1", + "Person": "one", "Polarity": "Pos", "Reflex": "Yes", "VerbForm": "Fin", @@ -1630,7 +1630,7 @@ TAG_MAP = { POS: VERB, "Mood": "Imp", "Number": "Plur", - "Person": "1", + "Person": "one", "Polarity": "Neg", "VerbForm": "Fin", }, @@ -1638,7 +1638,7 @@ TAG_MAP = { POS: VERB, "Mood": "Cnd", "Number": "Plur", - "Person": "1", + "Person": "one", "Polarity": "Neg", "Reflex": "Yes", "VerbForm": "Fin", @@ -1647,7 +1647,7 @@ TAG_MAP = { POS: VERB, "Mood": "Cnd", "Number": "Sing", - "Person": "1", + "Person": "one", "Polarity": "Pos", "VerbForm": "Fin", }, @@ -1655,7 +1655,7 @@ TAG_MAP = { POS: VERB, "Mood": "Cnd", "Number": "Sing", - "Person": "1", + "Person": "one", "Polarity": "Pos", "Reflex": "Yes", "VerbForm": "Fin", @@ -1664,7 +1664,7 @@ TAG_MAP = { POS: VERB, "Mood": "Cnd", "Number": "Sing", - "Person": "1", + "Person": "one", "Polarity": "Neg", "VerbForm": "Fin", }, @@ -1672,7 +1672,7 @@ TAG_MAP = { POS: VERB, "Mood": "Cnd", "Number": "Sing", 
- "Person": "1", + "Person": "one", "Polarity": "Neg", "Reflex": "Yes", "VerbForm": "Fin", @@ -1681,7 +1681,7 @@ TAG_MAP = { POS: VERB, "Mood": "Imp", "Number": "Plur", - "Person": "2", + "Person": "two", "Polarity": "Pos", "VerbForm": "Fin", }, @@ -1689,7 +1689,7 @@ TAG_MAP = { POS: VERB, "Mood": "Cnd", "Number": "Plur", - "Person": "2", + "Person": "two", "Polarity": "Pos", "VerbForm": "Fin", }, @@ -1697,7 +1697,7 @@ TAG_MAP = { POS: VERB, "Mood": "Imp", "Number": "Plur", - "Person": "2", + "Person": "two", "Polarity": "Pos", "Reflex": "Yes", "VerbForm": "Fin", @@ -1706,7 +1706,7 @@ TAG_MAP = { POS: VERB, "Mood": "Imp", "Number": "Plur", - "Person": "2", + "Person": "two", "Polarity": "Neg", "VerbForm": "Fin", }, @@ -1714,7 +1714,7 @@ TAG_MAP = { POS: VERB, "Mood": "Imp", "Number": "Plur", - "Person": "2", + "Person": "two", "Polarity": "Neg", "Reflex": "Yes", "VerbForm": "Fin", @@ -1723,7 +1723,7 @@ TAG_MAP = { POS: VERB, "Mood": "Imp", "Number": "Sing", - "Person": "2", + "Person": "two", "Polarity": "Pos", "VerbForm": "Fin", }, @@ -1731,7 +1731,7 @@ TAG_MAP = { POS: VERB, "Mood": "Cnd", "Number": "Sing", - "Person": "2", + "Person": "two", "Polarity": "Pos", "VerbForm": "Fin", }, @@ -1739,7 +1739,7 @@ TAG_MAP = { POS: VERB, "Mood": "Imp", "Number": "Sing", - "Person": "2", + "Person": "two", "Polarity": "Pos", "Reflex": "Yes", "VerbForm": "Fin", @@ -1748,7 +1748,7 @@ TAG_MAP = { POS: VERB, "Mood": "Imp", "Number": "Sing", - "Person": "2", + "Person": "two", "Polarity": "Neg", "VerbForm": "Fin", }, @@ -1756,21 +1756,21 @@ TAG_MAP = { POS: VERB, "Mood": "Cnd", "Number": "Sing", - "Person": "2", + "Person": "two", "Polarity": "Neg", "VerbForm": "Fin", }, "Vgm-3---n--ns-": { POS: VERB, "Mood": "Cnd", - "Person": "3", + "Person": "three", "Polarity": "Pos", "VerbForm": "Fin", }, "Vgm-3---n--ys-": { POS: VERB, "Mood": "Cnd", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Reflex": "Yes", "VerbForm": "Fin", @@ -1778,14 +1778,14 @@ TAG_MAP = { "Vgm-3---y--ns-": { POS: VERB, "Mood": "Cnd", - "Person": "3", + "Person": "three", "Polarity": "Neg", "VerbForm": "Fin", }, "Vgm-3---y--ys-": { POS: VERB, "Mood": "Cnd", - "Person": "3", + "Person": "three", "Polarity": "Neg", "Reflex": "Yes", "VerbForm": "Fin", @@ -1794,7 +1794,7 @@ TAG_MAP = { POS: VERB, "Mood": "Cnd", "Number": "Plur", - "Person": "3", + "Person": "three", "Polarity": "Pos", "VerbForm": "Fin", }, @@ -1802,7 +1802,7 @@ TAG_MAP = { POS: VERB, "Mood": "Cnd", "Number": "Plur", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Reflex": "Yes", "VerbForm": "Fin", @@ -1811,7 +1811,7 @@ TAG_MAP = { POS: VERB, "Mood": "Cnd", "Number": "Plur", - "Person": "3", + "Person": "three", "Polarity": "Neg", "VerbForm": "Fin", }, @@ -1819,7 +1819,7 @@ TAG_MAP = { POS: VERB, "Mood": "Cnd", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Pos", "VerbForm": "Fin", }, @@ -1827,7 +1827,7 @@ TAG_MAP = { POS: VERB, "Mood": "Cnd", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Reflex": "Yes", "VerbForm": "Fin", @@ -1836,7 +1836,7 @@ TAG_MAP = { POS: VERB, "Mood": "Cnd", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Neg", "VerbForm": "Fin", }, @@ -1844,7 +1844,7 @@ TAG_MAP = { POS: VERB, "Mood": "Cnd", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Neg", "Reflex": "Yes", "VerbForm": "Fin", @@ -1853,7 +1853,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Plur", - "Person": "1", + "Person": "one", "Polarity": "Pos", "Tense": "Past", "VerbForm": 
"Fin", @@ -1862,7 +1862,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Plur", - "Person": "1", + "Person": "one", "Polarity": "Pos", "Reflex": "Yes", "Tense": "Past", @@ -1872,7 +1872,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Plur", - "Person": "1", + "Person": "one", "Polarity": "Neg", "Tense": "Past", "VerbForm": "Fin", @@ -1881,7 +1881,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Plur", - "Person": "1", + "Person": "one", "Polarity": "Neg", "Reflex": "Yes", "Tense": "Past", @@ -1891,7 +1891,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "1", + "Person": "one", "Polarity": "Pos", "Tense": "Past", "VerbForm": "Fin", @@ -1900,7 +1900,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "1", + "Person": "one", "Polarity": "Pos", "Reflex": "Yes", "Tense": "Past", @@ -1910,7 +1910,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "1", + "Person": "one", "Polarity": "Neg", "Tense": "Past", "VerbForm": "Fin", @@ -1919,7 +1919,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "1", + "Person": "one", "Polarity": "Neg", "Reflex": "Yes", "Tense": "Past", @@ -1929,7 +1929,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Plur", - "Person": "2", + "Person": "two", "Polarity": "Pos", "Tense": "Past", "VerbForm": "Fin", @@ -1938,7 +1938,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Plur", - "Person": "2", + "Person": "two", "Polarity": "Pos", "Reflex": "Yes", "Tense": "Past", @@ -1948,7 +1948,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Plur", - "Person": "2", + "Person": "two", "Polarity": "Neg", "Tense": "Past", "VerbForm": "Fin", @@ -1957,7 +1957,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "2", + "Person": "two", "Polarity": "Pos", "Tense": "Past", "VerbForm": "Fin", @@ -1966,7 +1966,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "2", + "Person": "two", "Polarity": "Neg", "Tense": "Past", "VerbForm": "Fin", @@ -1974,7 +1974,7 @@ TAG_MAP = { "Vgma3---n--ni-": { POS: VERB, "Mood": "Ind", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Tense": "Past", "VerbForm": "Fin", @@ -1982,7 +1982,7 @@ TAG_MAP = { "Vgma3---n--yi-": { POS: VERB, "Mood": "Ind", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Reflex": "Yes", "Tense": "Past", @@ -1991,7 +1991,7 @@ TAG_MAP = { "Vgma3---y--ni-": { POS: VERB, "Mood": "Ind", - "Person": "3", + "Person": "three", "Polarity": "Neg", "Tense": "Past", "VerbForm": "Fin", @@ -1999,7 +1999,7 @@ TAG_MAP = { "Vgma3--y--ni-": { POS: VERB, "Case": "Nom", - "Person": "3", + "Person": "three", "Tense": "Past", "VerbForm": "Fin", }, @@ -2007,7 +2007,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Plur", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Tense": "Past", "VerbForm": "Fin", @@ -2016,7 +2016,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Plur", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Reflex": "Yes", "Tense": "Past", @@ -2026,7 +2026,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Plur", - "Person": "3", + "Person": "three", "Polarity": "Neg", "Tense": "Past", "VerbForm": "Fin", @@ -2035,7 +2035,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Plur", - "Person": "3", + "Person": "three", "Polarity": "Neg", "Reflex": "Yes", "Tense": "Past", @@ -2045,7 +2045,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Tense": "Past", "VerbForm": "Fin", @@ 
-2054,7 +2054,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Reflex": "Yes", "Tense": "Past", @@ -2064,7 +2064,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Reflex": "Yes", "Tense": "Past", @@ -2074,7 +2074,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Neg", "Tense": "Past", "VerbForm": "Fin", @@ -2083,7 +2083,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Neg", "Reflex": "Yes", "Tense": "Past", @@ -2093,7 +2093,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Plur", - "Person": "1", + "Person": "one", "Polarity": "Pos", "Tense": "Fut", "VerbForm": "Fin", @@ -2102,7 +2102,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Plur", - "Person": "1", + "Person": "one", "Polarity": "Pos", "Reflex": "Yes", "Tense": "Fut", @@ -2112,7 +2112,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Plur", - "Person": "1", + "Person": "one", "Polarity": "Neg", "Tense": "Fut", "VerbForm": "Fin", @@ -2121,7 +2121,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "1", + "Person": "one", "Polarity": "Pos", "Tense": "Fut", "VerbForm": "Fin", @@ -2130,7 +2130,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "1", + "Person": "one", "Polarity": "Pos", "Reflex": "Yes", "Tense": "Fut", @@ -2140,7 +2140,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "1", + "Person": "one", "Polarity": "Neg", "Tense": "Fut", "VerbForm": "Fin", @@ -2149,7 +2149,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Plur", - "Person": "2", + "Person": "two", "Polarity": "Pos", "Tense": "Fut", "VerbForm": "Fin", @@ -2158,7 +2158,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Plur", - "Person": "2", + "Person": "two", "Polarity": "Pos", "Reflex": "Yes", "Tense": "Fut", @@ -2168,7 +2168,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "2", + "Person": "two", "Polarity": "Pos", "Tense": "Fut", "VerbForm": "Fin", @@ -2177,7 +2177,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "2", + "Person": "two", "Polarity": "Pos", "Reflex": "Yes", "Tense": "Fut", @@ -2187,7 +2187,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "2", + "Person": "two", "Polarity": "Neg", "Tense": "Fut", "VerbForm": "Fin", @@ -2196,7 +2196,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "2", + "Person": "two", "Polarity": "Neg", "Reflex": "Yes", "Tense": "Fut", @@ -2205,7 +2205,7 @@ TAG_MAP = { "Vgmf3---n--ni-": { POS: VERB, "Mood": "Ind", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Tense": "Fut", "VerbForm": "Fin", @@ -2213,7 +2213,7 @@ TAG_MAP = { "Vgmf3---y--ni-": { POS: VERB, "Mood": "Ind", - "Person": "3", + "Person": "three", "Polarity": "Neg", "Tense": "Fut", "VerbForm": "Fin", @@ -2222,7 +2222,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Plur", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Tense": "Fut", "VerbForm": "Fin", @@ -2231,7 +2231,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Plur", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Reflex": "Yes", "Tense": "Fut", @@ -2241,7 +2241,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Plur", - "Person": "3", + "Person": "three", "Polarity": "Neg", "Tense": "Fut", "VerbForm": "Fin", @@ -2250,7 +2250,7 @@ 
TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Tense": "Fut", "VerbForm": "Fin", @@ -2259,7 +2259,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Reflex": "Yes", "Tense": "Fut", @@ -2269,7 +2269,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Neg", "Tense": "Fut", "VerbForm": "Fin", @@ -2278,7 +2278,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Neg", "Reflex": "Yes", "Tense": "Fut", @@ -2288,7 +2288,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Plur", - "Person": "1", + "Person": "one", "Polarity": "Pos", "Tense": "Pres", "VerbForm": "Fin", @@ -2297,7 +2297,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Plur", - "Person": "1", + "Person": "one", "Polarity": "Pos", "Reflex": "Yes", "Tense": "Pres", @@ -2307,7 +2307,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Plur", - "Person": "1", + "Person": "one", "Polarity": "Neg", "Tense": "Pres", "VerbForm": "Fin", @@ -2316,7 +2316,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Plur", - "Person": "1", + "Person": "one", "Polarity": "Neg", "Reflex": "Yes", "Tense": "Pres", @@ -2326,7 +2326,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "1", + "Person": "one", "Polarity": "Pos", "Tense": "Pres", "VerbForm": "Fin", @@ -2335,7 +2335,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "1", + "Person": "one", "Polarity": "Pos", "Tense": "Pres", "VerbForm": "Fin", @@ -2344,7 +2344,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "1", + "Person": "one", "Polarity": "Pos", "Reflex": "Yes", "Tense": "Pres", @@ -2354,7 +2354,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "1", + "Person": "one", "Polarity": "Neg", "Tense": "Pres", "VerbForm": "Fin", @@ -2363,7 +2363,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "1", + "Person": "one", "Polarity": "Neg", "Reflex": "Yes", "Tense": "Pres", @@ -2373,7 +2373,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Plur", - "Person": "2", + "Person": "two", "Polarity": "Pos", "Tense": "Pres", "VerbForm": "Fin", @@ -2382,7 +2382,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Plur", - "Person": "2", + "Person": "two", "Polarity": "Pos", "Reflex": "Yes", "Tense": "Pres", @@ -2392,7 +2392,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Plur", - "Person": "2", + "Person": "two", "Polarity": "Neg", "Tense": "Pres", "VerbForm": "Fin", @@ -2401,7 +2401,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Plur", - "Person": "2", + "Person": "two", "Polarity": "Neg", "Reflex": "Yes", "Tense": "Pres", @@ -2411,7 +2411,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "2", + "Person": "two", "Polarity": "Pos", "Tense": "Pres", "VerbForm": "Fin", @@ -2420,7 +2420,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "2", + "Person": "two", "Polarity": "Pos", "Reflex": "Yes", "Tense": "Pres", @@ -2430,7 +2430,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "2", + "Person": "two", "Polarity": "Neg", "Tense": "Pres", "VerbForm": "Fin", @@ -2438,7 +2438,7 @@ TAG_MAP = { "Vgmp3---n--ni-": { POS: VERB, "Mood": "Ind", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Tense": "Pres", "VerbForm": "Fin", @@ -2446,7 +2446,7 @@ TAG_MAP = { 
"Vgmp3---n--yi-": { POS: VERB, "Mood": "Ind", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Reflex": "Yes", "Tense": "Pres", @@ -2455,7 +2455,7 @@ TAG_MAP = { "Vgmp3---y--ni-": { POS: VERB, "Mood": "Ind", - "Person": "3", + "Person": "three", "Polarity": "Neg", "Tense": "Pres", "VerbForm": "Fin", @@ -2463,7 +2463,7 @@ TAG_MAP = { "Vgmp3---y--yi-": { POS: VERB, "Mood": "Ind", - "Person": "3", + "Person": "three", "Polarity": "Neg", "Reflex": "Yes", "Tense": "Pres", @@ -2473,7 +2473,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Plur", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Tense": "Pres", "VerbForm": "Fin", @@ -2482,7 +2482,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Plur", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Reflex": "Yes", "Tense": "Pres", @@ -2492,7 +2492,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Plur", - "Person": "3", + "Person": "three", "Polarity": "Neg", "Tense": "Pres", "VerbForm": "Fin", @@ -2501,7 +2501,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Plur", - "Person": "3", + "Person": "three", "Polarity": "Neg", "Reflex": "Yes", "Tense": "Pres", @@ -2511,7 +2511,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Tense": "Pres", "VerbForm": "Fin", @@ -2520,7 +2520,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Tense": "Pres", "VerbForm": "Fin", @@ -2529,7 +2529,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Tense": "Pres", "VerbForm": "Fin", @@ -2538,7 +2538,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Reflex": "Yes", "Tense": "Pres", @@ -2548,7 +2548,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Neg", "Tense": "Pres", "VerbForm": "Fin", @@ -2557,7 +2557,7 @@ TAG_MAP = { POS: VERB, "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Neg", "Reflex": "Yes", "Tense": "Pres", @@ -2568,7 +2568,7 @@ TAG_MAP = { "Aspect": "Hab", "Mood": "Ind", "Number": "Sing", - "Person": "1", + "Person": "one", "Polarity": "Pos", "Tense": "Past", "VerbForm": "Fin", @@ -2578,7 +2578,7 @@ TAG_MAP = { "Aspect": "Hab", "Mood": "Ind", "Number": "Sing", - "Person": "1", + "Person": "one", "Polarity": "Pos", "Reflex": "Yes", "Tense": "Past", @@ -2589,7 +2589,7 @@ TAG_MAP = { "Aspect": "Hab", "Mood": "Ind", "Number": "Sing", - "Person": "1", + "Person": "one", "Polarity": "Neg", "Tense": "Past", "VerbForm": "Fin", @@ -2599,7 +2599,7 @@ TAG_MAP = { "Aspect": "Hab", "Mood": "Ind", "Number": "Sing", - "Person": "2", + "Person": "two", "Polarity": "Pos", "Tense": "Past", "VerbForm": "Fin", @@ -2608,7 +2608,7 @@ TAG_MAP = { POS: VERB, "Aspect": "Hab", "Mood": "Ind", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Tense": "Past", "VerbForm": "Fin", @@ -2618,7 +2618,7 @@ TAG_MAP = { "Aspect": "Hab", "Mood": "Ind", "Number": "Plur", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Tense": "Past", "VerbForm": "Fin", @@ -2628,7 +2628,7 @@ TAG_MAP = { "Aspect": "Hab", "Mood": "Ind", "Number": "Plur", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Reflex": "Yes", "Tense": "Past", @@ -2639,7 +2639,7 @@ TAG_MAP = { "Aspect": "Hab", "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Tense": 
"Past", "VerbForm": "Fin", @@ -2649,7 +2649,7 @@ TAG_MAP = { "Aspect": "Hab", "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Reflex": "Yes", "Tense": "Past", @@ -2660,7 +2660,7 @@ TAG_MAP = { "Aspect": "Hab", "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Neg", "Tense": "Past", "VerbForm": "Fin", @@ -2670,7 +2670,7 @@ TAG_MAP = { "Aspect": "Perf", "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Tense": "Past", "VerbForm": "Fin", From 28741ff5db7927d29fa498a6016249c74e21ecce Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Tue, 10 Sep 2019 19:13:07 +0200 Subject: [PATCH 126/207] Require preshed v3.0.0 --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 865288a86..8fea3253b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ # Our libraries cymem>=2.0.2,<2.1.0 -preshed>=2.0.1,<2.1.0 +preshed>=3.0.0,<3.1.0 thinc>=7.1.0,<7.2.0 blis>=0.4.0,<0.5.0 murmurhash>=0.28.0,<1.1.0 diff --git a/setup.py b/setup.py index 29bdb96fa..ee60aa07f 100755 --- a/setup.py +++ b/setup.py @@ -247,7 +247,7 @@ def setup_package(): "numpy>=1.15.0", "murmurhash>=0.28.0,<1.1.0", "cymem>=2.0.2,<2.1.0", - "preshed>=2.0.1,<2.1.0", + "preshed>=3.0.0,<3.1.0", "thinc>=7.1.0,<7.2.0", "blis>=0.4.0,<0.5.0", "plac<1.0.0,>=0.9.6", From c181a94e7540d2c0155a41c508a87398f0973eb6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Tue, 10 Sep 2019 20:12:24 +0200 Subject: [PATCH 127/207] Require thinc 7.1.1 --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 8fea3253b..5c3ed9981 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.0,<3.1.0 -thinc>=7.1.0,<7.2.0 +thinc>=7.1.1,<7.2.0 blis>=0.4.0,<0.5.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.2.0,<1.1.0 diff --git a/setup.py b/setup.py index ee60aa07f..f37c8783c 100755 --- a/setup.py +++ b/setup.py @@ -248,7 +248,7 @@ def setup_package(): "murmurhash>=0.28.0,<1.1.0", "cymem>=2.0.2,<2.1.0", "preshed>=3.0.0,<3.1.0", - "thinc>=7.1.0,<7.2.0", + "thinc>=7.1.1,<7.2.0", "blis>=0.4.0,<0.5.0", "plac<1.0.0,>=0.9.6", "requests>=2.13.0,<3.0.0", From 178d010b25c0dd438a85565e452dca31660efc11 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Wed, 11 Sep 2019 12:28:37 +0200 Subject: [PATCH 128/207] Set version to 2.2.0.dev4 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index ed4781af1..944bcae77 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -4,7 +4,7 @@ # fmt: off __title__ = "spacy" -__version__ = "2.2.0.dev3" +__version__ = "2.2.0.dev4" __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython" __uri__ = "https://spacy.io" __author__ = "Explosion AI" From af253236534a69ccdef1428cb0d8b6b7461e271c Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Wed, 11 Sep 2019 14:00:36 +0200 Subject: [PATCH 129/207] Tidy up and auto-format --- spacy/_ml.py | 50 ++++++++++++--------------- spacy/cli/train.py | 13 +++++-- spacy/errors.py | 1 + spacy/lang/de/__init__.py | 18 +++++++--- spacy/lang/en/__init__.py | 12 ++++--- spacy/lang/en/morph_rules.py | 48 ++++++++++++------------- spacy/lang/en/tokenizer_exceptions.py | 7 +--- 
spacy/lemmatizer.py | 11 +++--- spacy/lookups.py | 1 + spacy/tests/parser/test_ner.py | 26 ++------------ 10 files changed, 90 insertions(+), 97 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 97660f8f9..d81ceccc1 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -348,7 +348,7 @@ def Tok2Vec(width, embed_size, **kwargs): if pretrained_vectors is not None: glove = StaticVectors(pretrained_vectors, width, column=cols.index(ID)) - if subword_features: + if subword_features: embed = uniqued( (glove | norm | prefix | suffix | shape) >> LN(Maxout(width, width * 5, pieces=3)), @@ -363,14 +363,16 @@ def Tok2Vec(width, embed_size, **kwargs): embed = uniqued( (norm | prefix | suffix | shape) >> LN(Maxout(width, width * 4, pieces=3)), - column=cols.index(ORTH) + column=cols.index(ORTH), ) - elif char_embed: + elif char_embed: embed = concatenate_lists( CharacterEmbed(nM=64, nC=8), - FeatureExtracter(cols) >> with_flatten(norm) + FeatureExtracter(cols) >> with_flatten(norm), + ) + reduce_dimensions = LN( + Maxout(width, 64 * 8 + width, pieces=cnn_maxout_pieces) ) - reduce_dimensions = LN(Maxout(width, 64*8+width, pieces=cnn_maxout_pieces)) else: embed = norm @@ -379,22 +381,14 @@ def Tok2Vec(width, embed_size, **kwargs): >> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces)) ) if char_embed: - tok2vec = ( - embed - >> with_flatten( - reduce_dimensions - >> convolution ** conv_depth, pad=conv_depth - ) + tok2vec = embed >> with_flatten( + reduce_dimensions >> convolution ** conv_depth, pad=conv_depth ) else: - tok2vec = ( - FeatureExtracter(cols) - >> with_flatten( - embed - >> convolution ** conv_depth, pad=conv_depth - ) + tok2vec = FeatureExtracter(cols) >> with_flatten( + embed >> convolution ** conv_depth, pad=conv_depth ) - + if bilstm_depth >= 1: tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth) # Work around thinc API limitations :(. 
TODO: Revise in Thinc 7 @@ -611,9 +605,7 @@ def build_morphologizer_model(class_nums, **cfg): char_embed=char_embed, pretrained_vectors=pretrained_vectors, ) - softmax = with_flatten( - MultiSoftmax(class_nums, token_vector_width) - ) + softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width)) softmax.out_sizes = class_nums model = tok2vec >> softmax model.nI = None @@ -906,16 +898,17 @@ def _replace_word(word, random_words, mask="[MASK]"): def _uniform_init(lo, hi): def wrapped(W, ops): copy_array(W, ops.xp.random.uniform(lo, hi, W.shape)) + return wrapped @describe.attributes( nM=Dimension("Vector dimensions"), nC=Dimension("Number of characters per word"), - vectors=Synapses("Embed matrix", - lambda obj: (obj.nC, obj.nV, obj.nM), - _uniform_init(-0.1, 0.1)), - d_vectors=Gradient("vectors") + vectors=Synapses( + "Embed matrix", lambda obj: (obj.nC, obj.nV, obj.nM), _uniform_init(-0.1, 0.1) + ), + d_vectors=Gradient("vectors"), ) class CharacterEmbed(Model): def __init__(self, nM=None, nC=None, **kwargs): @@ -926,12 +919,12 @@ class CharacterEmbed(Model): @property def nO(self): return self.nM * self.nC - + @property def nV(self): return 256 - def begin_update(self, docs, drop=0.): + def begin_update(self, docs, drop=0.0): if not docs: return [] ids = [] @@ -959,6 +952,7 @@ class CharacterEmbed(Model): if sgd is not None: sgd(self._mem.weights, self._mem.gradient, key=self.id) return None + return output, backprop_character_embed @@ -974,4 +968,4 @@ def get_cossim_loss(yh, y): cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms d_yh = (y / mul_norms) - (cosine * (yh / norm_yh ** 2)) loss = xp.abs(cosine - 1).sum() - return loss, -d_yh \ No newline at end of file + return loss, -d_yh diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 365e7ea44..8d162362c 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -64,7 +64,12 @@ from .. import about str, ), noise_level=("Amount of corruption for data augmentation", "option", "nl", float), - orth_variant_level=("Amount of orthography variation for data augmentation", "option", "ovl", float), + orth_variant_level=( + "Amount of orthography variation for data augmentation", + "option", + "ovl", + float, + ), eval_beam_widths=("Beam widths to evaluate, e.g. 4,8", "option", "bw", str), gold_preproc=("Use gold preprocessing", "flag", "G", bool), learn_tokens=("Make parser learn gold-standard tokenization", "flag", "T", bool), @@ -245,7 +250,11 @@ def train( best_score = 0.0 for i in range(n_iter): train_docs = corpus.train_docs( - nlp, noise_level=noise_level, orth_variant_level=orth_variant_level, gold_preproc=gold_preproc, max_length=0 + nlp, + noise_level=noise_level, + orth_variant_level=orth_variant_level, + gold_preproc=gold_preproc, + max_length=0, ) if raw_text: random.shuffle(raw_text) diff --git a/spacy/errors.py b/spacy/errors.py index c0868800d..b8a8dccba 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -456,6 +456,7 @@ class Errors(object): E159 = ("Can't find table '{name}' in lookups. 
Available tables: {tables}") E160 = ("Can't find language data file: {path}") + @add_codes class TempErrors(object): T003 = ("Resizing pre-trained Tagger models is not currently supported.") diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py index 1ddee54b3..b96069235 100644 --- a/spacy/lang/de/__init__.py +++ b/spacy/lang/de/__init__.py @@ -27,10 +27,20 @@ class GermanDefaults(Language.Defaults): stop_words = STOP_WORDS syntax_iterators = SYNTAX_ITERATORS resources = {"lemma_lookup": "lemma_lookup.json"} - single_orth_variants = [{"tags": ["$("], "variants": ["…", "..."]}, - {"tags": ["$("], "variants": ["-", "—", "–", "--", "---", "——"]}] - paired_orth_variants = [{"tags": ["$("], "variants": [("'", "'"), (",", "'"), ("‚", "‘"), ("›", "‹"), ("‹", "›")]}, - {"tags": ["$("], "variants": [("``", "''"), ('"', '"'), ("„", "“"), ("»", "«"), ("«", "»")]}] + single_orth_variants = [ + {"tags": ["$("], "variants": ["…", "..."]}, + {"tags": ["$("], "variants": ["-", "—", "–", "--", "---", "——"]}, + ] + paired_orth_variants = [ + { + "tags": ["$("], + "variants": [("'", "'"), (",", "'"), ("‚", "‘"), ("›", "‹"), ("‹", "›")], + }, + { + "tags": ["$("], + "variants": [("``", "''"), ('"', '"'), ("„", "“"), ("»", "«"), ("«", "»")], + }, + ] class German(Language): diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index 2f391de0b..e4c745c83 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -38,10 +38,14 @@ class EnglishDefaults(Language.Defaults): "lemma_index": "lemmatizer/lemma_index.json", "lemma_exc": "lemmatizer/lemma_exc.json", } - single_orth_variants = [{"tags": ["NFP"], "variants": ["…", "..."]}, - {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]}] - paired_orth_variants = [{"tags": ["``", "''"], "variants": [("'", "'"), ("‘", "’")]}, - {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]}] + single_orth_variants = [ + {"tags": ["NFP"], "variants": ["…", "..."]}, + {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]}, + ] + paired_orth_variants = [ + {"tags": ["``", "''"], "variants": [("'", "'"), ("‘", "’")]}, + {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]}, + ] class English(Language): diff --git a/spacy/lang/en/morph_rules.py b/spacy/lang/en/morph_rules.py index f910e42b8..5ed4eac59 100644 --- a/spacy/lang/en/morph_rules.py +++ b/spacy/lang/en/morph_rules.py @@ -12,50 +12,50 @@ _subordinating_conjunctions = [ "if", "as", "because", - #"of", - #"for", - #"before", - #"in", + # "of", + # "for", + # "before", + # "in", "while", - #"after", + # "after", "since", "like", - #"with", + # "with", "so", - #"to", - #"by", - #"on", - #"about", + # "to", + # "by", + # "on", + # "about", "than", "whether", "although", - #"from", + # "from", "though", - #"until", + # "until", "unless", "once", - #"without", - #"at", - #"into", + # "without", + # "at", + # "into", "cause", - #"over", + # "over", "upon", "till", "whereas", - #"beyond", + # "beyond", "whilst", "except", "despite", "wether", - #"then", + # "then", "but", "becuse", "whie", - #"below", - #"against", + # "below", + # "against", "it", "w/out", - #"toward", + # "toward", "albeit", "save", "besides", @@ -67,17 +67,17 @@ _subordinating_conjunctions = [ "out", "near", "seince", - #"towards", + # "towards", "tho", "sice", "will", ] # This seems kind of wrong too? 
-#_relative_pronouns = ["this", "that", "those", "these"] +# _relative_pronouns = ["this", "that", "those", "these"] MORPH_RULES = { - #"DT": {word: {"POS": "PRON"} for word in _relative_pronouns}, + # "DT": {word: {"POS": "PRON"} for word in _relative_pronouns}, "IN": {word: {"POS": "SCONJ"} for word in _subordinating_conjunctions}, "NN": { "something": {"POS": "PRON"}, diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py index 91c29c9e0..c45197771 100644 --- a/spacy/lang/en/tokenizer_exceptions.py +++ b/spacy/lang/en/tokenizer_exceptions.py @@ -30,12 +30,7 @@ for pron in ["i"]: for orth in [pron, pron.title()]: _exc[orth + "'m"] = [ {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, - { - ORTH: "'m", - LEMMA: "be", - NORM: "am", - TAG: "VBP", - }, + {ORTH: "'m", LEMMA: "be", NORM: "am", TAG: "VBP"}, ] _exc[orth + "m"] = [ diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index c9ccbcd0d..d14f5292e 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -2,8 +2,7 @@ from __future__ import unicode_literals from collections import OrderedDict -from .symbols import POS, NOUN, VERB, ADJ, PUNCT, PROPN -from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos +from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN class Lemmatizer(object): @@ -71,13 +70,13 @@ class Lemmatizer(object): return True elif univ_pos == "adj" and morphology.get("Degree") == "pos": return True - elif morphology.get('VerbForm') == 'inf': + elif morphology.get("VerbForm") == "inf": return True - elif morphology.get('VerbForm') == 'none': + elif morphology.get("VerbForm") == "none": return True - elif morphology.get('VerbForm') == 'inf': + elif morphology.get("VerbForm") == "inf": return True - elif morphology.get('Degree') == 'pos': + elif morphology.get("Degree") == "pos": return True else: return False diff --git a/spacy/lookups.py b/spacy/lookups.py index 801b4d00d..741d40330 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -137,6 +137,7 @@ class Table(OrderedDict): """A table in the lookups. Subclass of builtin dict that implements a slightly more consistent and unified API. 
""" + @classmethod def from_dict(cls, data, name=None): self = cls(name=name) diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index c39491ecf..db911dba0 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -119,28 +119,8 @@ def test_oracle_moves_missing_B(en_vocab): def test_oracle_moves_whitespace(en_vocab): - words = [ - "production", - "\n", - "of", - "Northrop", - "\n", - "Corp.", - "\n", - "'s", - "radar", - ] - biluo_tags = [ - "O", - "O", - "O", - "B-ORG", - None, - "I-ORG", - "L-ORG", - "O", - "O", - ] + words = ["production", "\n", "of", "Northrop", "\n", "Corp.", "\n", "'s", "radar"] + biluo_tags = ["O", "O", "O", "B-ORG", None, "I-ORG", "L-ORG", "O", "O"] doc = Doc(en_vocab, words=words) gold = GoldParse(doc, words=words, entities=biluo_tags) @@ -156,4 +136,4 @@ def test_oracle_moves_whitespace(en_vocab): action, label = tag.split("-") moves.add_action(move_types.index(action), label) moves.preprocess_gold(gold) - seq = moves.get_oracle_sequence(doc, gold) + moves.get_oracle_sequence(doc, gold) From c47c0269b174653012a2961dcfb9a50c6f7d5576 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Wed, 11 Sep 2019 15:16:53 +0200 Subject: [PATCH 130/207] Update morphology features --- spacy/morphology.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 2d58b8f27..8fcf81ade 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -864,13 +864,13 @@ FEATURES = [ "Animacy_hum", "Animacy_inan", "Animacy_nhum", - "Aspect_freq", + "Aspect_hab", "Aspect_imp", - "Aspect_mod", - "Aspect_none", + "Aspect_iter", "Aspect_perf", "Aspect_prog", "Aspect_prosp", + "Aspect_none", "Case_abe", "Case_abl", "Case_abs", From f8ce9dde0fb07bdbcfa7699a91ad5cbe064e3fcd Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Wed, 11 Sep 2019 17:41:21 +0200 Subject: [PATCH 131/207] Set version to v2.2.0.dev5 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 944bcae77..dba60ffc8 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -4,7 +4,7 @@ # fmt: off __title__ = "spacy" -__version__ = "2.2.0.dev4" +__version__ = "2.2.0.dev5" __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython" __uri__ = "https://spacy.io" __author__ = "Explosion AI" From f7a096b46257078a81b8219deea8c1c9c74b7209 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Wed, 11 Sep 2019 18:06:43 +0200 Subject: [PATCH 132/207] Update morphology --- spacy/morphology.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 8fcf81ade..bf7aaced0 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -985,6 +985,7 @@ FEATURES = [ "NumForm_digit", "NumForm_roman", "NumForm_word", + "NumForm_combi", "NumType_card", "NumType_dist", "NumType_frac", @@ -993,6 +994,7 @@ FEATURES = [ "NumType_none", "NumType_ord", "NumType_sets", + "NumType_dual", "NumValue_one", "NumValue_two", "NumValue_three", From 7fbb559045c8a2127059edc8a17e5722b0e423c6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Wed, 11 Sep 2019 18:07:20 +0200 Subject: [PATCH 133/207] Set version to v2.2.0.dev6 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index dba60ffc8..4d3de2d40 100644 --- a/spacy/about.py +++ 
b/spacy/about.py @@ -4,7 +4,7 @@ # fmt: off __title__ = "spacy" -__version__ = "2.2.0.dev5" +__version__ = "2.2.0.dev6" __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython" __uri__ = "https://spacy.io" __author__ = "Explosion AI" From 0b4b4f1819df2b8a885a2ce6a89a2bfe88249aba Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Date: Thu, 12 Sep 2019 11:38:34 +0200 Subject: [PATCH 134/207] Documentation for Entity Linking (#4065) * document token ent_kb_id * document span kb_id * update pipeline documentation * prior and context weights as bool's instead * entitylinker api documentation * drop for both models * finish entitylinker documentation * small fixes * documentation for KB * candidate documentation * links to api pages in code * small fix * frequency examples as counts for consistency * consistent documentation about tensors returned by predict * add entity linking to usage 101 * add entity linking infobox and KB section to 101 * entity-linking in linguistic features * small typo corrections * training example and docs for entity_linker * predefined nlp and kb * revert back to similarity encodings for simplicity (for now) * set prior probabilities to 0 when excluded * code clean up * bugfix: deleting kb ID from tokens when entities were removed * refactor train el example to use either model or vocab * pretrain_kb example for example kb generation * add to training docs for KB + EL example scripts * small fixes * error numbering * ensure the language of vocab and nlp stay consistent across serialization * equality with = * avoid conflict in errors file * add error 151 * final adjustements to the train scripts - consistency * update of goldparse documentation * small corrections * push commit * typo fix * add candidate API to kb documentation * update API sidebar with EntityLinker and KnowledgeBase * remove EL from 101 docs * remove entity linker from 101 pipelines / rephrase * custom el model instead of existing model * set version to 2.2 for EL functionality * update documentation for 2 CLI scripts --- examples/pipeline/dummy_entity_linking.py | 0 examples/pipeline/wikidata_entity_linking.py | 0 examples/training/pretrain_kb.py | 5 +- examples/training/train_entity_linker.py | 4 +- spacy/kb.pyx | 2 +- website/docs/api/cli.md | 2 +- website/docs/api/entitylinker.md | 297 +++++++++++++++++++ website/docs/api/entityrecognizer.md | 11 +- website/docs/api/goldparse.md | 22 +- website/docs/api/kb.md | 268 +++++++++++++++++ website/docs/api/span.md | 21 +- website/docs/api/tagger.md | 16 +- website/docs/api/textcategorizer.md | 11 +- website/docs/api/token.md | 4 +- website/docs/usage/101/_named-entities.md | 12 +- website/docs/usage/101/_pipelines.md | 20 +- website/docs/usage/101/_training.md | 2 +- website/docs/usage/facts-figures.md | 2 +- website/docs/usage/linguistic-features.md | 48 +++ website/docs/usage/processing-pipelines.md | 1 + website/docs/usage/spacy-101.md | 79 +++++ website/docs/usage/training.md | 83 +++++- website/meta/sidebars.json | 2 + 23 files changed, 847 insertions(+), 65 deletions(-) create mode 100644 examples/pipeline/dummy_entity_linking.py create mode 100644 examples/pipeline/wikidata_entity_linking.py create mode 100644 website/docs/api/entitylinker.md create mode 100644 website/docs/api/kb.md diff --git a/examples/pipeline/dummy_entity_linking.py b/examples/pipeline/dummy_entity_linking.py new file mode 100644 index 000000000..e69de29bb diff --git 
a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/training/pretrain_kb.py b/examples/training/pretrain_kb.py index d5281ad42..2c494d5c4 100644 --- a/examples/training/pretrain_kb.py +++ b/examples/training/pretrain_kb.py @@ -8,8 +8,8 @@ For more details, see the documentation: * Knowledge base: https://spacy.io/api/kb * Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking -Compatible with: spaCy vX.X -Last tested with: vX.X +Compatible with: spaCy v2.2 +Last tested with: v2.2 """ from __future__ import unicode_literals, print_function @@ -73,7 +73,6 @@ def main(vocab_path=None, model=None, output_dir=None, n_iter=50): input_dim=INPUT_DIM, desc_width=DESC_WIDTH, epochs=n_iter, - threshold=0.001, ) encoder.train(description_list=descriptions, to_print=True) diff --git a/examples/training/train_entity_linker.py b/examples/training/train_entity_linker.py index 12ed531a6..d2b2c2417 100644 --- a/examples/training/train_entity_linker.py +++ b/examples/training/train_entity_linker.py @@ -8,8 +8,8 @@ For more details, see the documentation: * Training: https://spacy.io/usage/training * Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking -Compatible with: spaCy vX.X -Last tested with: vX.X +Compatible with: spaCy v2.2 +Last tested with: v2.2 """ from __future__ import unicode_literals, print_function diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 176ac17de..6cbc06e2c 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -24,7 +24,7 @@ cdef class Candidate: algorithm which will disambiguate the various candidates to the correct one. Each candidate (alias, entity) pair is assigned to a certain prior probability. - DOCS: https://spacy.io/api/candidate + DOCS: https://spacy.io/api/kb/#candidate_init """ def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob): diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 32e3623b0..d01637925 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -226,7 +226,7 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path] | `--entity-multitasks`, `-et` | option | Side objectives for NER CNN, e.g. `'dep'` or `'dep,tag'` | | `--noise-level`, `-nl` | option | Float indicating the amount of corruption for data augmentation. | | `--gold-preproc`, `-G` | flag | Use gold preprocessing. | -| `--learn-tokens`, `-T` | flag | Make parser learn gold-standard tokenization by merging ] subtokens. Typically used for languages like Chinese. | +| `--learn-tokens`, `-T` | flag | Make parser learn gold-standard tokenization by merging subtokens. Typically used for languages like Chinese. | | `--verbose`, `-VV` <Tag variant="new">2.0.13</Tag> | flag | Show more detailed messages during training. | | `--help`, `-h` | flag | Show help message and available arguments. | | **CREATES** | model, pickle | A spaCy model on each epoch. | diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md new file mode 100644 index 000000000..64db50943 --- /dev/null +++ b/website/docs/api/entitylinker.md @@ -0,0 +1,297 @@ +--- +title: EntityLinker +teaser: Functionality to disambiguate a named entity in text to a unique knowledge base identifier. +tag: class +source: spacy/pipeline/pipes.pyx +new: 2.2 +--- + +This class is a subclass of `Pipe` and follows the same API. 
The pipeline +component is available in the [processing pipeline](/usage/processing-pipelines) +via the ID `"entity_linker"`. + +## EntityLinker.Model {#model tag="classmethod"} + +Initialize a model for the pipe. The model should implement the +`thinc.neural.Model` API, and should contain a field `tok2vec` that contains +the context encoder. Wrappers are under development for most major machine +learning libraries. + +| Name | Type | Description | +| ----------- | ------ | ------------------------------------- | +| `**kwargs` | - | Parameters for initializing the model | +| **RETURNS** | object | The initialized model. | + +## EntityLinker.\_\_init\_\_ {#init tag="method"} + +Create a new pipeline instance. In your application, you would normally use a +shortcut for this and instantiate the component using its string name and +[`nlp.create_pipe`](/api/language#create_pipe). + +> #### Example +> +> ```python +> # Construction via create_pipe +> entity_linker = nlp.create_pipe("entity_linker") +> +> # Construction from class +> from spacy.pipeline import EntityLinker +> entity_linker = EntityLinker(nlp.vocab) +> entity_linker.from_disk("/path/to/model") +> ``` + +| Name | Type | Description | +| --------------- | ----------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The shared vocabulary. | +| `model` | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. | +| `hidden_width` | int | Width of the hidden layer of the entity linking model, defaults to 128. | +| `incl_prior` | bool | Whether or not to include prior probabilities in the model. Defaults to True. | +| `incl_context` | bool | Whether or not to include the local context in the model (if not: only prior probabilites are used). Defaults to True. | +| **RETURNS** | `EntityLinker` | The newly constructed object. | + +## EntityLinker.\_\_call\_\_ {#call tag="method"} + +Apply the pipe to one document. The document is modified in place, and returned. +This usually happens under the hood when the `nlp` object is called on a text +and all pipeline components are applied to the `Doc` in order. Both +[`__call__`](/api/entitylinker#call) and +[`pipe`](/api/entitylinker#pipe) delegate to the +[`predict`](/api/entitylinker#predict) and +[`set_annotations`](/api/entitylinker#set_annotations) methods. + +> #### Example +> +> ```python +> entity_linker = EntityLinker(nlp.vocab) +> doc = nlp(u"This is a sentence.") +> # This usually happens under the hood +> processed = entity_linker(doc) +> ``` + +| Name | Type | Description | +| ----------- | ----- | ------------------------ | +| `doc` | `Doc` | The document to process. | +| **RETURNS** | `Doc` | The processed document. | + +## EntityLinker.pipe {#pipe tag="method"} + +Apply the pipe to a stream of documents. This usually happens under the hood +when the `nlp` object is called on a text and all pipeline components are +applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and +[`pipe`](/api/entitylinker#pipe) delegate to the +[`predict`](/api/entitylinker#predict) and +[`set_annotations`](/api/entitylinker#set_annotations) methods. 
+ +> #### Example +> +> ```python +> entity_linker = EntityLinker(nlp.vocab) +> for doc in entity_linker.pipe(docs, batch_size=50): +> pass +> ``` + +| Name | Type | Description | +| ------------ | -------- | ------------------------------------------------------ | +| `stream` | iterable | A stream of documents. | +| `batch_size` | int | The number of texts to buffer. Defaults to `128`. | +| **YIELDS** | `Doc` | Processed documents in the order of the original text. | + +## EntityLinker.predict {#predict tag="method"} + +Apply the pipeline's model to a batch of docs, without modifying them. + +> #### Example +> +> ```python +> entity_linker = EntityLinker(nlp.vocab) +> kb_ids, tensors = entity_linker.predict([doc1, doc2]) +> ``` + +| Name | Type | Description | +| ----------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `docs` | iterable | The documents to predict. | +| **RETURNS** | tuple | A `(kb_ids, tensors)` tuple where `kb_ids` are the model's predicted KB identifiers for the entities in the `docs`, and `tensors` are the token representations used to predict these identifiers. | + +## EntityLinker.set_annotations {#set_annotations tag="method"} + +Modify a batch of documents, using pre-computed entity IDs for a list of named entities. + +> #### Example +> +> ```python +> entity_linker = EntityLinker(nlp.vocab) +> kb_ids, tensors = entity_linker.predict([doc1, doc2]) +> entity_linker.set_annotations([doc1, doc2], kb_ids, tensors) +> ``` + +| Name | Type | Description | +| ---------- | -------- | --------------------------------------------------------------------------------------------------- | +| `docs` | iterable | The documents to modify. | +| `kb_ids` | iterable | The knowledge base identifiers for the entities in the docs, predicted by `EntityLinker.predict`. | +| `tensors` | iterable | The token representations used to predict the identifiers. | + +## EntityLinker.update {#update tag="method"} + +Learn from a batch of documents and gold-standard information, updating both the +pipe's entity linking model and context encoder. Delegates to [`predict`](/api/entitylinker#predict) and +[`get_loss`](/api/entitylinker#get_loss). + +> #### Example +> +> ```python +> entity_linker = EntityLinker(nlp.vocab) +> losses = {} +> optimizer = nlp.begin_training() +> entity_linker.update([doc1, doc2], [gold1, gold2], losses=losses, sgd=optimizer) +> ``` + +| Name | Type | Description | +| -------- | -------- | ------------------------------------------------------------------------------------------------------------- | +| `docs` | iterable | A batch of documents to learn from. | +| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. | +| `drop` | float | The dropout rate, used both for the EL model and the context encoder. | +| `sgd` | callable | The optimizer for the EL model. Should take two arguments `weights` and `gradient`, and an optional ID. | +| `losses` | dict | Optional record of the loss during training. The value keyed by the model's name is updated. | + +## EntityLinker.get_loss {#get_loss tag="method"} + +Find the loss and gradient of loss for the entities in a batch of documents and their +predicted scores. 
+ +> #### Example +> +> ```python +> entity_linker = EntityLinker(nlp.vocab) +> kb_ids, tensors = entity_linker.predict(docs) +> loss, d_loss = entity_linker.get_loss(docs, [gold1, gold2], kb_ids, tensors) +> ``` + +| Name | Type | Description | +| --------------- | -------- | ------------------------------------------------------------ | +| `docs` | iterable | The batch of documents. | +| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. | +| `kb_ids` | iterable | KB identifiers representing the model's predictions. | +| `tensors` | iterable | The token representations used to predict the identifiers | +| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. | + +## EntityLinker.set_kb {#set_kb tag="method"} + +Define the knowledge base (KB) used for disambiguating named entities to KB identifiers. + +> #### Example +> +> ```python +> entity_linker = EntityLinker(nlp.vocab) +> entity_linker.set_kb(kb) +> ``` + +| Name | Type | Description | +| --------------- | --------------- | ------------------------------------------------------------ | +| `kb` | `KnowledgeBase` | The [`KnowledgeBase`](/api/kb). | + +## EntityLinker.begin_training {#begin_training tag="method"} + +Initialize the pipe for training, using data examples if available. If no model +has been initialized yet, the model is added. +Before calling this method, a knowledge base should have been defined with [`set_kb`](/api/entitylinker#set_kb). + +> #### Example +> +> ```python +> entity_linker = EntityLinker(nlp.vocab) +> entity_linker.set_kb(kb) +> nlp.add_pipe(entity_linker, last=True) +> optimizer = entity_linker.begin_training(pipeline=nlp.pipeline) +> ``` + +| Name | Type | Description | +| ------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `gold_tuples` | iterable | Optional gold-standard annotations from which to construct [`GoldParse`](/api/goldparse) objects. | +| `pipeline` | list | Optional list of pipeline components that this component is part of. | +| `sgd` | callable | An optional optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. Will be created via [`EntityLinker`](/api/entitylinker#create_optimizer) if not set. | +| **RETURNS** | callable | An optimizer. | + +## EntityLinker.create_optimizer {#create_optimizer tag="method"} + +Create an optimizer for the pipeline component. + +> #### Example +> +> ```python +> entity_linker = EntityLinker(nlp.vocab) +> optimizer = entity_linker.create_optimizer() +> ``` + +| Name | Type | Description | +| ----------- | -------- | -------------- | +| **RETURNS** | callable | The optimizer. | + +## EntityLinker.use_params {#use_params tag="method, contextmanager"} + +Modify the pipe's EL model, to use the given parameter values. + +> #### Example +> +> ```python +> entity_linker = EntityLinker(nlp.vocab) +> with entity_linker.use_params(optimizer.averages): +> entity_linker.to_disk("/best_model") +> ``` + +| Name | Type | Description | +| -------- | ---- | ---------------------------------------------------------------------------------------------------------- | +| `params` | dict | The parameter values to use in the model. At the end of the context, the original parameters are restored. | + + +## EntityLinker.to_disk {#to_disk tag="method"} + +Serialize the pipe to disk. 
+ +> #### Example +> +> ```python +> entity_linker = EntityLinker(nlp.vocab) +> entity_linker.to_disk("/path/to/entity_linker") +> ``` + +| Name | Type | Description | +| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | + +## EntityLinker.from_disk {#from_disk tag="method"} + +Load the pipe from disk. Modifies the object in place and returns it. + +> #### Example +> +> ```python +> entity_linker = EntityLinker(nlp.vocab) +> entity_linker.from_disk("/path/to/entity_linker") +> ``` + +| Name | Type | Description | +| ----------- | ------------------ | -------------------------------------------------------------------------- | +| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. | + +## Serialization fields {#serialization-fields} + +During serialization, spaCy will export several data fields used to restore +different aspects of the object. If needed, you can exclude them from +serialization by passing in the string names via the `exclude` argument. + +> #### Example +> +> ```python +> data = entity_linker.to_disk("/path", exclude=["vocab"]) +> ``` + +| Name | Description | +| ------- | -------------------------------------------------------------- | +| `vocab` | The shared [`Vocab`](/api/vocab). | +| `cfg` | The config file. You usually don't want to exclude this. | +| `model` | The binary model data. You usually don't want to exclude this. | +| `kb` | The knowledge base. You usually don't want to exclude this. | + diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 7279a7f77..46e8b44ee 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -99,7 +99,7 @@ Apply the pipeline's model to a batch of docs, without modifying them. > > ```python > ner = EntityRecognizer(nlp.vocab) -> scores = ner.predict([doc1, doc2]) +> scores, tensors = ner.predict([doc1, doc2]) > ``` | Name | Type | Description | @@ -115,14 +115,15 @@ Modify a batch of documents, using pre-computed scores. > > ```python > ner = EntityRecognizer(nlp.vocab) -> scores = ner.predict([doc1, doc2]) -> ner.set_annotations([doc1, doc2], scores) +> scores, tensors = ner.predict([doc1, doc2]) +> ner.set_annotations([doc1, doc2], scores, tensors) > ``` | Name | Type | Description | | -------- | -------- | ---------------------------------------------------------- | | `docs` | iterable | The documents to modify. | | `scores` | - | The scores to set, produced by `EntityRecognizer.predict`. | +| `tensors`| iterable | The token representations used to predict the scores. | ## EntityRecognizer.update {#update tag="method"} @@ -210,13 +211,13 @@ Modify the pipe's model, to use the given parameter values. 
> > ```python > ner = EntityRecognizer(nlp.vocab) -> with ner.use_params(): +> with ner.use_params(optimizer.averages): > ner.to_disk("/best_model") > ``` | Name | Type | Description | | -------- | ---- | ---------------------------------------------------------------------------------------------------------- | -| `params` | - | The parameter values to use in the model. At the end of the context, the original parameters are restored. | +| `params` | dict | The parameter values to use in the model. At the end of the context, the original parameters are restored. | ## EntityRecognizer.add_label {#add_label tag="method"} diff --git a/website/docs/api/goldparse.md b/website/docs/api/goldparse.md index 5a2d8a110..db7d07795 100644 --- a/website/docs/api/goldparse.md +++ b/website/docs/api/goldparse.md @@ -23,6 +23,7 @@ gradient for those labels will be zero. | `deps` | iterable | A sequence of strings, representing the syntactic relation types. | | `entities` | iterable | A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. If BILUO tag strings, you can specify missing values by setting the tag to None. | | `cats` | dict | Labels for text classification. Each key in the dictionary may be a string or an int, or a `(start_char, end_char, label)` tuple, indicating that the label is applied to only part of the document (usually a sentence). | +| `links` | dict | Labels for entity linking. A dict with `(start_char, end_char)` keys, and the values being dicts with `kb_id:value` entries, representing external KB IDs mapped to either 1.0 (positive) or 0.0 (negative). | | **RETURNS** | `GoldParse` | The newly constructed object. | ## GoldParse.\_\_len\_\_ {#len tag="method"} @@ -43,16 +44,17 @@ Whether the provided syntactic annotations form a projective dependency tree. ## Attributes {#attributes} -| Name | Type | Description | -| --------------------------------- | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `words` | list | The words. | -| `tags` | list | The part-of-speech tag annotations. | -| `heads` | list | The syntactic head annotations. | -| `labels` | list | The syntactic relation-type annotations. | -| `ner` | list | The named entity annotations as BILUO tags. | -| `cand_to_gold` | list | The alignment from candidate tokenization to gold tokenization. | -| `gold_to_cand` | list | The alignment from gold tokenization to candidate tokenization. | -| `cats` <Tag variant="new">2</Tag> | list | Entries in the list should be either a label, or a `(start, end, label)` triple. The tuple form is used for categories applied to spans of the document. | +| Name | Type | Description | +| ------------------------------------ | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `words` | list | The words. | +| `tags` | list | The part-of-speech tag annotations. | +| `heads` | list | The syntactic head annotations. | +| `labels` | list | The syntactic relation-type annotations. | +| `ner` | list | The named entity annotations as BILUO tags. | +| `cand_to_gold` | list | The alignment from candidate tokenization to gold tokenization. | +| `gold_to_cand` | list | The alignment from gold tokenization to candidate tokenization. 
| +| `cats` <Tag variant="new">2</Tag> | list | Entries in the list should be either a label, or a `(start, end, label)` triple. The tuple form is used for categories applied to spans of the document. | +| `links` <Tag variant="new">2.2</Tag> | dict | Keys in the dictionary are `(start_char, end_char)` triples, and the values are dictionaries with `kb_id:value` entries. | ## Utilities {#util} diff --git a/website/docs/api/kb.md b/website/docs/api/kb.md new file mode 100644 index 000000000..639ababb6 --- /dev/null +++ b/website/docs/api/kb.md @@ -0,0 +1,268 @@ +--- +title: KnowledgeBase +teaser: A storage class for entities and aliases of a specific knowledge base (ontology) +tag: class +source: spacy/kb.pyx +new: 2.2 +--- + +The `KnowledgeBase` object provides a method to generate [`Candidate`](/api/kb/#candidate_init) +objects, which are plausible external identifiers given a certain textual mention. +Each such `Candidate` holds information from the relevant KB entities, +such as its frequency in text and possible aliases. +Each entity in the knowledge base also has a pre-trained entity vector of a fixed size. + +## KnowledgeBase.\_\_init\_\_ {#init tag="method"} + +Create the knowledge base. + +> #### Example +> +> ```python +> from spacy.kb import KnowledgeBase +> vocab = nlp.vocab +> kb = KnowledgeBase(vocab=vocab, entity_vector_length=64) +> ``` + +| Name | Type | Description | +| ----------------------- | ---------------- | ----------------------------------------- | +| `vocab` | `Vocab` | A `Vocab` object. | +| `entity_vector_length` | int | Length of the fixed-size entity vectors. | +| **RETURNS** | `KnowledgeBase` | The newly constructed object. | + + +## KnowledgeBase.entity_vector_length {#entity_vector_length tag="property"} + +The length of the fixed-size entity vectors in the knowledge base. + +| Name | Type | Description | +| ----------- | ---- | ----------------------------------------- | +| **RETURNS** | int | Length of the fixed-size entity vectors. | + +## KnowledgeBase.add_entity {#add_entity tag="method"} + +Add an entity to the knowledge base, specifying its corpus frequency +and entity vector, which should be of length [`entity_vector_length`](/api/kb#entity_vector_length). + +> #### Example +> +> ```python +> kb.add_entity(entity="Q42", freq=32, entity_vector=vector1) +> kb.add_entity(entity="Q463035", freq=111, entity_vector=vector2) +> ``` + +| Name | Type | Description | +| --------------- | ------------- | ------------------------------------------------- | +| `entity` | unicode | The unique entity identifier | +| `freq` | float | The frequency of the entity in a typical corpus | +| `entity_vector` | vector | The pre-trained vector of the entity | + +## KnowledgeBase.set_entities {#set_entities tag="method"} + +Define the full list of entities in the knowledge base, specifying the corpus frequency +and entity vector for each entity. + +> #### Example +> +> ```python +> kb.set_entities(entity_list=["Q42", "Q463035"], freq_list=[32, 111], vector_list=[vector1, vector2]) +> ``` + +| Name | Type | Description | +| ------------- | ------------- | ------------------------------------------------- | +| `entity_list` | iterable | List of unique entity identifiers | +| `freq_list` | iterable | List of entity frequencies | +| `vector_list` | iterable | List of entity vectors | + +## KnowledgeBase.add_alias {#add_alias tag="method"} + +Add an alias or mention to the knowledge base, specifying its potential KB identifiers +and their prior probabilities. 
The entity identifiers should refer to entities previously +added with [`add_entity`](/api/kb#add_entity) or [`set_entities`](/api/kb#set_entities). +The sum of the prior probabilities should not exceed 1. + +> #### Example +> +> ```python +> kb.add_alias(alias="Douglas", entities=["Q42", "Q463035"], probabilities=[0.6, 0.3]) +> ``` + +| Name | Type | Description | +| -------------- | ------------- | -------------------------------------------------- | +| `alias` | unicode | The textual mention or alias | +| `entities` | iterable | The potential entities that the alias may refer to | +| `probabilities`| iterable | The prior probabilities of each entity | + +## KnowledgeBase.\_\_len\_\_ {#len tag="method"} + +Get the total number of entities in the knowledge base. + +> #### Example +> +> ```python +> total_entities = len(kb) +> ``` + +| Name | Type | Description | +| ----------- | ---- | --------------------------------------------- | +| **RETURNS** | int | The number of entities in the knowledge base. | + +## KnowledgeBase.get_entity_strings {#get_entity_strings tag="method"} + +Get a list of all entity IDs in the knowledge base. + +> #### Example +> +> ```python +> all_entities = kb.get_entity_strings() +> ``` + +| Name | Type | Description | +| ----------- | ---- | --------------------------------------------- | +| **RETURNS** | list | The list of entities in the knowledge base. | + +## KnowledgeBase.get_size_aliases {#get_size_aliases tag="method"} + +Get the total number of aliases in the knowledge base. + +> #### Example +> +> ```python +> total_aliases = kb.get_size_aliases() +> ``` + +| Name | Type | Description | +| ----------- | ---- | --------------------------------------------- | +| **RETURNS** | int | The number of aliases in the knowledge base. | + +## KnowledgeBase.get_alias_strings {#get_alias_strings tag="method"} + +Get a list of all aliases in the knowledge base. + +> #### Example +> +> ```python +> all_aliases = kb.get_alias_strings() +> ``` + +| Name | Type | Description | +| ----------- | ---- | --------------------------------------------- | +| **RETURNS** | list | The list of aliases in the knowledge base. | + +## KnowledgeBase.get_candidates {#get_candidates tag="method"} + +Given a certain textual mention as input, retrieve a list of candidate entities +of type [`Candidate`](/api/kb/#candidate_init). + +> #### Example +> +> ```python +> candidates = kb.get_candidates("Douglas") +> ``` + +| Name | Type | Description | +| ------------- | ------------- | -------------------------------------------------- | +| `alias` | unicode | The textual mention or alias | +| **RETURNS** | iterable | The list of relevant `Candidate` objects | + +## KnowledgeBase.get_vector {#get_vector tag="method"} + +Given a certain entity ID, retrieve its pre-trained entity vector. + +> #### Example +> +> ```python +> vector = kb.get_vector("Q42") +> ``` + +| Name | Type | Description | +| ------------- | ------------- | -------------------------------------------------- | +| `entity` | unicode | The entity ID | +| **RETURNS** | vector | The entity vector | + +## KnowledgeBase.get_prior_prob {#get_prior_prob tag="method"} + +Given a certain entity ID and a certain textual mention, retrieve +the prior probability of the fact that the mention links to the entity ID. 
+ +> #### Example +> +> ```python +> probability = kb.get_prior_prob("Q42", "Douglas") +> ``` + +| Name | Type | Description | +| ------------- | ------------- | --------------------------------------------------------------- | +| `entity` | unicode | The entity ID | +| `alias` | unicode | The textual mention or alias | +| **RETURNS** | float | The prior probability of the `alias` referring to the `entity` | + +## KnowledgeBase.dump {#dump tag="method"} + +Save the current state of the knowledge base to a directory. + +> #### Example +> +> ```python +> kb.dump(loc) +> ``` + +| Name | Type | Description | +| ------------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------ | +| `loc` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | + +## KnowledgeBase.load_bulk {#load_bulk tag="method"} + +Restore the state of the knowledge base from a given directory. Note that the [`Vocab`](/api/vocab) +should also be the same as the one used to create the KB. + +> #### Example +> +> ```python +> from spacy.kb import KnowledgeBase +> from spacy.vocab import Vocab +> vocab = Vocab().from_disk("/path/to/vocab") +> kb = KnowledgeBase(vocab=vocab, entity_vector_length=64) +> kb.load_bulk("/path/to/kb") +> ``` + + +| Name | Type | Description | +| ----------- | ---------------- | ----------------------------------------------------------------------------------------- | +| `loc` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| **RETURNS** | `KnowledgeBase` | The modified `KnowledgeBase` object. | + + +## Candidate.\_\_init\_\_ {#candidate_init tag="method"} + +Construct a `Candidate` object. Usually this constructor is not called directly, +but instead these objects are returned by the [`get_candidates`](/api/kb#get_candidates) method +of a `KnowledgeBase`. + +> #### Example +> +> ```python +> from spacy.kb import Candidate +> candidate = Candidate(kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob) +> ``` + +| Name | Type | Description | +| ------------- | --------------- | -------------------------------------------------------------- | +| `kb` | `KnowledgeBase` | The knowledge base that defined this candidate. | +| `entity_hash` | int | The hash of the entity's KB ID. | +| `entity_freq` | float | The entity frequency as recorded in the KB. | +| `alias_hash` | int | The hash of the textual mention or alias. | +| `prior_prob` | float | The prior probability of the `alias` referring to the `entity` | +| **RETURNS** | `Candidate` | The newly constructed object. 
| + +## Candidate attributes {#candidate_attributes} + +| Name | Type | Description | +| ---------------------- | ------------ | ------------------------------------------------------------------ | +| `entity` | int | The entity's unique KB identifier | +| `entity_` | unicode | The entity's unique KB identifier | +| `alias` | int | The alias or textual mention | +| `alias_` | unicode | The alias or textual mention | +| `prior_prob` | long | The prior probability of the `alias` referring to the `entity` | +| `entity_freq` | long | The frequency of the entity in a typical corpus | +| `entity_vector` | vector | The pre-trained vector of the entity | diff --git a/website/docs/api/span.md b/website/docs/api/span.md index 0af305b37..79be81ef8 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -18,14 +18,15 @@ Create a Span object from the slice `doc[start : end]`. > assert [t.text for t in span] == [u"it", u"back", u"!"] > ``` -| Name | Type | Description | -| ----------- | ---------------------------------------- | ----------------------------------------------------------------------------------------------------------- | -| `doc` | `Doc` | The parent document. | -| `start` | int | The index of the first token of the span. | -| `end` | int | The index of the first token after the span. | -| `label` | int / unicode | A label to attach to the span, e.g. for named entities. As of v2.1, the label can also be a unicode string. | -| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | -| **RETURNS** | `Span` | The newly constructed object. | +| Name | Type | Description | +| ----------- | ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------| +| `doc` | `Doc` | The parent document. | +| `start` | int | The index of the first token of the span. | +| `end` | int | The index of the first token after the span. | +| `label` | int / unicode | A label to attach to the span, e.g. for named entities. As of v2.1, the label can also be a unicode string. | +| `kb_id` | int / unicode | A knowledge base ID to attach to the span, e.g. for named entities. The ID can be an integer or a unicode string. | +| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | +| **RETURNS** | `Span` | The newly constructed object. | ## Span.\_\_getitem\_\_ {#getitem tag="method"} @@ -477,9 +478,11 @@ The L2 norm of the span's vector representation. | `text_with_ws` | unicode | The text content of the span with a trailing whitespace character if the last token has one. | | `orth` | int | ID of the verbatim text content. | | `orth_` | unicode | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. | -| `label` | int | The span's label. | +| `label` | int | The hash value of the span's label. | | `label_` | unicode | The span's label. | | `lemma_` | unicode | The span's lemma. | +| `kb_id` | int | The hash value of the knowledge base ID referred to by the span. | +| `kb_id_` | unicode | The knowledge base ID referred to by the span. | | `ent_id` | int | The hash value of the named entity the token is an instance of. | | `ent_id_` | unicode | The string ID of the named entity the token is an instance of. | | `sentiment` | float | A scalar value indicating the positivity or negativity of the span. 
| diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index a1d921b41..fc6fc67a6 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -97,7 +97,7 @@ Apply the pipeline's model to a batch of docs, without modifying them. > > ```python > tagger = Tagger(nlp.vocab) -> scores = tagger.predict([doc1, doc2]) +> scores, tensors = tagger.predict([doc1, doc2]) > ``` | Name | Type | Description | @@ -113,14 +113,16 @@ Modify a batch of documents, using pre-computed scores. > > ```python > tagger = Tagger(nlp.vocab) -> scores = tagger.predict([doc1, doc2]) -> tagger.set_annotations([doc1, doc2], scores) +> scores, tensors = tagger.predict([doc1, doc2]) +> tagger.set_annotations([doc1, doc2], scores, tensors) > ``` -| Name | Type | Description | -| -------- | -------- | ------------------------------------------------ | -| `docs` | iterable | The documents to modify. | -| `scores` | - | The scores to set, produced by `Tagger.predict`. | +| Name | Type | Description | +| -------- | -------- | ----------------------------------------------------- | +| `docs` | iterable | The documents to modify. | +| `scores` | - | The scores to set, produced by `Tagger.predict`. | +| `tensors`| iterable | The token representations used to predict the scores. | + ## Tagger.update {#update tag="method"} diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index 310122b9c..f7158541b 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -116,7 +116,7 @@ Apply the pipeline's model to a batch of docs, without modifying them. > > ```python > textcat = TextCategorizer(nlp.vocab) -> scores = textcat.predict([doc1, doc2]) +> scores, tensors = textcat.predict([doc1, doc2]) > ``` | Name | Type | Description | @@ -132,14 +132,15 @@ Modify a batch of documents, using pre-computed scores. > > ```python > textcat = TextCategorizer(nlp.vocab) -> scores = textcat.predict([doc1, doc2]) -> textcat.set_annotations([doc1, doc2], scores) +> scores, tensors = textcat.predict([doc1, doc2]) +> textcat.set_annotations([doc1, doc2], scores, tensors) > ``` | Name | Type | Description | | -------- | -------- | --------------------------------------------------------- | | `docs` | iterable | The documents to modify. | | `scores` | - | The scores to set, produced by `TextCategorizer.predict`. | +| `tensors`| iterable | The token representations used to predict the scores. | ## TextCategorizer.update {#update tag="method"} @@ -227,13 +228,13 @@ Modify the pipe's model, to use the given parameter values. > > ```python > textcat = TextCategorizer(nlp.vocab) -> with textcat.use_params(): +> with textcat.use_params(optimizer.averages): > textcat.to_disk("/best_model") > ``` | Name | Type | Description | | -------- | ---- | ---------------------------------------------------------------------------------------------------------- | -| `params` | - | The parameter values to use in the model. At the end of the context, the original parameters are restored. | +| `params` | dict | The parameter values to use in the model. At the end of the context, the original parameters are restored. | ## TextCategorizer.add_label {#add_label tag="method"} diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 24816b401..8da13454b 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -425,8 +425,10 @@ The L2 norm of the token's vector representation. | `i` | int | The index of the token within the parent document. 
| | `ent_type` | int | Named entity type. | | `ent_type_` | unicode | Named entity type. | -| `ent_iob` | int | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. | | +| `ent_iob` | int | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. | | `ent_iob_` | unicode | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. | +| `ent_kb_id` <Tag variant="new">2.2</Tag> | int | Knowledge base ID that refers to the named entity this token is a part of, if any. | +| `ent_kb_id_` <Tag variant="new">2.2</Tag> | unicode | Knowledge base ID that refers to the named entity this token is a part of, if any. | | `ent_id` | int | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | | `ent_id_` | unicode | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | | `lemma` | int | Base form of the token, with no inflectional suffixes. | diff --git a/website/docs/usage/101/_named-entities.md b/website/docs/usage/101/_named-entities.md index 54db6dbe8..a282ec370 100644 --- a/website/docs/usage/101/_named-entities.md +++ b/website/docs/usage/101/_named-entities.md @@ -1,5 +1,5 @@ A named entity is a "real-world object" that's assigned a name – for example, a -person, a country, a product or a book title. spaCy can **recognize** +person, a country, a product or a book title. spaCy can **recognize** [various types](/api/annotation#named-entities) of named entities in a document, by asking the model for a **prediction**. Because models are statistical and strongly depend on the examples they were trained on, this doesn't always work @@ -21,12 +21,12 @@ for ent in doc.ents: > - **Text:** The original entity text. > - **Start:** Index of start of entity in the `Doc`. > - **End:** Index of end of entity in the `Doc`. -> - **LabeL:** Entity label, i.e. type. +> - **Label:** Entity label, i.e. type. -| Text | Start | End | Label | Description | -| ----------- | :---: | :-: | ------- | ---------------------------------------------------- | -| Apple | 0 | 5 | `ORG` | Companies, agencies, institutions. | -| U.K. | 27 | 31 | `GPE` | Geopolitical entity, i.e. countries, cities, states. | +| Text | Start | End | Label | Description | +| ----------- | :---: | :-: | ------- | ---------------------------------------------------- | +| Apple | 0 | 5 | `ORG` | Companies, agencies, institutions. | +| U.K. | 27 | 31 | `GPE` | Geopolitical entity, i.e. countries, cities, states. | | \$1 billion | 44 | 54 | `MONEY` | Monetary values, including unit. | Using spaCy's built-in [displaCy visualizer](/usage/visualizers), here's what diff --git a/website/docs/usage/101/_pipelines.md b/website/docs/usage/101/_pipelines.md index 68308a381..d33ea45fd 100644 --- a/website/docs/usage/101/_pipelines.md +++ b/website/docs/usage/101/_pipelines.md @@ -12,14 +12,14 @@ passed on to the next component. > - **Creates:** Objects, attributes and properties modified and set by the > component. 
-| Name | Component | Creates | Description | -| ------------- | ------------------------------------------------------------------ | ----------------------------------------------------------- | ------------------------------------------------ | -| **tokenizer** | [`Tokenizer`](/api/tokenizer) | `Doc` | Segment text into tokens. | -| **tagger** | [`Tagger`](/api/tagger) | `Doc[i].tag` | Assign part-of-speech tags. | -| **parser** | [`DependencyParser`](/api/dependencyparser) | `Doc[i].head`, `Doc[i].dep`, `Doc.sents`, `Doc.noun_chunks` | Assign dependency labels. | -| **ner** | [`EntityRecognizer`](/api/entityrecognizer) | `Doc.ents`, `Doc[i].ent_iob`, `Doc[i].ent_type` | Detect and label named entities. | -| **textcat** | [`TextCategorizer`](/api/textcategorizer) | `Doc.cats` | Assign document labels. | -| ... | [custom components](/usage/processing-pipelines#custom-components) | `Doc._.xxx`, `Token._.xxx`, `Span._.xxx` | Assign custom attributes, methods or properties. | +| Name | Component | Creates | Description | +| ----------------- | ------------------------------------------------------------------ | ----------------------------------------------------------- | ------------------------------------------------ | +| **tokenizer** | [`Tokenizer`](/api/tokenizer) | `Doc` | Segment text into tokens. | +| **tagger** | [`Tagger`](/api/tagger) | `Doc[i].tag` | Assign part-of-speech tags. | +| **parser** | [`DependencyParser`](/api/dependencyparser) | `Doc[i].head`, `Doc[i].dep`, `Doc.sents`, `Doc.noun_chunks` | Assign dependency labels. | +| **ner** | [`EntityRecognizer`](/api/entityrecognizer) | `Doc.ents`, `Doc[i].ent_iob`, `Doc[i].ent_type` | Detect and label named entities. | +| **textcat** | [`TextCategorizer`](/api/textcategorizer) | `Doc.cats` | Assign document labels. | +| ... | [custom components](/usage/processing-pipelines#custom-components) | `Doc._.xxx`, `Token._.xxx`, `Span._.xxx` | Assign custom attributes, methods or properties. | The processing pipeline always **depends on the statistical model** and its capabilities. For example, a pipeline can only include an entity recognizer @@ -49,6 +49,10 @@ them, its dependency predictions may be different. Similarly, it matters if you add the [`EntityRuler`](/api/entityruler) before or after the statistical entity recognizer: if it's added before, the entity recognizer will take the existing entities into account when making predictions. +The [`EntityLinker`](/api/entitylinker), which resolves named entities to +knowledge base IDs, should be preceded by +a pipeline component that recognizes entities such as the +[`EntityRecognizer`](/api/entityrecognizer). </Accordion> diff --git a/website/docs/usage/101/_training.md b/website/docs/usage/101/_training.md index 61e047748..baf3a1891 100644 --- a/website/docs/usage/101/_training.md +++ b/website/docs/usage/101/_training.md @@ -20,7 +20,7 @@ difference, the more significant the gradient and the updates to our model.  When training a model, we don't just want it to memorize our examples – we want -it to come up with theory that can be **generalized across other examples**. +it to come up with a theory that can be **generalized across other examples**. After all, we don't just want the model to learn that this one instance of "Amazon" right here is a company – we want it to learn that "Amazon", in contexts _like this_, is most likely a company. 
That's why the training data diff --git a/website/docs/usage/facts-figures.md b/website/docs/usage/facts-figures.md index a3683b668..40b39d871 100644 --- a/website/docs/usage/facts-figures.md +++ b/website/docs/usage/facts-figures.md @@ -26,7 +26,7 @@ Here's a quick comparison of the functionalities offered by spaCy, | Sentence segmentation | ✅ | ✅ | ✅ | | Dependency parsing | ✅ | ❌ | ✅ | | Entity recognition | ✅ | ✅ | ✅ | -| Entity linking | ❌ | ❌ | ❌ | +| Entity linking | ✅ | ❌ | ❌ | | Coreference resolution | ❌ | ❌ | ✅ | ### When should I use what? {#comparison-usage} diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 66ad816f5..fc1f159ce 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -576,6 +576,54 @@ import DisplacyEntHtml from 'images/displacy-ent2.html' <Iframe title="displaCy visualizer for entities" html={DisplacyEntHtml} height={180} /> +## Entity Linking {#entity-linking} + +To ground the named entities into the "real-world", +spaCy provides functionality to perform entity linking, which resolves a textual entity +to a unique identifier from a knowledge base (KB). + +The default model assigns WikiData identifiers, but you can create your own +[`KnowledgeBase`](/api/kb) and [train a new Entity Linking model](/usage/training#entity-linker) using +that custom-made KB. + + +### Accessing entity identifiers {#accessing} + +The annotated KB identifier is accessible as either a hash value +or as a string, using the attributes +`ent.kb_id` and `ent.kb_id_` of a [`Span`](/api/span) object, +or the `ent_kb_id` and `ent_kb_id_` attributes of a [`Token`](/api/token) object. + + +```python +### {executable="true"} +import spacy + +nlp = spacy.load("my_custom_el_model") +doc = nlp(u"Ada Lovelace was born in London") + +# document level +ents = [(e.text, e.label_, e.kb_id_) for e in doc.ents] +print(ents) # [('Ada Lovelace', 'PERSON', 'Q7259'), ('London', 'GPE', 'Q84')] + +# token level +ent_ada_0 = [doc[0].text, doc[0].ent_type_, doc[0].ent_kb_id_] +ent_ada_1 = [doc[1].text, doc[1].ent_type_, doc[1].ent_kb_id_] +ent_london_5 = [doc[5].text, doc[5].ent_type_, doc[5].ent_kb_id_] +print(ent_ada_0) # ['Ada', 'PERSON', 'Q7259'] +print(ent_ada_1) # ['Lovelace', 'PERSON', 'Q7259'] +print(ent_london_5) # ['London', 'GPE', 'Q84'] +``` + +| Text | ent_type\_ | ent_kb_id\_ | +| --------- | ---------- | ------------ | +| Ada | `"PERSON"` | `"Q7259"` | +| Lovelace | `"PERSON"` | `"Q7259"` | +| was | `""` | `""` | +| born | `""` | `""` | +| in | `""` | `""` | +| London | `"GPE"` | `"Q84"` | + ## Tokenization {#tokenization} Tokenization is the task of splitting a text into meaningful segments, called diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index f3c59da7b..51a57d7f5 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -213,6 +213,7 @@ require them in the pipeline settings in your model's `meta.json`. | `tagger` | [`Tagger`](/api/tagger) | Assign part-of-speech-tags. | | `parser` | [`DependencyParser`](/api/dependencyparser) | Assign dependency labels. | | `ner` | [`EntityRecognizer`](/api/entityrecognizer) | Assign named entities. | +| `entity_linker` | [`EntityLinker`](/api/entitylinker) | Assign knowledge base IDs to named entities. Should be added after the entity recognizer. | | `textcat` | [`TextCategorizer`](/api/textcategorizer) | Assign text categories. 
|
 | `entity_ruler` | [`EntityRuler`](/api/entityruler) | Assign named entities based on pattern rules. |
 | `sentencizer` | [`Sentencizer`](/api/sentencizer) | Add rule-based sentence segmentation without the dependency parse. |
diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md
index 03feb03b1..081b6d896 100644
--- a/website/docs/usage/spacy-101.md
+++ b/website/docs/usage/spacy-101.md
@@ -122,6 +122,7 @@ related to more general machine learning functionality.
 | **Lemmatization** | Assigning the base forms of words. For example, the lemma of "was" is "be", and the lemma of "rats" is "rat". |
 | **Sentence Boundary Detection** (SBD) | Finding and segmenting individual sentences. |
 | **Named Entity Recognition** (NER) | Labelling named "real-world" objects, like persons, companies or locations. |
+| **Entity Linking** (EL) | Disambiguating textual entities to unique identifiers in a Knowledge Base. |
 | **Similarity** | Comparing words, text spans and documents and how similar they are to each other. |
 | **Text Classification** | Assigning categories or labels to a whole document, or parts of a document. |
 | **Rule-based Matching** | Finding sequences of tokens based on their texts and linguistic annotations, similar to regular expressions. |
@@ -237,6 +238,15 @@ of a model, see the usage guides on
 
 </Infobox>
 
+<Infobox title="📖 Entity Linking">
+
+To learn more about entity linking in spaCy, and how to **train and update**
+the entity linker predictions, see the usage guides on
+[entity linking](/usage/linguistic-features#entity-linking) and
+[training the entity linker](/usage/training#entity-linker).
+
+</Infobox>
+
 ### Word vectors and similarity {#vectors-similarity model="vectors"}
 
 import Vectors101 from 'usage/101/\_vectors-similarity.md'
@@ -383,6 +393,75 @@ spaCy will also export the `Vocab` when you save a `Doc` or `nlp` object. This
 will give you the object and its encoded annotations, plus the "key" to decode
 it.
 
+## Knowledge Base {#kb}
+
+To support the entity linking task, spaCy stores external knowledge in a
+[`KnowledgeBase`](/api/kb). The knowledge base (KB) uses the `Vocab` to store its
+data efficiently.
+
+> - **Mention**: A textual occurrence of a named entity, e.g. 'Miss Lovelace'.
+> - **KB ID**: A unique identifier referring to a particular real-world concept, e.g. 'Q7259'.
+> - **Alias**: A plausible synonym or description for a certain KB ID, e.g. 'Ada Lovelace'.
+> - **Prior probability**: The probability of a certain mention resolving to a certain KB ID,
+prior to knowing anything about the context in which the mention is used.
+> - **Entity vector**: A pretrained word vector capturing the entity description.
+
+A knowledge base is created by first adding all entities to it. Next, for each
+potential mention or alias, a list of relevant KB IDs and their prior probabilities
+is added. The sum of these prior probabilities should never exceed 1 for any given alias.
+ + +```python +### {executable="true"} +import spacy +from spacy.kb import KnowledgeBase + +# load the model and create an empty KB +nlp = spacy.load('en_core_web_sm') +kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3) + +# adding entities +kb.add_entity(entity="Q1004791", freq=6, entity_vector=[0, 3, 5]) +kb.add_entity(entity="Q42", freq=342, entity_vector=[1, 9, -3]) +kb.add_entity(entity="Q5301561", freq=12, entity_vector=[-2, 4, 2]) + +# adding aliases +kb.add_alias(alias="Douglas", entities=["Q1004791", "Q42", "Q5301561"], probabilities=[0.6, 0.1, 0.2]) +kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[0.9]) + +print() +print("Number of entities in KB:",kb.get_size_entities()) # 3 +print("Number of aliases in KB:", kb.get_size_aliases()) # 2 +``` + +### Candidate generation + +Given a textual entity, the Knowledge Base can provide a list of plausible candidates or +entity identifiers. The [`EntityLinker`](/api/entitylinker) will take this list of candidates +as input, and disambiguate the mention to the most probable identifier, given the +document context. + +```python +### {executable="true"} +import spacy +from spacy.kb import KnowledgeBase + +nlp = spacy.load('en_core_web_sm') +kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3) + +# adding entities +kb.add_entity(entity="Q1004791", freq=6, entity_vector=[0, 3, 5]) +kb.add_entity(entity="Q42", freq=342, entity_vector=[1, 9, -3]) +kb.add_entity(entity="Q5301561", freq=12, entity_vector=[-2, 4, 2]) + +# adding aliases +kb.add_alias(alias="Douglas", entities=["Q1004791", "Q42", "Q5301561"], probabilities=[0.6, 0.1, 0.2]) + +candidates = kb.get_candidates("Douglas") +for c in candidates: + print(" ", c.entity_, c.prior_prob, c.entity_vector) +``` + ## Serialization {#serialization} import Serialization101 from 'usage/101/\_serialization.md' diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index b84bf4e12..dd5cd8530 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -10,8 +10,9 @@ menu: --- This guide describes how to train new statistical models for spaCy's -part-of-speech tagger, named entity recognizer and dependency parser. Once the -model is trained, you can then [save and load](/usage/saving-loading#models) it. +part-of-speech tagger, named entity recognizer, dependency parser, +text classifier and entity linker. Once the model is trained, +you can then [save and load](/usage/saving-loading#models) it. ## Training basics {#basics} @@ -221,9 +222,10 @@ of being dropped. > - [`begin_training()`](/api/language#begin_training): Start the training and > return an optimizer function to update the model's weights. Can take an -> optional function converting the training data to spaCy's training -> format. -[`update()`](/api/language#update): Update the model with the -> training example and gold data. -[`to_disk()`](/api/language#to_disk): Save +> optional function converting the training data to spaCy's training format. +> - [`update()`](/api/language#update): Update the model with the +> training example and gold data. +> - [`to_disk()`](/api/language#to_disk): Save > the updated model to a directory. ```python @@ -401,6 +403,77 @@ referred to as the "catastrophic forgetting" problem. 4. **Save** the trained model using [`nlp.to_disk`](/api/language#to_disk). 5. **Test** the model to make sure the new entity is recognized correctly. 
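
As a compressed, illustrative sketch of the save-and-test steps above – the model name, path and test sentence are placeholders, and in practice `nlp` would be the model you just updated:

```python
import spacy

nlp = spacy.load("en_core_web_sm")  # stand-in for the model you just updated

# Save the updated model to a directory (step 4)
nlp.to_disk("/path/to/model")

# Reload it and check that entities are recognized as expected (step 5)
nlp_reloaded = spacy.load("/path/to/model")
doc = nlp_reloaded("Some text mentioning the new entity type")
print([(ent.text, ent.label_) for ent in doc.ents])
```
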
+## Entity linking {#entity-linker}
+
+To train an entity linking model, you first need to define a knowledge base (KB).
+
+### Creating a knowledge base {#kb}
+
+A KB consists of a list of entities with unique identifiers. Each such entity
+has an entity vector that will be used to measure similarity with the context in
+which an entity is used. These vectors are pretrained and stored in the KB before
+the entity linking model is trained.
+
+The following example shows how to build a knowledge base from scratch,
+given a list of entities and potential aliases. The script further demonstrates
+how to pretrain and store the entity vectors. To run this example, the script
+needs access to a `vocab` instance or an `nlp` model with pretrained word embeddings.
+
+```python
+https://github.com/explosion/spaCy/tree/master/examples/training/pretrain_kb.py
+```
+
+#### Step by step guide {#step-by-step-kb}
+
+1. **Load the model** you want to start with, or create an **empty model** using
+   [`spacy.blank`](/api/top-level#spacy.blank) with the ID of your language and
+   a pre-defined [`vocab`](/api/vocab) object.
+2. **Pretrain the entity embeddings** by running the descriptions of the entities
+   through a simple encoder-decoder network. The current implementation requires
+   the `nlp` model to have access to pre-trained word embeddings, but a custom
+   implementation of this encoding step can also be used.
+3. **Construct the KB** by defining all entities with their pretrained vectors,
+   and all aliases with their prior probabilities.
+4. **Save** the KB using [`kb.dump`](/api/kb#dump).
+5. **Test** the KB to make sure the entities were added correctly.
+
+### Training an entity linking model {#entity-linker-model}
+
+This example shows how to create an entity linker pipe using a previously created
+knowledge base. The entity linker pipe is then trained with your own
+examples. To do so, you'll need to provide
+**example texts**, and the **character offsets** and **knowledge base identifiers**
+of each entity contained in the texts.
+
+```python
+https://github.com/explosion/spaCy/tree/master/examples/training/train_entity_linker.py
+```
+
+#### Step by step guide {#step-by-step-entity-linker}
+
+1. **Load the KB** you want to start with, and specify the path
+   to the `Vocab` object that was used to create this KB.
+   Then, create an **empty model** using
+   [`spacy.blank`](/api/top-level#spacy.blank) with the ID of your language.
+   Don't forget to add the KB to the entity linker,
+   and to add the entity linker to the pipeline.
+   In practical applications, you will want a more advanced pipeline that also
+   includes a component for [named entity recognition](/usage/training#ner).
+   If you're using a model with additional components, make sure to disable all other
+   pipeline components during training using
+   [`nlp.disable_pipes`](/api/language#disable_pipes). This way, you'll only be
+   training the entity linker.
+2. **Shuffle and loop over** the examples. For each example, **update the
+   model** by calling [`nlp.update`](/api/language#update), which steps through
+   the annotated examples of the input. For each combination of a mention in text and
+   a potential KB identifier, the model makes a **prediction** whether or not
+   this is the correct match. It then
+   consults the annotations to see whether it was right. If it was wrong, it
+   adjusts its weights so that the correct combination will score higher next time.
+3. **Save** the trained model using [`nlp.to_disk`](/api/language#to_disk).
+4. 
**Test** the model to make sure the entities in the training data are + recognized correctly. + ## Training the tagger and parser {#tagger-parser} ### Updating the Dependency Parser {#example-train-parser} diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 31083b091..3c4f09674 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -75,6 +75,7 @@ { "text": "Tagger", "url": "/api/tagger" }, { "text": "DependencyParser", "url": "/api/dependencyparser" }, { "text": "EntityRecognizer", "url": "/api/entityrecognizer" }, + { "text": "EntityLinker", "url": "/api/entitylinker" }, { "text": "TextCategorizer", "url": "/api/textcategorizer" }, { "text": "Matcher", "url": "/api/matcher" }, { "text": "PhraseMatcher", "url": "/api/phrasematcher" }, @@ -89,6 +90,7 @@ { "text": "Vocab", "url": "/api/vocab" }, { "text": "StringStore", "url": "/api/stringstore" }, { "text": "Vectors", "url": "/api/vectors" }, + { "text": "KnowledgeBase", "url": "/api/kb" }, { "text": "GoldParse", "url": "/api/goldparse" }, { "text": "GoldCorpus", "url": "/api/goldcorpus" }, { "text": "Scorer", "url": "/api/scorer" } From 7b59a919e6325037aff2aab7ceb6f509b1b8d38a Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Thu, 12 Sep 2019 12:52:06 +0200 Subject: [PATCH 135/207] Update entry points docs [ci skip] --- website/docs/images/displacy-ent-snek.html | 18 ++++ website/docs/usage/saving-loading.md | 96 +++++++++++++++++----- 2 files changed, 93 insertions(+), 21 deletions(-) create mode 100644 website/docs/images/displacy-ent-snek.html diff --git a/website/docs/images/displacy-ent-snek.html b/website/docs/images/displacy-ent-snek.html new file mode 100644 index 000000000..1e4920fb5 --- /dev/null +++ b/website/docs/images/displacy-ent-snek.html @@ -0,0 +1,18 @@ +<div + class="entities" + style="line-height: 2.5; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'; font-size: 16px" +> + 🌱🌿 <mark + class="entity" + style="background: #3dff74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone" +>🐍 <span +style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem" +>SNEK</span +></mark> ____ 🌳🌲 ____ <mark +class="entity" +style="background: #cfc5ff; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone" +>👨🌾 <span +style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem" +>HUMAN</span +></mark> 🏘️ +</div> diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index 81e90dcc7..b9e712882 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -238,13 +238,31 @@ custom components to spaCy automatically. ## Using entry points {#entry-points new="2.1"} +Entry points let you expose parts of a Python package you write to other Python +packages. This lets one application easily customize the behavior of another, by +exposing an entry point in its `setup.py`. For a quick and fun intro to entry +points in Python, check out +[this excellent blog post](https://amir.rachum.com/blog/2017/07/28/python-entry-points/). 
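
To make the mechanism more concrete, this is roughly what discovery looks like from the consuming side – plain `pkg_resources` from setuptools rather than any spaCy-specific API, with the group name used only as an example:

```python
import pkg_resources

# Iterate over the entry points that installed packages have registered
# under a given group name and load the objects they point to.
for entry_point in pkg_resources.iter_entry_points("spacy_factories"):
    factory = entry_point.load()  # the function or class the package exposes
    print(entry_point.name, factory)
```
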
+spaCy can load custom function from several different entry points to add +pipeline component factories, language classes and other settings. To make spaCy +use your entry points, your package needs to expose them and it needs to be +installed in the same environment – that's it. + +| Entry point | Description | +| ------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [`spacy_factories`](#entry-points-components) | Group of entry points for pipeline component factories to add to [`Language.factories`](/usage/processing-pipelines#custom-components-factories), keyed by component name. | +| [`spacy_languages`](#entry-points-languages) | Group of entry points for custom [`Language` subclasses](/usage/adding-languages), keyed by language shortcut. | +| [`spacy_displacy_colors`](#entry-points-displacy) <Tag variant="new">2.2</Tag> | Group of entry points of custom label colors for the [displaCy visualizer](/usage/visualizers#ent). The key name doesn't matter, but it should point to a dict of labels and color values. Useful for custom models that predict different entity types. | + +### Custom components via entry points {#entry-points-components} + When you load a model, spaCy will generally use the model's `meta.json` to set up the language class and construct the pipeline. The pipeline is specified as a list of strings, e.g. `"pipeline": ["tagger", "paser", "ner"]`. For each of those strings, spaCy will call `nlp.create_pipe` and look up the name in the -[built-in factories](#custom-components-factories). If your model wanted to -specify its own custom components, you usually have to write to -`Language.factories` _before_ loading the model. +[built-in factories](/usage/processing-pipelines#custom-components-factories). +If your model wanted to specify its own custom components, you usually have to +write to `Language.factories` _before_ loading the model. ```python pipe = nlp.create_pipe("custom_component") # fails 👎 @@ -260,13 +278,11 @@ added to the built-in factories when the `Language` class is initialized. If a package in the same environment exposes spaCy entry points, all of this happens automatically and no further user action is required. -#### Custom components via entry points {#entry-points-components} - -For a quick and fun intro to entry points in Python, I recommend -[this excellent blog post](https://amir.rachum.com/blog/2017/07/28/python-entry-points/). -To stick with the theme of the post, consider the following custom spaCy -extension which is initialized with the shared `nlp` object and will print a -snake when it's called as a pipeline component. +To stick with the theme of +[this entry points blog post](https://amir.rachum.com/blog/2017/07/28/python-entry-points/), +consider the following custom spaCy extension which is initialized with the +shared `nlp` object and will print a snake when it's called as a pipeline +component. 
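
The component code itself falls outside this hunk; the following is only a minimal sketch of what such a factory could look like (the snake art is abbreviated and the details may differ from the actual example):

```python
### snek.py (illustrative sketch)
snek = """
    --..,_
        `'-.  🐍
"""

class SnekFactory(object):
    """Toy pipeline component: the factory is called with the shared nlp
    object, and the returned object is then called on each Doc."""

    def __init__(self, nlp, **cfg):
        self.nlp = nlp

    def __call__(self, doc):
        print(snek)  # print the snake...
        return doc   # ...and return the Doc unchanged
```
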
> #### Package directory structure > @@ -304,15 +320,13 @@ entry to the factories, you can now expose it in your `setup.py` via the `entry_points` dictionary: ```python -### setup.py {highlight="5-8"} +### setup.py {highlight="5-7"} from setuptools import setup setup( name="snek", entry_points={ - "spacy_factories": [ - "snek = snek:SnekFactory" - ] + "spacy_factories": ["snek = snek:SnekFactory"] } ) ``` @@ -410,7 +424,7 @@ The above example will serialize the current snake in a `snek.txt` in the model data directory. When a model using the `snek` component is loaded, it will open the `snek.txt` and make it available to the component. -#### Custom language classes via entry points {#entry-points-components} +### Custom language classes via entry points {#entry-points-languages} To stay with the theme of the previous example and [this blog post on entry points](https://amir.rachum.com/blog/2017/07/28/python-entry-points/), @@ -446,12 +460,8 @@ from setuptools import setup setup( name="snek", entry_points={ - "spacy_factories": [ - "snek = snek:SnekFactory" - ] -+ "spacy_languages": [ -+ "sk = snek:SnekLanguage" -+ ] + "spacy_factories": ["snek = snek:SnekFactory"], ++ "spacy_languages": ["snk = snek:SnekLanguage"] } ) ``` @@ -481,6 +491,50 @@ SnekLanguage = get_lang_class("snk") nlp = SnekLanguage() ``` +### Custom displaCy colors via entry points {#entry-points-displacy} + +If you're training a named entity recognition model for a custom domain, you may +end up training different labels that don't have pre-defined colors in the +[`displacy` visualizer](/usage/visualizers#ent). The `spacy_displacy_colors` +entry point lets you define a dictionary of entity labels mapped to their color +values. It's added to the existing pre-defined colors and can also overwrite +existing values. + +> #### Domain-specific NER labels +> +> Good examples of models with domain-specific label schemes are +> [scispaCy](/universe/project/scispacy) and +> [Blackstone](/universe/project/blackstone). + +```python +### snek.py +displacy_colors = {"SNEK": "#3dff74", "HUMAN": "#cfc5ff"} +``` + +Given the above colors, the entry point can be defined as follows. Entry points +need to have a name, so we use the key `colors`. However, the name doesn't +matter and whatever is defined in the entry point group will be used. + +```diff +### setup.py +from setuptools import setup + +setup( + name="snek", + entry_points={ ++ "spacy_displacy_colors": ["colors = snek:displacy_colors"] + } +) +``` + +After installing the package, the the custom colors will be used when +visualizing text with `displacy`. Whenever the label `SNEK` is assigned, it +will be displayed in `#3dff74`. 
+ +import DisplaCyEntSnekHtml from 'images/displacy-ent-snek.html' + +<Iframe title="displaCy visualization of entities" html={DisplaCyEntSnekHtml} height={100} /> + ## Saving, loading and distributing models {#models} After training your model, you'll usually want to save its state, and load it From e7c20ad1d272889134aa39016b1aaf1ab2096d3e Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Thu, 12 Sep 2019 12:59:10 +0200 Subject: [PATCH 136/207] Update colors entry points docs [ci skip] --- website/docs/usage/saving-loading.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index b9e712882..1ad4824fa 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -491,13 +491,13 @@ SnekLanguage = get_lang_class("snk") nlp = SnekLanguage() ``` -### Custom displaCy colors via entry points {#entry-points-displacy} +### Custom displaCy colors via entry points {#entry-points-displacy new="2.2"} If you're training a named entity recognition model for a custom domain, you may end up training different labels that don't have pre-defined colors in the [`displacy` visualizer](/usage/visualizers#ent). The `spacy_displacy_colors` entry point lets you define a dictionary of entity labels mapped to their color -values. It's added to the existing pre-defined colors and can also overwrite +values. It's added to the pre-defined colors and can also overwrite existing values. > #### Domain-specific NER labels From cb41a33d14fc3974dbfd5a60d9a92ed970d25792 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Thu, 12 Sep 2019 12:59:20 +0200 Subject: [PATCH 137/207] Update displaCy API docs [ci skip] --- website/docs/api/top-level.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 9d166a5c5..e9bf48869 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -266,11 +266,12 @@ If a setting is not present in the options, the default value will be used. | -------- | ---- | ------------------------------------------------------------------------------------- | ------- | | `ents` | list | Entity types to highlight (`None` for all types). | `None` | | `colors` | dict | Color overrides. Entity types in uppercase should be mapped to color names or values. | `{}` | +| `template` <Tag variant="new">2.2</Tag> | unicode | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. | see [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) | By default, displaCy comes with colors for all [entity types supported by spaCy](/api/annotation#named-entities). If you're using custom entity types, you can use the `colors` setting to add your own -colors for them. +colors for them. Your application or model package can also expose a [`spacy_displacy_colors` entry point](/usage/saving-loading#entry-points-displacy) to add custom labels and their colors automatically. 
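
For comparison, the same kind of override can also be passed manually per call – the labels and hex values below are just examples, and a small English model is used as a stand-in:

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Ada Lovelace was born in London")

# Restrict the highlighted entity types and override their colors for this call
options = {"ents": ["PERSON", "GPE"], "colors": {"PERSON": "#3dff74", "GPE": "#cfc5ff"}}
html = displacy.render(doc, style="ent", options=options)
```
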
## Utility functions {#util source="spacy/util.py"} From 625ce2db8e6edf132f5db2cf28a0737c1bdb8917 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Thu, 12 Sep 2019 13:03:38 +0200 Subject: [PATCH 138/207] Update Language docs [ci skip] --- spacy/language.py | 3 ++- website/docs/api/language.md | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index e6322e011..7292e3bf6 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -257,7 +257,8 @@ class Language(object): @property def pipe_labels(self): - """Get the labels set by the pipeline components, if available. + """Get the labels set by the pipeline components, if available (if + the component exposes a labels property). RETURNS (dict): Labels keyed by component name. """ diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 3fcdeb195..913d38ced 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -447,9 +447,10 @@ per component. | --------------------------------------- | ------------------ | ----------------------------------------------------------------------------------------------- | | `vocab` | `Vocab` | A container for the lexical types. | | `tokenizer` | `Tokenizer` | The tokenizer. | -| `make_doc` | `lambda text: Doc` | Create a `Doc` object from unicode text. | +| `make_doc` | `callable` | Callable that takes a unicode text and returns a `Doc`. | | `pipeline` | list | List of `(name, component)` tuples describing the current processing pipeline, in order. | | `pipe_names` <Tag variant="new">2</Tag> | list | List of pipeline component names, in order. | +| `pipe_labels` <Tag variant="new">2.2</Tag> | dict | List of labels set by the pipeline components, if available, keyed by component name. | | `meta` | dict | Custom meta data for the Language class. If a model is loaded, contains meta data of the model. | | `path` <Tag variant="new">2</Tag> | `Path` | Path to the model data directory, if a model is loaded. Otherwise `None`. | From aa4ff0baa180409042b6b6af9f6d1caf95dad03c Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Thu, 12 Sep 2019 13:05:53 +0200 Subject: [PATCH 139/207] Auto-format [ci skip] --- website/docs/api/language.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 913d38ced..9a89d01cc 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -443,16 +443,16 @@ per component. ## Attributes {#attributes} -| Name | Type | Description | -| --------------------------------------- | ------------------ | ----------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | A container for the lexical types. | -| `tokenizer` | `Tokenizer` | The tokenizer. | -| `make_doc` | `callable` | Callable that takes a unicode text and returns a `Doc`. | -| `pipeline` | list | List of `(name, component)` tuples describing the current processing pipeline, in order. | -| `pipe_names` <Tag variant="new">2</Tag> | list | List of pipeline component names, in order. | -| `pipe_labels` <Tag variant="new">2.2</Tag> | dict | List of labels set by the pipeline components, if available, keyed by component name. | -| `meta` | dict | Custom meta data for the Language class. If a model is loaded, contains meta data of the model. 
| -| `path` <Tag variant="new">2</Tag> | `Path` | Path to the model data directory, if a model is loaded. Otherwise `None`. | +| Name | Type | Description | +| ------------------------------------------ | ----------- | ----------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | A container for the lexical types. | +| `tokenizer` | `Tokenizer` | The tokenizer. | +| `make_doc` | `callable` | Callable that takes a unicode text and returns a `Doc`. | +| `pipeline` | list | List of `(name, component)` tuples describing the current processing pipeline, in order. | +| `pipe_names` <Tag variant="new">2</Tag> | list | List of pipeline component names, in order. | +| `pipe_labels` <Tag variant="new">2.2</Tag> | dict | List of labels set by the pipeline components, if available, keyed by component name. | +| `meta` | dict | Custom meta data for the Language class. If a model is loaded, contains meta data of the model. | +| `path` <Tag variant="new">2</Tag> | `Path` | Path to the model data directory, if a model is loaded. Otherwise `None`. | ## Class attributes {#class-attributes} From 32404e613c85151d059e596fd5457bafcf67fdc2 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Thu, 12 Sep 2019 14:00:01 +0200 Subject: [PATCH 140/207] Create directory if it doesn't exist --- spacy/lookups.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/lookups.py b/spacy/lookups.py index 741d40330..e639009df 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -114,6 +114,8 @@ class Lookups(object): """ if len(self._tables): path = ensure_path(path) + if not path.exists(): + path.mkdir() filepath = path / "lookups.bin" with filepath.open("wb") as file_: file_.write(self.to_bytes()) From 10257f313160d7c3753f8dbfd59648f28a78c5b9 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Thu, 12 Sep 2019 14:00:14 +0200 Subject: [PATCH 141/207] Document Lookups [ci skip] --- spacy/lookups.py | 29 ++-- website/docs/api/lookups.md | 271 ++++++++++++++++++++++++++++++++++++ website/docs/api/vocab.md | 2 + website/meta/sidebars.json | 1 + 4 files changed, 292 insertions(+), 11 deletions(-) create mode 100644 website/docs/api/lookups.md diff --git a/spacy/lookups.py b/spacy/lookups.py index e639009df..a6fa7abff 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -32,7 +32,7 @@ class Lookups(object): Lookups.has_table. name (unicode): Name of the table. - RETURNS (bool): Whether a table of that name exists. + RETURNS (bool): Whether a table of that name is in the lookups. """ return self.has_table(name) @@ -72,7 +72,7 @@ class Lookups(object): def remove_table(self, name): """Remove a table. Raises an error if the table doesn't exist. - name (unicode): The name to remove. + name (unicode): Name of the table to remove. RETURNS (Table): The removed table. """ if name not in self._tables: @@ -87,19 +87,18 @@ class Lookups(object): """ return name in self._tables - def to_bytes(self, exclude=tuple(), **kwargs): + def to_bytes(self, **kwargs): """Serialize the lookups to a bytestring. - exclude (list): String names of serialization fields to exclude. RETURNS (bytes): The serialized Lookups. """ return srsly.msgpack_dumps(self._tables) - def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): + def from_bytes(self, bytes_data, **kwargs): """Load the lookups from a bytestring. - exclude (list): String names of serialization fields to exclude. - RETURNS (bytes): The loaded Lookups. + bytes_data (bytes): The data to load. 
+        RETURNS (Lookups): The loaded Lookups.
         """
         self._tables = OrderedDict()
         msg = srsly.msgpack_loads(bytes_data)
@@ -108,7 +107,8 @@ class Lookups(object):
         return self
 
     def to_disk(self, path, **kwargs):
-        """Save the lookups to a directory as lookups.bin.
+        """Save the lookups to a directory as lookups.bin. Expects a path to a
+        directory, which will be created if it doesn't exist.
 
         path (unicode / Path): The file path.
         """
@@ -121,9 +121,10 @@ class Lookups(object):
             file_.write(self.to_bytes())
 
     def from_disk(self, path, **kwargs):
-        """Load lookups from a directory containing a lookups.bin.
+        """Load lookups from a directory containing a lookups.bin. Will skip
+        loading if the file doesn't exist.
 
-        path (unicode / Path): The file path.
+        path (unicode / Path): The directory path.
         RETURNS (Lookups): The loaded lookups.
         """
         path = ensure_path(path)
@@ -136,12 +137,18 @@ class Lookups(object):
 
 
 class Table(OrderedDict):
-    """A table in the lookups. Subclass of builtin dict that implements a
+    """A table in the lookups. Subclass of OrderedDict that implements a
     slightly more consistent and unified API.
     """
 
     @classmethod
     def from_dict(cls, data, name=None):
+        """Initialize a new table from a dict.
+
+        data (dict): The dictionary.
+        name (unicode): Optional table name for reference.
+        RETURNS (Table): The newly created object.
+        """
         self = cls(name=name)
         self.update(data)
         return self
diff --git a/website/docs/api/lookups.md b/website/docs/api/lookups.md
new file mode 100644
index 000000000..ab65c4a0c
--- /dev/null
+++ b/website/docs/api/lookups.md
@@ -0,0 +1,271 @@
+---
+title: Lookups
+teaser: A container for large lookup tables and dictionaries
+tag: class
+source: spacy/lookups.py
+new: 2.2
+---
+
+This class allows convenient access to large lookup tables and dictionaries,
+e.g. lemmatization data or tokenizer exception lists. Lookups are available via
+the [`Vocab`](/api/vocab) as `vocab.lookups`, so they can be accessed before the
+pipeline components are applied (e.g. in the tokenizer and lemmatizer), as well
+as within the pipeline components via `doc.vocab.lookups`.
+
+## Lookups.\_\_init\_\_ {#init tag="method"}
+
+Create a `Lookups` object.
+
+> #### Example
+>
+> ```python
+> from spacy.lookups import Lookups
+> lookups = Lookups()
+> ```
+
+| Name        | Type      | Description                   |
+| ----------- | --------- | ----------------------------- |
+| **RETURNS** | `Lookups` | The newly constructed object. |
+
+## Lookups.\_\_len\_\_ {#len tag="method"}
+
+Get the current number of tables in the lookups.
+
+> #### Example
+>
+> ```python
+> lookups = Lookups()
+> assert len(lookups) == 0
+> ```
+
+| Name        | Type | Description                          |
+| ----------- | ---- | ------------------------------------ |
+| **RETURNS** | int  | The number of tables in the lookups. |
+
+## Lookups.\_\_contains\_\_ {#contains tag="method"}
+
+Check if the lookups contain a table of a given name. Delegates to
+[`Lookups.has_table`](/api/lookups#has_table).
+
+> #### Example
+>
+> ```python
+> lookups = Lookups()
+> lookups.add_table("some_table")
+> assert "some_table" in lookups
+> ```
+
+| Name        | Type    | Description                                      |
+| ----------- | ------- | ------------------------------------------------ |
+| `name`      | unicode | Name of the table.                               |
+| **RETURNS** | bool    | Whether a table of that name is in the lookups.  |
+
+## Lookups.tables {#tables tag="property"}
+
+Get the names of all tables in the lookups. 
+ +> #### Example +> +> ```python +> lookups = Lookups() +> lookups.add_table("some_table") +> assert lookups.tables == ["some_table"] +> ``` + +| Name | Type | Description | +| ----------- | ---- | ----------------------------------- | +| **RETURNS** | list | Names of the tables in the lookups. | + +## Lookups.add_table {#add_table tag="method"} + +Add a new table with optional data to the lookups. Raises an error if the table +exists. + +> #### Example +> +> ```python +> lookups = Lookups() +> lookups.add_table("some_table", {"foo": "bar"}) +> ``` + +| Name | Type | Description | +| ----------- | ----------------------------- | ---------------------------------- | +| `name` | unicode | Unique name of the table. | +| `data` | dict | Optional data to add to the table. | +| **RETURNS** | [`Table`](/api/lookups#table) | The newly added table. | + +## Lookups.get_table {#get_table tag="method"} + +Get a table from the lookups. Raises an error if the table doesn't exist. + +> #### Example +> +> ```python +> lookups = Lookups() +> lookups.add_table("some_table", {"foo": "bar"}) +> table = lookups.get_table("some_table") +> assert table["foo"] == "bar" +> ``` + +| Name | Type | Description | +| ----------- | ----------------------------- | ------------------ | +| `name` | unicode | Name of the table. | +| **RETURNS** | [`Table`](/api/lookups#table) | The table. | + +## Lookups.remove_table {#remove_table tag="method"} + +Remove a table from the lookups. Raises an error if the table doesn't exist. + +> #### Example +> +> ```python +> lookups = Lookups() +> lookups.add_table("some_table") +> removed_table = lookups.remove_table("some_table") +> assert "some_table" not in lookups +> ``` + +| Name | Type | Description | +| ----------- | ----------------------------- | ---------------------------- | +| `name` | unicode | Name of the table to remove. | +| **RETURNS** | [`Table`](/api/lookups#table) | The removed table. | + +## Lookups.has_table {#has_table tag="method"} + +Check if the lookups contain a table of a given name. Equivalent to +[`Lookups.__contains__`](/api/lookups#contains). + +> #### Example +> +> ```python +> lookups = Lookups() +> lookups.add_table("some_table") +> assert lookups.has_table("some_table") +> ``` + +| Name | Type | Description | +| ----------- | ------- | ----------------------------------------------- | +| `name` | unicode | Name of the table. | +| **RETURNS** | bool | Whether a table of that name is in the lookups. | + +## Lookups.to_bytes {#to_bytes tag="method"} + +Serialize the lookups to a bytestring. + +> #### Example +> +> ```python +> lookup_bytes = lookups.to_bytes() +> ``` + +| Name | Type | Description | +| ----------- | ----- | ----------------------- | +| **RETURNS** | bytes | The serialized lookups. | + +## Lookups.from_bytes {#from_bytes tag="method"} + +Load the lookups from a bytestring. + +> #### Example +> +> ```python +> lookup_bytes = lookups.to_bytes() +> lookups = Lookups() +> lookups.from_bytes(lookup_bytes) +> ``` + +| Name | Type | Description | +| ------------ | --------- | ---------------------- | +| `bytes_data` | bytes | The data to load from. | +| **RETURNS** | `Lookups` | The loaded lookups. | + +## Lookups.to_disk {#to_disk tag="method"} + +Save the lookups to a directory as `lookups.bin`. Expects a path to a directory, +which will be created if it doesn't exist. 
+
+> #### Example
+>
+> ```python
+> lookups.to_disk("/path/to/lookups")
+> ```
+
+| Name   | Type             | Description                                                                                                            |
+| ------ | ---------------- | ---------------------------------------------------------------------------------------------------------------------- |
+| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects.  |
+
+## Lookups.from_disk {#from_disk tag="method"}
+
+Load lookups from a directory containing a `lookups.bin`. Will skip loading if
+the file doesn't exist.
+
+> #### Example
+>
+> ```python
+> from spacy.lookups import Lookups
+> lookups = Lookups()
+> lookups.from_disk("/path/to/lookups")
+> ```
+
+| Name        | Type             | Description                                                                 |
+| ----------- | ---------------- | --------------------------------------------------------------------------- |
+| `path`      | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects.  |
+| **RETURNS** | `Lookups`        | The loaded lookups.                                                          |
+
+## Table {#table tag="class, ordereddict"}
+
+A table in the lookups. Subclass of `OrderedDict` that implements a slightly
+more consistent and unified API. Supports all other methods and attributes of
+`OrderedDict` / `dict`, and the customized methods listed here.
+
+### Table.\_\_init\_\_ {#table.init tag="method"}
+
+Initialize a new table.
+
+> #### Example
+>
+> ```python
+> from spacy.lookups import Table
+> table = Table(name="some_table")
+> ```
+
+| Name        | Type    | Description                        |
+| ----------- | ------- | ---------------------------------- |
+| `name`      | unicode | Optional table name for reference. |
+| **RETURNS** | `Table` | The newly constructed object.      |
+
+### Table.from_dict {#table.from_dict tag="classmethod"}
+
+Initialize a new table from a dict.
+
+> #### Example
+>
+> ```python
+> from spacy.lookups import Table
+> data = {"foo": "bar", "baz": 100}
+> table = Table.from_dict(data, name="some_table")
+> ```
+
+| Name        | Type    | Description                        |
+| ----------- | ------- | ---------------------------------- |
+| `data`      | dict    | The dictionary.                    |
+| `name`      | unicode | Optional table name for reference. |
+| **RETURNS** | `Table` | The newly constructed object.      |
+
+### Table.set {#table.set tag="method"}
+
+Set a new key / value pair. Same as `table[key] = value`.
+
+> #### Example
+>
+> ```python
+> from spacy.lookups import Table
+> table = Table()
+> table.set("foo", "bar")
+> assert table["foo"] == "bar"
+> ```
+
+| Name    | Type    | Description |
+| ------- | ------- | ----------- |
+| `key`   | unicode | The key.    |
+| `value` | -       | The value.  |
diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md
index cd21a91d6..22bfe324e 100644
--- a/website/docs/api/vocab.md
+++ b/website/docs/api/vocab.md
@@ -293,6 +293,7 @@ Load state from a binary string.
 | `strings` | `StringStore` | A table managing the string-to-int mapping. |
 | `vectors` <Tag variant="new">2</Tag> | `Vectors` | A table associating word IDs to word vectors. |
 | `vectors_length` | int | Number of dimensions for each word vector. |
+| `lookups` | `Lookups` | The available lookup tables in this vocab. |
 | `writing_system` <Tag variant="new">2.1</Tag> | dict | A dict with information about the language's writing system. |
 
 ## Serialization fields {#serialization-fields}
@@ -313,3 +314,4 @@ serialization by passing in the string names via the `exclude` argument.
 | `strings` | The strings in the [`StringStore`](/api/stringstore). |
 | `lexemes` | The lexeme data. |
 | `vectors` | The word vectors, if available. 
| +| `lookups` | The lookup tables, if available. | diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 3c4f09674..a05440e5a 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -90,6 +90,7 @@ { "text": "Vocab", "url": "/api/vocab" }, { "text": "StringStore", "url": "/api/stringstore" }, { "text": "Vectors", "url": "/api/vectors" }, + { "text": "Lookups", "url": "/api/lookups" }, { "text": "KnowledgeBase", "url": "/api/kb" }, { "text": "GoldParse", "url": "/api/goldparse" }, { "text": "GoldCorpus", "url": "/api/goldcorpus" }, From c0a4cab17887d14655659b381ea4ae5e062a5108 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Thu, 12 Sep 2019 14:53:06 +0200 Subject: [PATCH 142/207] Update "Adding languages" docs [ci skip] --- website/docs/usage/adding-languages.md | 131 ++++++++++--------------- 1 file changed, 50 insertions(+), 81 deletions(-) diff --git a/website/docs/usage/adding-languages.md b/website/docs/usage/adding-languages.md index 374d948b2..6f8955326 100644 --- a/website/docs/usage/adding-languages.md +++ b/website/docs/usage/adding-languages.md @@ -71,21 +71,19 @@ from the global rules. Others, like the tokenizer and norm exceptions, are very specific and will make a big difference to spaCy's performance on the particular language and training a language model. -| Variable | Type | Description | -| ----------------------------------------- | ----- | ---------------------------------------------------------------------------------------------------------- | -| `STOP_WORDS` | set | Individual words. | -| `TOKENIZER_EXCEPTIONS` | dict | Keyed by strings mapped to list of one dict per token with token attributes. | -| `TOKEN_MATCH` | regex | Regexes to match complex tokens, e.g. URLs. | -| `NORM_EXCEPTIONS` | dict | Keyed by strings, mapped to their norms. | -| `TOKENIZER_PREFIXES` | list | Strings or regexes, usually not customized. | -| `TOKENIZER_SUFFIXES` | list | Strings or regexes, usually not customized. | -| `TOKENIZER_INFIXES` | list | Strings or regexes, usually not customized. | -| `LEX_ATTRS` | dict | Attribute ID mapped to function. | -| `SYNTAX_ITERATORS` | dict | Iterator ID mapped to function. Currently only supports `'noun_chunks'`. | -| `LOOKUP` | dict | Keyed by strings mapping to their lemma. | -| `LEMMA_RULES`, `LEMMA_INDEX`, `LEMMA_EXC` | dict | Lemmatization rules, keyed by part of speech. | -| `TAG_MAP` | dict | Keyed by strings mapped to [Universal Dependencies](http://universaldependencies.org/u/pos/all.html) tags. | -| `MORPH_RULES` | dict | Keyed by strings mapped to a dict of their morphological features. | +| Variable | Type | Description | +| ---------------------- | ----- | ---------------------------------------------------------------------------------------------------------- | +| `STOP_WORDS` | set | Individual words. | +| `TOKENIZER_EXCEPTIONS` | dict | Keyed by strings mapped to list of one dict per token with token attributes. | +| `TOKEN_MATCH` | regex | Regexes to match complex tokens, e.g. URLs. | +| `NORM_EXCEPTIONS` | dict | Keyed by strings, mapped to their norms. | +| `TOKENIZER_PREFIXES` | list | Strings or regexes, usually not customized. | +| `TOKENIZER_SUFFIXES` | list | Strings or regexes, usually not customized. | +| `TOKENIZER_INFIXES` | list | Strings or regexes, usually not customized. | +| `LEX_ATTRS` | dict | Attribute ID mapped to function. | +| `SYNTAX_ITERATORS` | dict | Iterator ID mapped to function. Currently only supports `'noun_chunks'`. 
| +| `TAG_MAP` | dict | Keyed by strings mapped to [Universal Dependencies](http://universaldependencies.org/u/pos/all.html) tags. | +| `MORPH_RULES` | dict | Keyed by strings mapped to a dict of their morphological features. | > #### Should I ever update the global data? > @@ -213,9 +211,7 @@ spaCy's [tokenization algorithm](/usage/linguistic-features#how-tokenizer-works) lets you deal with whitespace-delimited chunks separately. This makes it easy to define special-case rules, without worrying about how they interact with the rest of the tokenizer. Whenever the key string is matched, the special-case rule -is applied, giving the defined sequence of tokens. You can also attach -attributes to the subtokens, covered by your special case, such as the subtokens -`LEMMA` or `TAG`. +is applied, giving the defined sequence of tokens. Tokenizer exceptions can be added in the following format: @@ -223,8 +219,8 @@ Tokenizer exceptions can be added in the following format: ### tokenizer_exceptions.py (excerpt) TOKENIZER_EXCEPTIONS = { "don't": [ - {ORTH: "do", LEMMA: "do"}, - {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}] + {ORTH: "do"}, + {ORTH: "n't", NORM: "not"}] } ``` @@ -233,41 +229,12 @@ TOKENIZER_EXCEPTIONS = { If an exception consists of more than one token, the `ORTH` values combined always need to **match the original string**. The way the original string is split up can be pretty arbitrary sometimes – for example `"gonna"` is split into -`"gon"` (lemma "go") and `"na"` (lemma "to"). Because of how the tokenizer +`"gon"` (norm "going") and `"na"` (norm "to"). Because of how the tokenizer works, it's currently not possible to split single-letter strings into multiple tokens. </Infobox> -Unambiguous abbreviations, like month names or locations in English, should be -added to exceptions with a lemma assigned, for example -`{ORTH: "Jan.", LEMMA: "January"}`. Since the exceptions are added in Python, -you can use custom logic to generate them more efficiently and make your data -less verbose. How you do this ultimately depends on the language. Here's an -example of how exceptions for time formats like "1a.m." and "1am" are generated -in the English -[`tokenizer_exceptions.py`](https://github.com/explosion/spaCy/tree/master/spacy/en/lang/tokenizer_exceptions.py): - -```python -### tokenizer_exceptions.py (excerpt) -# use short, internal variable for readability -_exc = {} - -for h in range(1, 12 + 1): - for period in ["a.m.", "am"]: - # always keep an eye on string interpolation! - _exc["%d%s" % (h, period)] = [ - {ORTH: "%d" % h}, - {ORTH: period, LEMMA: "a.m."}] - for period in ["p.m.", "pm"]: - _exc["%d%s" % (h, period)] = [ - {ORTH: "%d" % h}, - {ORTH: period, LEMMA: "p.m."}] - -# only declare this at the bottom -TOKENIZER_EXCEPTIONS = _exc -``` - > #### Generating tokenizer exceptions > > Keep in mind that generating exceptions only makes sense if there's a clearly @@ -275,7 +242,8 @@ TOKENIZER_EXCEPTIONS = _exc > This is not always the case – in Spanish for instance, infinitive or > imperative reflexive verbs and pronouns are one token (e.g. "vestirme"). In > cases like this, spaCy shouldn't be generating exceptions for _all verbs_. -> Instead, this will be handled at a later stage during lemmatization. +> Instead, this will be handled at a later stage after part-of-speech tagging +> and lemmatization. When adding the tokenizer exceptions to the `Defaults`, you can use the [`update_exc`](/api/top-level#util.update_exc) helper function to merge them @@ -292,28 +260,18 @@ custom one. 
from ...util import update_exc BASE_EXCEPTIONS = {"a.": [{ORTH: "a."}], ":)": [{ORTH: ":)"}]} -TOKENIZER_EXCEPTIONS = {"a.": [{ORTH: "a.", LEMMA: "all"}]} +TOKENIZER_EXCEPTIONS = {"a.": [{ORTH: "a.", NORM: "all"}]} tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) -# {"a.": [{ORTH: "a.", LEMMA: "all"}], ":)": [{ORTH: ":)"}]} +# {"a.": [{ORTH: "a.", NORM: "all"}], ":)": [{ORTH: ":)"}]} ``` -<Infobox title="About spaCy's custom pronoun lemma" variant="warning"> - -Unlike verbs and common nouns, there's no clear base form of a personal pronoun. -Should the lemma of "me" be "I", or should we normalize person as well, giving -"it" — or maybe "he"? spaCy's solution is to introduce a novel symbol, `-PRON-`, -which is used as the lemma for all personal pronouns. - -</Infobox> - ### Norm exceptions {#norm-exceptions new="2"} -In addition to `ORTH` or `LEMMA`, tokenizer exceptions can also set a `NORM` -attribute. This is useful to specify a normalized version of the token – for -example, the norm of "n't" is "not". By default, a token's norm equals its -lowercase text. If the lowercase spelling of a word exists, norms should always -be in lowercase. +In addition to `ORTH`, tokenizer exceptions can also set a `NORM` attribute. +This is useful to specify a normalized version of the token – for example, the +norm of "n't" is "not". By default, a token's norm equals its lowercase text. If +the lowercase spelling of a word exists, norms should always be in lowercase. > #### Norms vs. lemmas > @@ -458,25 +416,36 @@ the quickest and easiest way to get started. The data is stored in a dictionary mapping a string to its lemma. To determine a token's lemma, spaCy simply looks it up in the table. Here's an example from the Spanish language data: -```python -### lang/es/lemmatizer.py (excerpt) -LOOKUP = { - "aba": "abar", - "ababa": "abar", - "ababais": "abar", - "ababan": "abar", - "ababanes": "ababán", - "ababas": "abar", - "ababoles": "ababol", - "ababábites": "ababábite" +```json +### lang/es/lemma_lookup.json (excerpt) +{ + "aba": "abar", + "ababa": "abar", + "ababais": "abar", + "ababan": "abar", + "ababanes": "ababán", + "ababas": "abar", + "ababoles": "ababol", + "ababábites": "ababábite" } ``` -To provide a lookup lemmatizer for your language, import the lookup table and -add it to the `Language` class as `lemma_lookup`: +#### Adding JSON resources {#lemmatizer-resources new="2.2"} + +As of v2.2, resources for the lemmatizer are stored as JSON and loaded via the +new [`Lookups`](/api/lookups) class. This allows easier access to the data, +serialization with the models and file compression on disk (so your spaCy +installation is smaller). Resource files can be provided via the `resources` +attribute on the custom language subclass. All paths are relative to the +language data directory, i.e. the directory the language's `__init__.py` is in. 
```python -lemma_lookup = LOOKUP +resources = { + "lemma_lookup": "lemmatizer/lemma_lookup.json", + "lemma_rules": "lemmatizer/lemma_rules.json", + "lemma_index": "lemmatizer/lemma_index.json", + "lemma_exc": "lemmatizer/lemma_exc.json", +} ``` ### Tag map {#tag-map} From 72274e83f2dbc7b77110b462bef83540ddd50fe6 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Thu, 12 Sep 2019 15:24:17 +0200 Subject: [PATCH 143/207] Ensure accordion label is left-aligned [ci skip] --- website/src/styles/accordion.module.sass | 1 + 1 file changed, 1 insertion(+) diff --git a/website/src/styles/accordion.module.sass b/website/src/styles/accordion.module.sass index 707e29aef..bdcbba9ac 100644 --- a/website/src/styles/accordion.module.sass +++ b/website/src/styles/accordion.module.sass @@ -13,6 +13,7 @@ width: 100% padding: 1rem 1.5rem border-radius: var(--border-radius) + text-align: left &:focus background: var(--color-theme-opaque) From 05a2df66160be0740c798c6aaf4a116dab76d7b6 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Thu, 12 Sep 2019 15:26:02 +0200 Subject: [PATCH 144/207] Remove not implemented file validation [ci skip] --- spacy/cli/debug_data.py | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 0a9a0f7ef..259ef6d94 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -34,12 +34,6 @@ BLANK_MODEL_THRESHOLD = 2000 str, ), ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool), - ignore_validation=( - "Don't exit if JSON format validation fails", - "flag", - "IV", - bool, - ), verbose=("Print additional information and explanations", "flag", "V", bool), no_format=("Don't pretty-print the results", "flag", "NF", bool), ) @@ -50,7 +44,6 @@ def debug_data( base_model=None, pipeline="tagger,parser,ner", ignore_warnings=False, - ignore_validation=False, verbose=False, no_format=False, ): @@ -72,21 +65,9 @@ def debug_data( msg.divider("Data format validation") - # Validate data format using the JSON schema + # TODO: Validate data format using the JSON schema # TODO: update once the new format is ready # TODO: move validation to GoldCorpus in order to be able to load from dir - train_data_errors = [] # TODO: validate_json - dev_data_errors = [] # TODO: validate_json - if not train_data_errors: - msg.good("Training data JSON format is valid") - if not dev_data_errors: - msg.good("Development data JSON format is valid") - for error in train_data_errors: - msg.fail("Training data: {}".format(error)) - for error in dev_data_errors: - msg.fail("Develoment data: {}".format(error)) - if (train_data_errors or dev_data_errors) and not ignore_validation: - sys.exit(1) # Create the gold corpus to be able to better analyze data loading_train_error_message = "" From b544dcb3c523d260c84431ea4fc07330e46b8790 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Thu, 12 Sep 2019 15:26:20 +0200 Subject: [PATCH 145/207] Document debug-data [ci skip] --- spacy/cli/debug_data.py | 5 + website/docs/api/cli.md | 223 ++++++++++++++++++++++++++++----- website/docs/usage/training.md | 99 ++++++++------- 3 files changed, 252 insertions(+), 75 deletions(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 259ef6d94..aac4d5b97 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -47,6 +47,11 @@ def debug_data( verbose=False, no_format=False, ): + """ + Analyze, debug and validate your training and development data, get 
useful + stats, and find problems like invalid entity annotations, cyclic + dependencies, low data labels and more. + """ msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings) # Make sure all files and paths exists if they are needed diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index d01637925..d13490a27 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -8,6 +8,7 @@ menu: - ['Info', 'info'] - ['Validate', 'validate'] - ['Convert', 'convert'] + - ['Debug data', 'debug-data'] - ['Train', 'train'] - ['Pretrain', 'pretrain'] - ['Init Model', 'init-model'] @@ -174,12 +175,172 @@ All output files generated by this command are compatible with <!-- TODO: document jsonl option – maybe update it? --> -| ID | Description | -| ------------------------------ | --------------------------------------------------------------- | -| `auto` | Automatically pick converter based on file extension and file content (default). | -| `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format. | +| ID | Description | +| ------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `auto` | Automatically pick converter based on file extension and file content (default). | +| `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format. | | `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). | -| `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). | +| `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). | + +## Debug data {#debug-data new="2.2"} + +Analyze, debug and validate your training and development data, get useful +stats, and find problems like invalid entity annotations, cyclic dependencies, +low data labels and more. + +```bash +$ python -m spacy debug-data [lang] [train_path] [dev_path] [--base-model] [--pipeline] [--ignore-warnings] [--verbose] [--no-format] +``` + +| Argument | Type | Description | +| -------------------------- | ---------- | -------------------------------------------------------------------------------------------------- | +| `lang` | positional | Model language. | +| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. | +| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. | +| `--base-model`, `-b` | option | Optional name of base model to update. 
Can be any loadable spaCy model. | +| `--pipeline`, `-p` | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. | +| `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. | +| `--verbose`, `-V` | flag | Print additional information and explanations. | +| --no-format, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. | + +<Accordion title="Example output"> + +``` +=========================== Data format validation =========================== +✔ Corpus is loadable + +=============================== Training stats =============================== +Training pipeline: tagger, parser, ner +Starting with blank model 'en' +18127 training docs +2939 evaluation docs +⚠ 34 training examples also in evaluation data + +============================== Vocab & Vectors ============================== +ℹ 2083156 total words in the data (56962 unique) +⚠ 13020 misaligned tokens in the training data +⚠ 2423 misaligned tokens in the dev data +10 most common words: 'the' (98429), ',' (91756), '.' (87073), 'to' (50058), +'of' (49559), 'and' (44416), 'a' (34010), 'in' (31424), 'that' (22792), 'is' +(18952) +ℹ No word vectors present in the model + +========================== Named Entity Recognition ========================== +ℹ 18 new labels, 0 existing labels +528978 missing values (tokens with '-' label) +New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL' +(10490), 'NORP' (9033), 'MONEY' (5164), 'PERCENT' (3761), 'ORDINAL' (2122), +'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC' +(1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338) +✔ Good amount of examples for all labels +✔ Examples without occurences available for all labels +✔ No entities consisting of or starting/ending with whitespace + +=========================== Part-of-speech Tagging =========================== +ℹ 49 labels in data (57 labels in tag map) +'NN' (266331), 'IN' (227365), 'DT' (185600), 'NNP' (164404), 'JJ' (119830), +'NNS' (110957), '.' (101482), ',' (92476), 'RB' (90090), 'PRP' (90081), 'VB' +(74538), 'VBD' (68199), 'CC' (62862), 'VBZ' (50712), 'VBP' (43420), 'VBN' +(42193), 'CD' (40326), 'VBG' (34764), 'TO' (31085), 'MD' (25863), 'PRP$' +(23335), 'HYPH' (13833), 'POS' (13427), 'UH' (13322), 'WP' (10423), 'WDT' +(9850), 'RP' (8230), 'WRB' (8201), ':' (8168), '''' (7392), '``' (6984), 'NNPS' +(5817), 'JJR' (5689), '$' (3710), 'EX' (3465), 'JJS' (3118), 'RBR' (2872), +'-RRB-' (2825), '-LRB-' (2788), 'PDT' (2078), 'XX' (1316), 'RBS' (1142), 'FW' +(794), 'NFP' (557), 'SYM' (440), 'WP$' (294), 'LS' (293), 'ADD' (191), 'AFX' +(24) +✔ All labels present in tag map for language 'en' + +============================= Dependency Parsing ============================= +ℹ Found 111703 sentences with an average length of 18.6 words. 
+ℹ Found 2251 nonprojective train sentences +ℹ Found 303 nonprojective dev sentences +ℹ 47 labels in train data +ℹ 211 labels in projectivized train data +'punct' (236796), 'prep' (188853), 'pobj' (182533), 'det' (172674), 'nsubj' +(169481), 'compound' (116142), 'ROOT' (111697), 'amod' (107945), 'dobj' (93540), +'aux' (86802), 'advmod' (86197), 'cc' (62679), 'conj' (59575), 'poss' (36449), +'ccomp' (36343), 'advcl' (29017), 'mark' (27990), 'nummod' (24582), 'relcl' +(21359), 'xcomp' (21081), 'attr' (18347), 'npadvmod' (17740), 'acomp' (17204), +'auxpass' (15639), 'appos' (15368), 'neg' (15266), 'nsubjpass' (13922), 'case' +(13408), 'acl' (12574), 'pcomp' (10340), 'nmod' (9736), 'intj' (9285), 'prt' +(8196), 'quantmod' (7403), 'dep' (4300), 'dative' (4091), 'agent' (3908), 'expl' +(3456), 'parataxis' (3099), 'oprd' (2326), 'predet' (1946), 'csubj' (1494), +'subtok' (1147), 'preconj' (692), 'meta' (469), 'csubjpass' (64), 'iobj' (1) +⚠ Low number of examples for label 'iobj' (1) +⚠ Low number of examples for 130 labels in the projectivized dependency +trees used for training. You may want to projectivize labels such as punct +before training in order to improve parser performance. +⚠ Projectivized labels with low numbers of examples: appos||attr: 12 +advmod||dobj: 13 prep||ccomp: 12 nsubjpass||ccomp: 15 pcomp||prep: 14 +amod||dobj: 9 attr||xcomp: 14 nmod||nsubj: 17 prep||advcl: 2 prep||prep: 5 +nsubj||conj: 12 advcl||advmod: 18 ccomp||advmod: 11 ccomp||pcomp: 5 acl||pobj: +10 npadvmod||acomp: 7 dobj||pcomp: 14 nsubjpass||pcomp: 1 nmod||pobj: 8 +amod||attr: 6 nmod||dobj: 12 aux||conj: 1 neg||conj: 1 dative||xcomp: 11 +pobj||dative: 3 xcomp||acomp: 19 advcl||pobj: 2 nsubj||advcl: 2 csubj||ccomp: 1 +advcl||acl: 1 relcl||nmod: 2 dobj||advcl: 10 advmod||advcl: 3 nmod||nsubjpass: 6 +amod||pobj: 5 cc||neg: 1 attr||ccomp: 16 advcl||xcomp: 3 nmod||attr: 4 +advcl||nsubjpass: 5 advcl||ccomp: 4 ccomp||conj: 1 punct||acl: 1 meta||acl: 1 +parataxis||acl: 1 prep||acl: 1 amod||nsubj: 7 ccomp||ccomp: 3 acomp||xcomp: 5 +dobj||acl: 5 prep||oprd: 6 advmod||acl: 2 dative||advcl: 1 pobj||agent: 5 +xcomp||amod: 1 dep||advcl: 1 prep||amod: 8 relcl||compound: 1 advcl||csubj: 3 +npadvmod||conj: 2 npadvmod||xcomp: 4 advmod||nsubj: 3 ccomp||amod: 7 +advcl||conj: 1 nmod||conj: 2 advmod||nsubjpass: 2 dep||xcomp: 2 appos||ccomp: 1 +advmod||dep: 1 advmod||advmod: 5 aux||xcomp: 8 dep||advmod: 1 dative||ccomp: 2 +prep||dep: 1 conj||conj: 1 dep||ccomp: 4 cc||ROOT: 1 prep||ROOT: 1 nsubj||pcomp: +3 advmod||prep: 2 relcl||dative: 1 acl||conj: 1 advcl||attr: 4 prep||npadvmod: 1 +nsubjpass||xcomp: 1 neg||advmod: 1 xcomp||oprd: 1 advcl||advcl: 1 dobj||dep: 3 +nsubjpass||parataxis: 1 attr||pcomp: 1 ccomp||parataxis: 1 advmod||attr: 1 +nmod||oprd: 1 appos||nmod: 2 advmod||relcl: 1 appos||npadvmod: 1 appos||conj: 1 +prep||expl: 1 nsubjpass||conj: 1 punct||pobj: 1 cc||pobj: 1 conj||pobj: 1 +punct||conj: 1 ccomp||dep: 1 oprd||xcomp: 3 ccomp||xcomp: 1 ccomp||nsubj: 1 +nmod||dep: 1 xcomp||ccomp: 1 acomp||advcl: 1 intj||advmod: 1 advmod||acomp: 2 +relcl||oprd: 1 advmod||prt: 1 advmod||pobj: 1 appos||nummod: 1 relcl||npadvmod: +3 mark||advcl: 1 aux||ccomp: 1 amod||nsubjpass: 1 npadvmod||advmod: 1 conj||dep: +1 nummod||pobj: 1 amod||npadvmod: 1 intj||pobj: 1 nummod||npadvmod: 1 +xcomp||xcomp: 1 aux||dep: 1 advcl||relcl: 1 +⚠ The following labels were found only in the train data: xcomp||amod, +advcl||relcl, prep||nsubjpass, acl||nsubj, nsubjpass||conj, xcomp||oprd, +advmod||conj, advmod||advmod, iobj, advmod||nsubjpass, dobj||conj, 
ccomp||amod, +meta||acl, xcomp||xcomp, prep||attr, prep||ccomp, advcl||acomp, acl||dobj, +advcl||advcl, pobj||agent, prep||advcl, nsubjpass||xcomp, prep||dep, +acomp||xcomp, aux||ccomp, ccomp||dep, conj||dep, relcl||compound, +nsubjpass||ccomp, nmod||dobj, advmod||advcl, advmod||acl, dobj||advcl, +dative||xcomp, prep||nsubj, ccomp||ccomp, nsubj||ccomp, xcomp||acomp, +prep||acomp, dep||advmod, acl||pobj, appos||dobj, npadvmod||acomp, cc||ROOT, +relcl||nsubj, nmod||pobj, acl||nsubjpass, ccomp||advmod, pcomp||prep, +amod||dobj, advmod||attr, advcl||csubj, appos||attr, dobj||pcomp, prep||ROOT, +relcl||pobj, advmod||pobj, amod||nsubj, ccomp||xcomp, prep||oprd, +npadvmod||advmod, appos||nummod, advcl||pobj, neg||advmod, acl||attr, +appos||nsubjpass, csubj||ccomp, amod||nsubjpass, intj||pobj, dep||advcl, +cc||neg, xcomp||ccomp, dative||ccomp, nmod||oprd, pobj||dative, prep||dobj, +dep||ccomp, relcl||attr, ccomp||nsubj, advcl||xcomp, nmod||dep, advcl||advmod, +ccomp||conj, pobj||prep, advmod||acomp, advmod||relcl, attr||pcomp, +ccomp||parataxis, oprd||xcomp, intj||advmod, nmod||nsubjpass, prep||npadvmod, +parataxis||acl, prep||pobj, advcl||dobj, amod||pobj, prep||acl, conj||pobj, +advmod||dep, punct||pobj, ccomp||acomp, acomp||advcl, nummod||npadvmod, +dobj||dep, npadvmod||xcomp, advcl||conj, relcl||npadvmod, punct||acl, +relcl||dobj, dobj||xcomp, nsubjpass||parataxis, dative||advcl, relcl||nmod, +advcl||ccomp, appos||npadvmod, ccomp||pcomp, prep||amod, mark||advcl, +prep||advmod, prep||xcomp, appos||nsubj, attr||ccomp, advmod||prt, dobj||ccomp, +aux||conj, advcl||nsubj, conj||conj, advmod||ccomp, advcl||nsubjpass, +attr||xcomp, nmod||conj, npadvmod||conj, relcl||dative, prep||expl, +nsubjpass||pcomp, advmod||xcomp, advmod||dobj, appos||pobj, nsubj||conj, +relcl||nsubjpass, advcl||attr, appos||ccomp, advmod||prep, prep||conj, +nmod||attr, punct||conj, neg||conj, dep||xcomp, aux||xcomp, dobj||acl, +nummod||pobj, amod||npadvmod, nsubj||pcomp, advcl||acl, appos||nmod, +relcl||oprd, prep||prep, cc||pobj, nmod||nsubj, amod||attr, aux||dep, +appos||conj, advmod||nsubj, nsubj||advcl, acl||conj +To train a parser, your data should include at least 20 instances of each label. +⚠ Multiple root labels (ROOT, nsubj, aux, npadvmod, prep) found in +training data. spaCy's parser uses a single root label ROOT so this distinction +will not be available. + +================================== Summary ================================== +✔ 5 checks passed +⚠ 8 warnings +``` + +</Accordion> ## Train {#train} @@ -226,7 +387,7 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path] | `--entity-multitasks`, `-et` | option | Side objectives for NER CNN, e.g. `'dep'` or `'dep,tag'` | | `--noise-level`, `-nl` | option | Float indicating the amount of corruption for data augmentation. | | `--gold-preproc`, `-G` | flag | Use gold preprocessing. | -| `--learn-tokens`, `-T` | flag | Make parser learn gold-standard tokenization by merging subtokens. Typically used for languages like Chinese. | +| `--learn-tokens`, `-T` | flag | Make parser learn gold-standard tokenization by merging subtokens. Typically used for languages like Chinese. | | `--verbose`, `-VV` <Tag variant="new">2.0.13</Tag> | flag | Show more detailed messages during training. | | `--help`, `-h` | flag | Show help message and available arguments. | | **CREATES** | model, pickle | A spaCy model on each epoch. 
| @@ -291,26 +452,26 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir] [--seed] [--n-iter] [--use-vectors] [--n-save_every] [--init-tok2vec] [--epoch-start] ``` -| Argument | Type | Description | -| ----------------------- | ---------- | --------------------------------------------------------------------------------------------------------------------------------- | -| `texts_loc` | positional | Path to JSONL file with raw texts to learn from, with text provided as the key `"text"` or tokens as the key `"tokens"`. [See here](#pretrain-jsonl) for details. | -| `vectors_model` | positional | Name or path to spaCy model with vectors to learn from. | -| `output_dir` | positional | Directory to write models to on each epoch. | -| `--width`, `-cw` | option | Width of CNN layers. | -| `--depth`, `-cd` | option | Depth of CNN layers. | -| `--embed-rows`, `-er` | option | Number of embedding rows. | -| `--loss-func`, `-L` | option | Loss function to use for the objective. Either `"L2"` or `"cosine"`. | -| `--dropout`, `-d` | option | Dropout rate. | -| `--batch-size`, `-bs` | option | Number of words per training batch. | -| `--max-length`, `-xw` | option | Maximum words per example. Longer examples are discarded. | -| `--min-length`, `-nw` | option | Minimum words per example. Shorter examples are discarded. | -| `--seed`, `-s` | option | Seed for random number generators. | -| `--n-iter`, `-i` | option | Number of iterations to pretrain. | -| `--use-vectors`, `-uv` | flag | Whether to use the static vectors as input features. | -| `--n-save-every`, `-se` | option | Save model every X batches. | -| `--init-tok2vec`, `-t2v` <Tag variant="new">2.1</Tag> | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental.| -| `--epoch-start`, `-es` <Tag variant="new">2.1.5</Tag> | option | The epoch to start counting at. Only relevant when using `--init-tok2vec` and the given weight file has been renamed. Prevents unintended overwriting of existing weight files.| -| **CREATES** | weights | The pre-trained weights that can be used to initialize `spacy train`. | +| Argument | Type | Description | +| ----------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `texts_loc` | positional | Path to JSONL file with raw texts to learn from, with text provided as the key `"text"` or tokens as the key `"tokens"`. [See here](#pretrain-jsonl) for details. | +| `vectors_model` | positional | Name or path to spaCy model with vectors to learn from. | +| `output_dir` | positional | Directory to write models to on each epoch. | +| `--width`, `-cw` | option | Width of CNN layers. | +| `--depth`, `-cd` | option | Depth of CNN layers. | +| `--embed-rows`, `-er` | option | Number of embedding rows. | +| `--loss-func`, `-L` | option | Loss function to use for the objective. Either `"L2"` or `"cosine"`. | +| `--dropout`, `-d` | option | Dropout rate. | +| `--batch-size`, `-bs` | option | Number of words per training batch. | +| `--max-length`, `-xw` | option | Maximum words per example. Longer examples are discarded. | +| `--min-length`, `-nw` | option | Minimum words per example. Shorter examples are discarded. | +| `--seed`, `-s` | option | Seed for random number generators. | +| `--n-iter`, `-i` | option | Number of iterations to pretrain. 
| +| `--use-vectors`, `-uv` | flag | Whether to use the static vectors as input features. | +| `--n-save-every`, `-se` | option | Save model every X batches. | +| `--init-tok2vec`, `-t2v` <Tag variant="new">2.1</Tag> | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental. | +| `--epoch-start`, `-es` <Tag variant="new">2.1.5</Tag> | option | The epoch to start counting at. Only relevant when using `--init-tok2vec` and the given weight file has been renamed. Prevents unintended overwriting of existing weight files. | +| **CREATES** | weights | The pre-trained weights that can be used to initialize `spacy train`. | ### JSONL format for raw text {#pretrain-jsonl} @@ -330,10 +491,10 @@ tokenization can be provided. > srsly.write_jsonl("/path/to/text.jsonl", data) > ``` -| Key | Type | Description | -| -------- | ------- | -------------------------------------------- | +| Key | Type | Description | +| -------- | ------- | ---------------------------------------------------------- | | `text` | unicode | The raw input text. Is not required if `tokens` available. | -| `tokens` | list | Optional tokenization, one string per token. | +| `tokens` | list | Optional tokenization, one string per token. | ```json ### Example @@ -424,7 +585,7 @@ pip install dist/en_model-0.0.0.tar.gz | `input_dir` | positional | Path to directory containing model data. | | `output_dir` | positional | Directory to create package folder in. | | `--meta-path`, `-m` <Tag variant="new">2</Tag> | option | Path to `meta.json` file (optional). | -| `--create-meta`, `-c` <Tag variant="new">2</Tag> | flag | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt. -| `--force`, `-f` | flag | Force overwriting of existing folder in output directory. | +| `--create-meta`, `-c` <Tag variant="new">2</Tag> | flag | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt. | +| `--force`, `-f` | flag | Force overwriting of existing folder in output directory. | | `--help`, `-h` | flag | Show help message and available arguments. | | **CREATES** | directory | A Python package containing the spaCy model. | diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index dd5cd8530..e3386a64f 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -10,9 +10,9 @@ menu: --- This guide describes how to train new statistical models for spaCy's -part-of-speech tagger, named entity recognizer, dependency parser, -text classifier and entity linker. Once the model is trained, -you can then [save and load](/usage/saving-loading#models) it. +part-of-speech tagger, named entity recognizer, dependency parser, text +classifier and entity linker. Once the model is trained, you can then +[save and load](/usage/saving-loading#models) it. 
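The save-and-load round trip mentioned above only takes a couple of lines — a minimal sketch, using a blank English pipeline as a stand-in for a freshly trained model and a placeholder output path:

```python
import spacy

# Stand-in for a pipeline you have just trained
nlp = spacy.blank("en")

# Save the pipeline to a directory, then load it back for inference
nlp.to_disk("/tmp/trained_model")
nlp2 = spacy.load("/tmp/trained_model")
doc = nlp2("This is a sentence.")
```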
## Training basics {#basics} @@ -40,6 +40,19 @@ mkdir models python -m spacy train es models ancora-json/es_ancora-ud-train.json ancora-json/es_ancora-ud-dev.json ``` +<Infobox title="Tip: Debug your data"> + +If you're running spaCy v2.2 or above, you can use the +[`debug-data` command](/api/cli#debug-data) to analyze and validate your +training and development data, get useful stats, and find problems like invalid +entity annotations, cyclic dependencies, low data labels and more. + +```bash +$ python -m spacy debug-data en train.json dev.json --verbose +``` + +</Infobox> + You can also use the [`gold.docs_to_json`](/api/goldparse#docs_to_json) helper to convert a list of `Doc` objects to spaCy's JSON training format. @@ -222,11 +235,10 @@ of being dropped. > - [`begin_training()`](/api/language#begin_training): Start the training and > return an optimizer function to update the model's weights. Can take an -> optional function converting the training data to spaCy's training format. -> - [`update()`](/api/language#update): Update the model with the -> training example and gold data. -> - [`to_disk()`](/api/language#to_disk): Save -> the updated model to a directory. +> optional function converting the training data to spaCy's training format. +> - [`update()`](/api/language#update): Update the model with the training +> example and gold data. +> - [`to_disk()`](/api/language#to_disk): Save the updated model to a directory. ```python ### Example training loop @@ -405,19 +417,20 @@ referred to as the "catastrophic forgetting" problem. ## Entity linking {#entity-linker} -To train an entity linking model, you first need to define a knowledge base (KB). +To train an entity linking model, you first need to define a knowledge base +(KB). ### Creating a knowledge base {#kb} -A KB consists of a list of entities with unique identifiers. Each such entity -has an entity vector that will be used to measure similarity with the context in -which an entity is used. These vectors are pretrained and stored in the KB before -the entity linking model will be trained. +A KB consists of a list of entities with unique identifiers. Each such entity +has an entity vector that will be used to measure similarity with the context in +which an entity is used. These vectors are pretrained and stored in the KB +before the entity linking model will be trained. -The following example shows how to build a knowledge base from scratch, -given a list of entities and potential aliases. The script further demonstrates -how to pretrain and store the entity vectors. To run this example, the script -needs access to a `vocab` instance or an `nlp` model with pretrained word embeddings. +The following example shows how to build a knowledge base from scratch, given a +list of entities and potential aliases. The script further demonstrates how to +pretrain and store the entity vectors. To run this example, the script needs +access to a `vocab` instance or an `nlp` model with pretrained word embeddings. ```python https://github.com/explosion/spaCy/tree/master/examples/training/pretrain_kb.py @@ -428,22 +441,22 @@ https://github.com/explosion/spaCy/tree/master/examples/training/pretrain_kb.py 1. **Load the model** you want to start with, or create an **empty model** using [`spacy.blank`](/api/top-level#spacy.blank) with the ID of your language and a pre-defined [`vocab`](/api/vocab) object. -2. **Pretrain the entity embeddings** by running the descriptions of the entities - through a simple encoder-decoder network. 
The current implementation requires - the `nlp` model to have access to pre-trained word embeddings, but a custom - implementation of this enoding step can also be used. +2. **Pretrain the entity embeddings** by running the descriptions of the + entities through a simple encoder-decoder network. The current implementation + requires the `nlp` model to have access to pre-trained word embeddings, but a + custom implementation of this enoding step can also be used. 3. **Construct the KB** by defining all entities with their pretrained vectors, - and all aliases with their prior probabilities. + and all aliases with their prior probabilities. 4. **Save** the KB using [`kb.dump`](/api/kb#dump). 5. **Test** the KB to make sure the entities were added correctly. ### Training an entity linking model {#entity-linker-model} -This example shows how to create an entity linker pipe using a previously created -knowledge base. The entity linker pipe is then trained with your own -examples. To do so, you'll need to provide -**example texts**, and the **character offsets** and **knowledge base identifiers** -of each entity contained in the texts. +This example shows how to create an entity linker pipe using a previously +created knowledge base. The entity linker pipe is then trained with your own +examples. To do so, you'll need to provide **example texts**, and the +**character offsets** and **knowledge base identifiers** of each entity +contained in the texts. ```python https://github.com/explosion/spaCy/tree/master/examples/training/train_entity_linker.py @@ -451,25 +464,23 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_entity_li #### Step by step guide {#step-by-step-entity-linker} -1. **Load the KB** you want to start with, and specify the path - to the `Vocab` object that was used to create this KB. - Then, create an **empty model** using - [`spacy.blank`](/api/top-level#spacy.blank) with the ID of your language. - Don't forget to add the KB to the entity linker, - and to add the entity linker to the pipeline. - In practical applications, you will want a more advanced pipeline including - also a component for [named entity recognition](/usage/training#ner). - If you're using a model with additional components, make sure to disable all other - pipeline components during training using - [`nlp.disable_pipes`](/api/language#disable_pipes). This way, you'll only be - training the entity linker. +1. **Load the KB** you want to start with, and specify the path to the `Vocab` + object that was used to create this KB. Then, create an **empty model** using + [`spacy.blank`](/api/top-level#spacy.blank) with the ID of your language. + Don't forget to add the KB to the entity linker, and to add the entity linker + to the pipeline. In practical applications, you will want a more advanced + pipeline including also a component for + [named entity recognition](/usage/training#ner). If you're using a model with + additional components, make sure to disable all other pipeline components + during training using [`nlp.disable_pipes`](/api/language#disable_pipes). + This way, you'll only be training the entity linker. 2. **Shuffle and loop over** the examples. For each example, **update the model** by calling [`nlp.update`](/api/language#update), which steps through - the annotated examples of the input. For each combination of a mention in text and - a potential KB identifier, the model makes a **prediction** whether or not - this is the correct match. 
It then - consults the annotations to see whether it was right. If it was wrong, it - adjusts its weights so that the correct combination will score higher next time. + the annotated examples of the input. For each combination of a mention in + text and a potential KB identifier, the model makes a **prediction** whether + or not this is the correct match. It then consults the annotations to see + whether it was right. If it was wrong, it adjusts its weights so that the + correct combination will score higher next time. 3. **Save** the trained model using [`nlp.to_disk`](/api/language#to_disk). 4. **Test** the model to make sure the entities in the training data are recognized correctly. From a31e9e1cd5176651942b32a7db5d00186aad97dc Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Thu, 12 Sep 2019 15:32:39 +0200 Subject: [PATCH 146/207] Update training docs [ci skip] --- website/docs/usage/training.md | 141 +++++++++++++++++---------------- 1 file changed, 71 insertions(+), 70 deletions(-) diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index e3386a64f..9489615bc 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -6,6 +6,7 @@ menu: - ['NER', 'ner'] - ['Tagger & Parser', 'tagger-parser'] - ['Text Classification', 'textcat'] + - ['Entity Linking', 'entity-linker'] - ['Tips and Advice', 'tips'] --- @@ -415,76 +416,6 @@ referred to as the "catastrophic forgetting" problem. 4. **Save** the trained model using [`nlp.to_disk`](/api/language#to_disk). 5. **Test** the model to make sure the new entity is recognized correctly. -## Entity linking {#entity-linker} - -To train an entity linking model, you first need to define a knowledge base -(KB). - -### Creating a knowledge base {#kb} - -A KB consists of a list of entities with unique identifiers. Each such entity -has an entity vector that will be used to measure similarity with the context in -which an entity is used. These vectors are pretrained and stored in the KB -before the entity linking model will be trained. - -The following example shows how to build a knowledge base from scratch, given a -list of entities and potential aliases. The script further demonstrates how to -pretrain and store the entity vectors. To run this example, the script needs -access to a `vocab` instance or an `nlp` model with pretrained word embeddings. - -```python -https://github.com/explosion/spaCy/tree/master/examples/training/pretrain_kb.py -``` - -#### Step by step guide {#step-by-step-kb} - -1. **Load the model** you want to start with, or create an **empty model** using - [`spacy.blank`](/api/top-level#spacy.blank) with the ID of your language and - a pre-defined [`vocab`](/api/vocab) object. -2. **Pretrain the entity embeddings** by running the descriptions of the - entities through a simple encoder-decoder network. The current implementation - requires the `nlp` model to have access to pre-trained word embeddings, but a - custom implementation of this enoding step can also be used. -3. **Construct the KB** by defining all entities with their pretrained vectors, - and all aliases with their prior probabilities. -4. **Save** the KB using [`kb.dump`](/api/kb#dump). -5. **Test** the KB to make sure the entities were added correctly. - -### Training an entity linking model {#entity-linker-model} - -This example shows how to create an entity linker pipe using a previously -created knowledge base. The entity linker pipe is then trained with your own -examples. 
To do so, you'll need to provide **example texts**, and the -**character offsets** and **knowledge base identifiers** of each entity -contained in the texts. - -```python -https://github.com/explosion/spaCy/tree/master/examples/training/train_entity_linker.py -``` - -#### Step by step guide {#step-by-step-entity-linker} - -1. **Load the KB** you want to start with, and specify the path to the `Vocab` - object that was used to create this KB. Then, create an **empty model** using - [`spacy.blank`](/api/top-level#spacy.blank) with the ID of your language. - Don't forget to add the KB to the entity linker, and to add the entity linker - to the pipeline. In practical applications, you will want a more advanced - pipeline including also a component for - [named entity recognition](/usage/training#ner). If you're using a model with - additional components, make sure to disable all other pipeline components - during training using [`nlp.disable_pipes`](/api/language#disable_pipes). - This way, you'll only be training the entity linker. -2. **Shuffle and loop over** the examples. For each example, **update the - model** by calling [`nlp.update`](/api/language#update), which steps through - the annotated examples of the input. For each combination of a mention in - text and a potential KB identifier, the model makes a **prediction** whether - or not this is the correct match. It then consults the annotations to see - whether it was right. If it was wrong, it adjusts its weights so that the - correct combination will score higher next time. -3. **Save** the trained model using [`nlp.to_disk`](/api/language#to_disk). -4. **Test** the model to make sure the entities in the training data are - recognized correctly. - ## Training the tagger and parser {#tagger-parser} ### Updating the Dependency Parser {#example-train-parser} @@ -665,6 +596,76 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_textcat.p 7. **Save** the trained model using [`nlp.to_disk`](/api/language#to_disk). 8. **Test** the model to make sure the text classifier works as expected. +## Entity linking {#entity-linker} + +To train an entity linking model, you first need to define a knowledge base +(KB). + +### Creating a knowledge base {#kb} + +A KB consists of a list of entities with unique identifiers. Each such entity +has an entity vector that will be used to measure similarity with the context in +which an entity is used. These vectors are pretrained and stored in the KB +before the entity linking model will be trained. + +The following example shows how to build a knowledge base from scratch, given a +list of entities and potential aliases. The script further demonstrates how to +pretrain and store the entity vectors. To run this example, the script needs +access to a `vocab` instance or an `nlp` model with pretrained word embeddings. + +```python +https://github.com/explosion/spaCy/tree/master/examples/training/pretrain_kb.py +``` + +#### Step by step guide {#step-by-step-kb} + +1. **Load the model** you want to start with, or create an **empty model** using + [`spacy.blank`](/api/top-level#spacy.blank) with the ID of your language and + a pre-defined [`vocab`](/api/vocab) object. +2. **Pretrain the entity embeddings** by running the descriptions of the + entities through a simple encoder-decoder network. The current implementation + requires the `nlp` model to have access to pre-trained word embeddings, but a + custom implementation of this enoding step can also be used. +3. 
**Construct the KB** by defining all entities with their pretrained vectors, + and all aliases with their prior probabilities. +4. **Save** the KB using [`kb.dump`](/api/kb#dump). +5. **Test** the KB to make sure the entities were added correctly. + +### Training an entity linking model {#entity-linker-model} + +This example shows how to create an entity linker pipe using a previously +created knowledge base. The entity linker pipe is then trained with your own +examples. To do so, you'll need to provide **example texts**, and the +**character offsets** and **knowledge base identifiers** of each entity +contained in the texts. + +```python +https://github.com/explosion/spaCy/tree/master/examples/training/train_entity_linker.py +``` + +#### Step by step guide {#step-by-step-entity-linker} + +1. **Load the KB** you want to start with, and specify the path to the `Vocab` + object that was used to create this KB. Then, create an **empty model** using + [`spacy.blank`](/api/top-level#spacy.blank) with the ID of your language. + Don't forget to add the KB to the entity linker, and to add the entity linker + to the pipeline. In practical applications, you will want a more advanced + pipeline including also a component for + [named entity recognition](/usage/training#ner). If you're using a model with + additional components, make sure to disable all other pipeline components + during training using [`nlp.disable_pipes`](/api/language#disable_pipes). + This way, you'll only be training the entity linker. +2. **Shuffle and loop over** the examples. For each example, **update the + model** by calling [`nlp.update`](/api/language#update), which steps through + the annotated examples of the input. For each combination of a mention in + text and a potential KB identifier, the model makes a **prediction** whether + or not this is the correct match. It then consults the annotations to see + whether it was right. If it was wrong, it adjusts its weights so that the + correct combination will score higher next time. +3. **Save** the trained model using [`nlp.to_disk`](/api/language#to_disk). +4. **Test** the model to make sure the entities in the training data are + recognized correctly. + ## Optimization tips and advice {#tips} There are lots of conflicting "recipes" for training deep neural networks at the From 38037d68169e07ffd0510818320da4cefb8dd3cc Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Thu, 12 Sep 2019 15:33:39 +0200 Subject: [PATCH 147/207] Update landing [ci skip] --- website/src/widgets/landing.js | 36 +++++++++++++++++----------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js index e9dec87f4..91fd756fa 100644 --- a/website/src/widgets/landing.js +++ b/website/src/widgets/landing.js @@ -150,6 +150,24 @@ const Landing = ({ data }) => { </LandingGrid> <LandingBannerGrid> + <LandingBanner + title="Prodigy: Radically efficient machine teaching" + label="From the makers of spaCy" + to="https://prodi.gy" + button="Try it out" + background="#eee" + color="#252a33" + small + > + Prodigy is an <strong>annotation tool</strong> so efficient that data scientists + can do the annotation themselves, enabling a new level of rapid iteration. + Whether you're working on entity recognition, intent detection or image + classification, Prodigy can help you <strong>train and evaluate</strong> your + models faster. 
Stream in your own examples or real-world data from live APIs, + update your model in real-time and chain models together to build more complex + systems. + </LandingBanner> + <LandingBanner title="spaCy IRL 2019: Two days of NLP" label="Watch the videos" @@ -168,24 +186,6 @@ const Landing = ({ data }) => { research, development and applications, with keynotes by Sebastian Ruder (DeepMind) and Yoav Goldberg (Allen AI). </LandingBanner> - - <LandingBanner - title="Prodigy: Radically efficient machine teaching" - label="From the makers of spaCy" - to="https://prodi.gy" - button="Try it out" - background="#eee" - color="#252a33" - small - > - Prodigy is an <strong>annotation tool</strong> so efficient that data scientists - can do the annotation themselves, enabling a new level of rapid iteration. - Whether you're working on entity recognition, intent detection or image - classification, Prodigy can help you <strong>train and evaluate</strong> your - models faster. Stream in your own examples or real-world data from live APIs, - update your model in real-time and chain models together to build more complex - systems. - </LandingBanner> </LandingBannerGrid> <LandingLogos title="spaCy is trusted by" logos={data.logosUsers}> From 82c16b7943adcee47ee3ea376e32486546d66043 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Thu, 12 Sep 2019 16:11:15 +0200 Subject: [PATCH 148/207] Remove u-strings and fix formatting [ci skip] --- website/README.md | 13 +- website/docs/api/annotation.md | 206 +++++++++--------- website/docs/api/cython-classes.md | 6 +- website/docs/api/cython-structs.md | 6 +- website/docs/api/dependencyparser.md | 2 +- website/docs/api/doc.md | 79 ++++--- website/docs/api/entitylinker.md | 129 +++++------ website/docs/api/entityrecognizer.md | 12 +- website/docs/api/goldparse.md | 10 +- website/docs/api/language.md | 8 +- website/docs/api/lemmatizer.md | 8 +- website/docs/api/lexeme.md | 18 +- website/docs/api/matcher.md | 4 +- website/docs/api/phrasematcher.md | 16 +- website/docs/api/pipeline-functions.md | 8 +- website/docs/api/sentencizer.md | 2 +- website/docs/api/span.md | 74 +++---- website/docs/api/stringstore.md | 30 +-- website/docs/api/tagger.md | 13 +- website/docs/api/textcategorizer.md | 12 +- website/docs/api/token.md | 60 ++--- website/docs/api/tokenizer.md | 13 +- website/docs/api/top-level.md | 36 +-- website/docs/api/vectors.md | 22 +- website/docs/api/vocab.md | 28 +-- website/docs/usage/101/_named-entities.md | 12 +- website/docs/usage/101/_pos-deps.md | 4 +- website/docs/usage/101/_tokenization.md | 2 +- website/docs/usage/101/_vectors-similarity.md | 8 +- website/docs/usage/adding-languages.md | 8 +- website/docs/usage/index.md | 2 +- website/docs/usage/linguistic-features.md | 145 ++++++------ website/docs/usage/models.md | 6 +- website/docs/usage/processing-pipelines.md | 30 +-- website/docs/usage/rule-based-matching.md | 42 ++-- website/docs/usage/saving-loading.md | 16 +- website/docs/usage/spacy-101.md | 108 ++++----- website/docs/usage/training.md | 8 +- website/docs/usage/v2-1.md | 8 +- website/docs/usage/v2.md | 42 ++-- website/docs/usage/vectors-similarity.md | 15 +- website/docs/usage/visualizers.md | 17 +- website/meta/universe.json | 12 +- website/src/widgets/quickstart-models.js | 2 +- 44 files changed, 644 insertions(+), 658 deletions(-) diff --git a/website/README.md b/website/README.md index be817225d..a02d5a151 100644 --- a/website/README.md +++ b/website/README.md @@ -309,7 +309,7 @@ indented block as plain text and preserve 
whitespace. ### Using spaCy import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"This is a sentence.") +doc = nlp("This is a sentence.") for token in doc: print(token.text, token.pos_) ``` @@ -335,9 +335,9 @@ from spacy.matcher import Matcher nlp = spacy.load('en_core_web_sm') matcher = Matcher(nlp.vocab) -pattern = [{'LOWER': 'hello'}, {'IS_PUNCT': True}, {'LOWER': 'world'}] -matcher.add('HelloWorld', None, pattern) -doc = nlp(u'Hello, world! Hello world!') +pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}] +matcher.add("HelloWorld", None, pattern) +doc = nlp("Hello, world! Hello world!") matches = matcher(doc) ``` @@ -360,7 +360,7 @@ interactive widget defaults to a regular code block. ### {executable="true"} import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"This is a sentence.") +doc = nlp("This is a sentence.") for token in doc: print(token.text, token.pos_) ``` @@ -457,7 +457,8 @@ sit amet dignissim justo congue. ## Setup and installation {#setup} Before running the setup, make sure your versions of -[Node](https://nodejs.org/en/) and [npm](https://www.npmjs.com/) are up to date. Node v10.15 or later is required. +[Node](https://nodejs.org/en/) and [npm](https://www.npmjs.com/) are up to date. +Node v10.15 or later is required. ```bash # Clone the repository diff --git a/website/docs/api/annotation.md b/website/docs/api/annotation.md index ac888cec9..2c52d197a 100644 --- a/website/docs/api/annotation.md +++ b/website/docs/api/annotation.md @@ -16,7 +16,7 @@ menu: > ```python > from spacy.lang.en import English > nlp = English() -> tokens = nlp(u"Some\\nspaces and\\ttab characters") +> tokens = nlp("Some\\nspaces and\\ttab characters") > tokens_text = [t.text for t in tokens] > assert tokens_text == ["Some", "\\n", "spaces", " ", "and", "\\t", "tab", "characters"] > ``` @@ -186,63 +186,63 @@ The German part-of-speech tagger uses the annotation scheme. We also map the tags to the simpler Google Universal POS tag set. 
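To see the fine-grained TIGER tag and the mapped Universal POS tag side by side, you can compare `Token.tag_` and `Token.pos_` — a minimal sketch, assuming the small German model `de_core_news_sm` is installed:

```python
import spacy

nlp = spacy.load("de_core_news_sm")
doc = nlp("Das ist ein Satz.")
for token in doc:
    # tag_ is the fine-grained TIGER tag, pos_ the mapped Universal POS tag
    print(token.text, token.tag_, token.pos_)
```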
-| Tag | POS | Morphology | Description | -| --------- | ------- | ------------------------------------------- | ------------------------------------------------- | -| `$(` | `PUNCT` | `PunctType=brck` | other sentence-internal punctuation mark | -| `$,` | `PUNCT` | `PunctType=comm` | comma | -| `$.` | `PUNCT` | `PunctType=peri` | sentence-final punctuation mark | -| `ADJA` | `ADJ` | | adjective, attributive | -| `ADJD` | `ADJ` | `Variant=short` | adjective, adverbial or predicative | -| `ADV` | `ADV` | | adverb | -| `APPO` | `ADP` | `AdpType=post` | postposition | -| `APPR` | `ADP` | `AdpType=prep` | preposition; circumposition left | -| `APPRART` | `ADP` | `AdpType=prep PronType=art` | preposition with article | -| `APZR` | `ADP` | `AdpType=circ` | circumposition right | -| `ART` | `DET` | `PronType=art` | definite or indefinite article | -| `CARD` | `NUM` | `NumType=card` | cardinal number | -| `FM` | `X` | `Foreign=yes` | foreign language material | -| `ITJ` | `INTJ` | | interjection | -| `KOKOM` | `CONJ` | `ConjType=comp` | comparative conjunction | -| `KON` | `CONJ` | | coordinate conjunction | -| `KOUI` | `SCONJ` | | subordinate conjunction with "zu" and infinitive | -| `KOUS` | `SCONJ` | | subordinate conjunction with sentence | -| `NE` | `PROPN` | | proper noun | -| `NNE` | `PROPN` | | proper noun | -| `NN` | `NOUN` | | noun, singular or mass | -| `PROAV` | `ADV` | `PronType=dem` | pronominal adverb | -| `PDAT` | `DET` | `PronType=dem` | attributive demonstrative pronoun | -| `PDS` | `PRON` | `PronType=dem` | substituting demonstrative pronoun | -| `PIAT` | `DET` | `PronType=ind\|neg\|tot` | attributive indefinite pronoun without determiner | -| `PIS` | `PRON` | `PronType=ind\|neg\|tot` | substituting indefinite pronoun | -| `PPER` | `PRON` | `PronType=prs` | non-reflexive personal pronoun | -| `PPOSAT` | `DET` | `Poss=yes PronType=prs` | attributive possessive pronoun | -| `PPOSS` | `PRON` | `PronType=rel` | substituting possessive pronoun | -| `PRELAT` | `DET` | `PronType=rel` | attributive relative pronoun | -| `PRELS` | `PRON` | `PronType=rel` | substituting relative pronoun | -| `PRF` | `PRON` | `PronType=prs Reflex=yes` | reflexive personal pronoun | -| `PTKA` | `PART` | | particle with adjective or adverb | -| `PTKANT` | `PART` | `PartType=res` | answer particle | -| `PTKNEG` | `PART` | `Negative=yes` | negative particle | -| `PTKVZ` | `PART` | `PartType=vbp` | separable verbal particle | -| `PTKZU` | `PART` | `PartType=inf` | "zu" before infinitive | -| `PWAT` | `DET` | `PronType=int` | attributive interrogative pronoun | -| `PWAV` | `ADV` | `PronType=int` | adverbial interrogative or relative pronoun | -| `PWS` | `PRON` | `PronType=int` | substituting interrogative pronoun | -| `TRUNC` | `X` | `Hyph=yes` | word remnant | -| `VAFIN` | `AUX` | `Mood=ind VerbForm=fin` | finite verb, auxiliary | -| `VAIMP` | `AUX` | `Mood=imp VerbForm=fin` | imperative, auxiliary | -| `VAINF` | `AUX` | `VerbForm=inf` | infinitive, auxiliary | -| `VAPP` | `AUX` | `Aspect=perf VerbForm=fin` | perfect participle, auxiliary | -| `VMFIN` | `VERB` | `Mood=ind VerbForm=fin VerbType=mod` | finite verb, modal | -| `VMINF` | `VERB` | `VerbForm=fin VerbType=mod` | infinitive, modal | -| `VMPP` | `VERB` | `Aspect=perf VerbForm=part VerbType=mod` | perfect participle, modal | -| `VVFIN` | `VERB` | `Mood=ind VerbForm=fin` | finite verb, full | -| `VVIMP` | `VERB` | `Mood=imp VerbForm=fin` | imperative, full | -| `VVINF` | `VERB` | `VerbForm=inf` | infinitive, full | -| `VVIZU` | `VERB` | `VerbForm=inf` | 
infinitive with "zu", full | -| `VVPP` | `VERB` | `Aspect=perf VerbForm=part` | perfect participle, full | -| `XY` | `X` | | non-word containing non-letter | -| `SP` | `SPACE` | | space | +| Tag | POS | Morphology | Description | +| --------- | ------- | ---------------------------------------- | ------------------------------------------------- | +| `$(` | `PUNCT` | `PunctType=brck` | other sentence-internal punctuation mark | +| `$,` | `PUNCT` | `PunctType=comm` | comma | +| `$.` | `PUNCT` | `PunctType=peri` | sentence-final punctuation mark | +| `ADJA` | `ADJ` | | adjective, attributive | +| `ADJD` | `ADJ` | `Variant=short` | adjective, adverbial or predicative | +| `ADV` | `ADV` | | adverb | +| `APPO` | `ADP` | `AdpType=post` | postposition | +| `APPR` | `ADP` | `AdpType=prep` | preposition; circumposition left | +| `APPRART` | `ADP` | `AdpType=prep PronType=art` | preposition with article | +| `APZR` | `ADP` | `AdpType=circ` | circumposition right | +| `ART` | `DET` | `PronType=art` | definite or indefinite article | +| `CARD` | `NUM` | `NumType=card` | cardinal number | +| `FM` | `X` | `Foreign=yes` | foreign language material | +| `ITJ` | `INTJ` | | interjection | +| `KOKOM` | `CONJ` | `ConjType=comp` | comparative conjunction | +| `KON` | `CONJ` | | coordinate conjunction | +| `KOUI` | `SCONJ` | | subordinate conjunction with "zu" and infinitive | +| `KOUS` | `SCONJ` | | subordinate conjunction with sentence | +| `NE` | `PROPN` | | proper noun | +| `NNE` | `PROPN` | | proper noun | +| `NN` | `NOUN` | | noun, singular or mass | +| `PROAV` | `ADV` | `PronType=dem` | pronominal adverb | +| `PDAT` | `DET` | `PronType=dem` | attributive demonstrative pronoun | +| `PDS` | `PRON` | `PronType=dem` | substituting demonstrative pronoun | +| `PIAT` | `DET` | `PronType=ind\|neg\|tot` | attributive indefinite pronoun without determiner | +| `PIS` | `PRON` | `PronType=ind\|neg\|tot` | substituting indefinite pronoun | +| `PPER` | `PRON` | `PronType=prs` | non-reflexive personal pronoun | +| `PPOSAT` | `DET` | `Poss=yes PronType=prs` | attributive possessive pronoun | +| `PPOSS` | `PRON` | `PronType=rel` | substituting possessive pronoun | +| `PRELAT` | `DET` | `PronType=rel` | attributive relative pronoun | +| `PRELS` | `PRON` | `PronType=rel` | substituting relative pronoun | +| `PRF` | `PRON` | `PronType=prs Reflex=yes` | reflexive personal pronoun | +| `PTKA` | `PART` | | particle with adjective or adverb | +| `PTKANT` | `PART` | `PartType=res` | answer particle | +| `PTKNEG` | `PART` | `Negative=yes` | negative particle | +| `PTKVZ` | `PART` | `PartType=vbp` | separable verbal particle | +| `PTKZU` | `PART` | `PartType=inf` | "zu" before infinitive | +| `PWAT` | `DET` | `PronType=int` | attributive interrogative pronoun | +| `PWAV` | `ADV` | `PronType=int` | adverbial interrogative or relative pronoun | +| `PWS` | `PRON` | `PronType=int` | substituting interrogative pronoun | +| `TRUNC` | `X` | `Hyph=yes` | word remnant | +| `VAFIN` | `AUX` | `Mood=ind VerbForm=fin` | finite verb, auxiliary | +| `VAIMP` | `AUX` | `Mood=imp VerbForm=fin` | imperative, auxiliary | +| `VAINF` | `AUX` | `VerbForm=inf` | infinitive, auxiliary | +| `VAPP` | `AUX` | `Aspect=perf VerbForm=fin` | perfect participle, auxiliary | +| `VMFIN` | `VERB` | `Mood=ind VerbForm=fin VerbType=mod` | finite verb, modal | +| `VMINF` | `VERB` | `VerbForm=fin VerbType=mod` | infinitive, modal | +| `VMPP` | `VERB` | `Aspect=perf VerbForm=part VerbType=mod` | perfect participle, modal | +| `VVFIN` | `VERB` | `Mood=ind VerbForm=fin` | 
finite verb, full | +| `VVIMP` | `VERB` | `Mood=imp VerbForm=fin` | imperative, full | +| `VVINF` | `VERB` | `VerbForm=inf` | infinitive, full | +| `VVIZU` | `VERB` | `VerbForm=inf` | infinitive with "zu", full | +| `VVPP` | `VERB` | `Aspect=perf VerbForm=part` | perfect participle, full | +| `XY` | `X` | | non-word containing non-letter | +| `SP` | `SPACE` | | space | </Accordion> @@ -379,51 +379,51 @@ The German dependency labels use the [TIGER Treebank](http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/index.html) annotation scheme. -| Label | Description | -| ------ | ------------------------------- | -| `ac` | adpositional case marker | -| `adc` | adjective component | -| `ag` | genitive attribute | -| `ams` | measure argument of adjective | -| `app` | apposition | -| `avc` | adverbial phrase component | -| `cc` | comparative complement | -| `cd` | coordinating conjunction | -| `cj` | conjunct | -| `cm` | comparative conjunction | -| `cp` | complementizer | -| `cvc` | collocational verb construction | -| `da` | dative | -| `dm` | discourse marker | -| `ep` | expletive es | -| `ju` | junctor | -| `mnr` | postnominal modifier | -| `mo` | modifier | -| `ng` | negation | -| `nk` | noun kernel element | -| `nmc` | numerical component | -| `oa` | accusative object | -| `oa2` | second accusative object | -| `oc` | clausal object | -| `og` | genitive object | -| `op` | prepositional object | -| `par` | parenthetical element | -| `pd` | predicate | -| `pg` | phrasal genitive | -| `ph` | placeholder | -| `pm` | morphological particle | -| `pnc` | proper noun component | -| `punct` | punctuation | -| `rc` | relative clause | -| `re` | repeated element | -| `rs` | reported speech | -| `sb` | subject | -| `sbp` | passivized subject (PP) | -| `sp` | subject or predicate | -| `svp` | separable verb prefix | -| `uc` | unit component | -| `vo` | vocative | -| `ROOT` | root | +| Label | Description | +| ------- | ------------------------------- | +| `ac` | adpositional case marker | +| `adc` | adjective component | +| `ag` | genitive attribute | +| `ams` | measure argument of adjective | +| `app` | apposition | +| `avc` | adverbial phrase component | +| `cc` | comparative complement | +| `cd` | coordinating conjunction | +| `cj` | conjunct | +| `cm` | comparative conjunction | +| `cp` | complementizer | +| `cvc` | collocational verb construction | +| `da` | dative | +| `dm` | discourse marker | +| `ep` | expletive es | +| `ju` | junctor | +| `mnr` | postnominal modifier | +| `mo` | modifier | +| `ng` | negation | +| `nk` | noun kernel element | +| `nmc` | numerical component | +| `oa` | accusative object | +| `oa2` | second accusative object | +| `oc` | clausal object | +| `og` | genitive object | +| `op` | prepositional object | +| `par` | parenthetical element | +| `pd` | predicate | +| `pg` | phrasal genitive | +| `ph` | placeholder | +| `pm` | morphological particle | +| `pnc` | proper noun component | +| `punct` | punctuation | +| `rc` | relative clause | +| `re` | repeated element | +| `rs` | reported speech | +| `sb` | subject | +| `sbp` | passivized subject (PP) | +| `sp` | subject or predicate | +| `svp` | separable verb prefix | +| `uc` | unit component | +| `vo` | vocative | +| `ROOT` | root | </Accordion> diff --git a/website/docs/api/cython-classes.md b/website/docs/api/cython-classes.md index 4d188d90f..77d6fdd10 100644 --- a/website/docs/api/cython-classes.md +++ b/website/docs/api/cython-classes.md @@ -45,9 +45,9 @@ Append a token to the `Doc`. 
The token can be provided as a > from spacy.vocab cimport Vocab > > doc = Doc(Vocab()) -> lexeme = doc.vocab.get(u'hello') +> lexeme = doc.vocab.get("hello") > doc.push_back(lexeme, True) -> assert doc.text == u'hello ' +> assert doc.text == "hello " > ``` | Name | Type | Description | @@ -164,7 +164,7 @@ vocabulary. > #### Example > > ```python -> lexeme = vocab.get(vocab.mem, u'hello') +> lexeme = vocab.get(vocab.mem, "hello") > ``` | Name | Type | Description | diff --git a/website/docs/api/cython-structs.md b/website/docs/api/cython-structs.md index 0e427a8d5..935bce25d 100644 --- a/website/docs/api/cython-structs.md +++ b/website/docs/api/cython-structs.md @@ -88,7 +88,7 @@ Find a token in a `TokenC*` array by the offset of its first character. > from spacy.tokens.doc cimport Doc, token_by_start > from spacy.vocab cimport Vocab > -> doc = Doc(Vocab(), words=[u'hello', u'world']) +> doc = Doc(Vocab(), words=["hello", "world"]) > assert token_by_start(doc.c, doc.length, 6) == 1 > assert token_by_start(doc.c, doc.length, 4) == -1 > ``` @@ -110,7 +110,7 @@ Find a token in a `TokenC*` array by the offset of its final character. > from spacy.tokens.doc cimport Doc, token_by_end > from spacy.vocab cimport Vocab > -> doc = Doc(Vocab(), words=[u'hello', u'world']) +> doc = Doc(Vocab(), words=["hello", "world"]) > assert token_by_end(doc.c, doc.length, 5) == 0 > assert token_by_end(doc.c, doc.length, 1) == -1 > ``` @@ -134,7 +134,7 @@ attribute, in order to make the parse tree navigation consistent. > from spacy.tokens.doc cimport Doc, set_children_from_heads > from spacy.vocab cimport Vocab > -> doc = Doc(Vocab(), words=[u'Baileys', u'from', u'a', u'shoe']) +> doc = Doc(Vocab(), words=["Baileys", "from", "a", "shoe"]) > doc.c[0].head = 0 > doc.c[1].head = 0 > doc.c[2].head = 3 diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index 58acc4425..df0df3e38 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -58,7 +58,7 @@ and all pipeline components are applied to the `Doc` in order. Both > > ```python > parser = DependencyParser(nlp.vocab) -> doc = nlp(u"This is a sentence.") +> doc = nlp("This is a sentence.") > # This usually happens under the hood > processed = parser(doc) > ``` diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 431d3a092..ad684f51e 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -20,11 +20,11 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the > > ```python > # Construction 1 -> doc = nlp(u"Some text") +> doc = nlp("Some text") > > # Construction 2 > from spacy.tokens import Doc -> words = [u"hello", u"world", u"!"] +> words = ["hello", "world", "!"] > spaces = [True, False, False] > doc = Doc(nlp.vocab, words=words, spaces=spaces) > ``` @@ -45,7 +45,7 @@ Negative indexing is supported, and follows the usual Python semantics, i.e. > #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > assert doc[0].text == "Give" > assert doc[-1].text == "." > span = doc[1:3] @@ -76,8 +76,8 @@ Iterate over `Token` objects, from which the annotations can be easily accessed. 
> #### Example > > ```python -> doc = nlp(u'Give it back') -> assert [t.text for t in doc] == [u'Give', u'it', u'back'] +> doc = nlp("Give it back") +> assert [t.text for t in doc] == ["Give", "it", "back"] > ``` This is the main way of accessing [`Token`](/api/token) objects, which are the @@ -96,7 +96,7 @@ Get the number of tokens in the document. > #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > assert len(doc) == 7 > ``` @@ -114,9 +114,9 @@ details, see the documentation on > > ```python > from spacy.tokens import Doc -> city_getter = lambda doc: any(city in doc.text for city in ('New York', 'Paris', 'Berlin')) -> Doc.set_extension('has_city', getter=city_getter) -> doc = nlp(u'I like New York') +> city_getter = lambda doc: any(city in doc.text for city in ("New York", "Paris", "Berlin")) +> Doc.set_extension("has_city", getter=city_getter) +> doc = nlp("I like New York") > assert doc._.has_city > ``` @@ -192,8 +192,8 @@ the character indices don't map to a valid span. > #### Example > > ```python -> doc = nlp(u"I like New York") -> span = doc.char_span(7, 15, label=u"GPE") +> doc = nlp("I like New York") +> span = doc.char_span(7, 15, label="GPE") > assert span.text == "New York" > ``` @@ -213,8 +213,8 @@ using an average of word vectors. > #### Example > > ```python -> apples = nlp(u"I like apples") -> oranges = nlp(u"I like oranges") +> apples = nlp("I like apples") +> oranges = nlp("I like oranges") > apples_oranges = apples.similarity(oranges) > oranges_apples = oranges.similarity(apples) > assert apples_oranges == oranges_apples @@ -235,7 +235,7 @@ attribute ID. > > ```python > from spacy.attrs import ORTH -> doc = nlp(u"apple apple orange banana") +> doc = nlp("apple apple orange banana") > assert doc.count_by(ORTH) == {7024L: 1, 119552L: 1, 2087L: 2} > doc.to_array([ORTH]) > # array([[11880], [11880], [7561], [12800]]) @@ -255,7 +255,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor. > #### Example > > ```python -> doc = nlp(u"This is a test") +> doc = nlp("This is a test") > matrix = doc.get_lca_matrix() > # array([[0, 1, 1, 1], [1, 1, 1, 1], [1, 1, 2, 3], [1, 1, 3, 3]], dtype=int32) > ``` @@ -274,7 +274,7 @@ They'll be added to an `"_"` key in the data, e.g. `"_": {"foo": "bar"}`. > #### Example > > ```python -> doc = nlp(u"Hello") +> doc = nlp("Hello") > json_doc = doc.to_json() > ``` > @@ -342,7 +342,7 @@ array of attributes. > ```python > from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA > from spacy.tokens import Doc -> doc = nlp(u"Hello world!") +> doc = nlp("Hello world!") > np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA]) > doc2 = Doc(doc.vocab, words=[t.text for t in doc]) > doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array) @@ -396,7 +396,7 @@ Serialize, i.e. export the document contents to a binary string. > #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > doc_bytes = doc.to_bytes() > ``` @@ -413,10 +413,9 @@ Deserialize, i.e. import the document contents from a binary string. > > ```python > from spacy.tokens import Doc -> text = u"Give it back! He pleaded." -> doc = nlp(text) -> bytes = doc.to_bytes() -> doc2 = Doc(doc.vocab).from_bytes(bytes) +> doc = nlp("Give it back! He pleaded.") +> doc_bytes = doc.to_bytes() +> doc2 = Doc(doc.vocab).from_bytes(doc_bytes) > assert doc.text == doc2.text > ``` @@ -457,9 +456,9 @@ dictionary mapping attribute names to values as the `"_"` key. 
> #### Example > > ```python -> doc = nlp(u"I like David Bowie") +> doc = nlp("I like David Bowie") > with doc.retokenize() as retokenizer: -> attrs = {"LEMMA": u"David Bowie"} +> attrs = {"LEMMA": "David Bowie"} > retokenizer.merge(doc[2:4], attrs=attrs) > ``` @@ -489,7 +488,7 @@ underlying lexeme (if they're context-independent lexical attributes like > #### Example > > ```python -> doc = nlp(u"I live in NewYork") +> doc = nlp("I live in NewYork") > with doc.retokenize() as retokenizer: > heads = [(doc[3], 1), doc[2]] > attrs = {"POS": ["PROPN", "PROPN"], @@ -521,9 +520,9 @@ and end token boundaries, the document remains unchanged. > #### Example > > ```python -> doc = nlp(u"Los Angeles start.") +> doc = nlp("Los Angeles start.") > doc.merge(0, len("Los Angeles"), "NNP", "Los Angeles", "GPE") -> assert [t.text for t in doc] == [u"Los Angeles", u"start", u"."] +> assert [t.text for t in doc] == ["Los Angeles", "start", "."] > ``` | Name | Type | Description | @@ -541,11 +540,11 @@ objects, if the entity recognizer has been applied. > #### Example > > ```python -> doc = nlp(u"Mr. Best flew to New York on Saturday morning.") +> doc = nlp("Mr. Best flew to New York on Saturday morning.") > ents = list(doc.ents) > assert ents[0].label == 346 -> assert ents[0].label_ == u"PERSON" -> assert ents[0].text == u"Mr. Best" +> assert ents[0].label_ == "PERSON" +> assert ents[0].text == "Mr. Best" > ``` | Name | Type | Description | @@ -563,10 +562,10 @@ relative clauses. > #### Example > > ```python -> doc = nlp(u"A phrase with another phrase occurs.") +> doc = nlp("A phrase with another phrase occurs.") > chunks = list(doc.noun_chunks) -> assert chunks[0].text == u"A phrase" -> assert chunks[1].text == u"another phrase" +> assert chunks[0].text == "A phrase" +> assert chunks[1].text == "another phrase" > ``` | Name | Type | Description | @@ -583,10 +582,10 @@ will be unavailable. > #### Example > > ```python -> doc = nlp(u"This is a sentence. Here's another...") +> doc = nlp("This is a sentence. Here's another...") > sents = list(doc.sents) > assert len(sents) == 2 -> assert [s.root.text for s in sents] == [u"is", u"'s"] +> assert [s.root.text for s in sents] == ["is", "'s"] > ``` | Name | Type | Description | @@ -600,7 +599,7 @@ A boolean value indicating whether a word vector is associated with the object. > #### Example > > ```python -> doc = nlp(u"I like apples") +> doc = nlp("I like apples") > assert doc.has_vector > ``` @@ -616,8 +615,8 @@ vectors. > #### Example > > ```python -> doc = nlp(u"I like apples") -> assert doc.vector.dtype == 'float32' +> doc = nlp("I like apples") +> assert doc.vector.dtype == "float32" > assert doc.vector.shape == (300,) > ``` @@ -632,8 +631,8 @@ The L2 norm of the document's vector representation. > #### Example > > ```python -> doc1 = nlp(u"I like apples") -> doc2 = nlp(u"I like oranges") +> doc1 = nlp("I like apples") +> doc2 = nlp("I like oranges") > doc1.vector_norm # 4.54232424414368 > doc2.vector_norm # 3.304373298575751 > assert doc1.vector_norm != doc2.vector_norm diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index 64db50943..88131761f 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -1,6 +1,8 @@ --- title: EntityLinker -teaser: Functionality to disambiguate a named entity in text to a unique knowledge base identifier. +teaser: + Functionality to disambiguate a named entity in text to a unique knowledge + base identifier. 
tag: class source: spacy/pipeline/pipes.pyx new: 2.2 @@ -13,9 +15,9 @@ via the ID `"entity_linker"`. ## EntityLinker.Model {#model tag="classmethod"} Initialize a model for the pipe. The model should implement the -`thinc.neural.Model` API, and should contain a field `tok2vec` that contains -the context encoder. Wrappers are under development for most major machine -learning libraries. +`thinc.neural.Model` API, and should contain a field `tok2vec` that contains the +context encoder. Wrappers are under development for most major machine learning +libraries. | Name | Type | Description | | ----------- | ------ | ------------------------------------- | @@ -40,30 +42,29 @@ shortcut for this and instantiate the component using its string name and > entity_linker.from_disk("/path/to/model") > ``` -| Name | Type | Description | -| --------------- | ----------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The shared vocabulary. | -| `model` | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. | -| `hidden_width` | int | Width of the hidden layer of the entity linking model, defaults to 128. | -| `incl_prior` | bool | Whether or not to include prior probabilities in the model. Defaults to True. | -| `incl_context` | bool | Whether or not to include the local context in the model (if not: only prior probabilites are used). Defaults to True. | -| **RETURNS** | `EntityLinker` | The newly constructed object. | +| Name | Type | Description | +| -------------- | ----------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The shared vocabulary. | +| `model` | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. | +| `hidden_width` | int | Width of the hidden layer of the entity linking model, defaults to 128. | +| `incl_prior` | bool | Whether or not to include prior probabilities in the model. Defaults to True. | +| `incl_context` | bool | Whether or not to include the local context in the model (if not: only prior probabilites are used). Defaults to True. | +| **RETURNS** | `EntityLinker` | The newly constructed object. | ## EntityLinker.\_\_call\_\_ {#call tag="method"} Apply the pipe to one document. The document is modified in place, and returned. This usually happens under the hood when the `nlp` object is called on a text and all pipeline components are applied to the `Doc` in order. Both -[`__call__`](/api/entitylinker#call) and -[`pipe`](/api/entitylinker#pipe) delegate to the -[`predict`](/api/entitylinker#predict) and -[`set_annotations`](/api/entitylinker#set_annotations) methods. +[`__call__`](/api/entitylinker#call) and [`pipe`](/api/entitylinker#pipe) +delegate to the [`predict`](/api/entitylinker#predict) and +[`set_annotations`](/api/entitylinker#set_annotations) methods. 
> #### Example > > ```python > entity_linker = EntityLinker(nlp.vocab) -> doc = nlp(u"This is a sentence.") +> doc = nlp("This is a sentence.") > # This usually happens under the hood > processed = entity_linker(doc) > ``` @@ -107,14 +108,15 @@ Apply the pipeline's model to a batch of docs, without modifying them. > kb_ids, tensors = entity_linker.predict([doc1, doc2]) > ``` -| Name | Type | Description | -| ----------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `docs` | iterable | The documents to predict. | -| **RETURNS** | tuple | A `(kb_ids, tensors)` tuple where `kb_ids` are the model's predicted KB identifiers for the entities in the `docs`, and `tensors` are the token representations used to predict these identifiers. | +| Name | Type | Description | +| ----------- | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `docs` | iterable | The documents to predict. | +| **RETURNS** | tuple | A `(kb_ids, tensors)` tuple where `kb_ids` are the model's predicted KB identifiers for the entities in the `docs`, and `tensors` are the token representations used to predict these identifiers. | ## EntityLinker.set_annotations {#set_annotations tag="method"} -Modify a batch of documents, using pre-computed entity IDs for a list of named entities. +Modify a batch of documents, using pre-computed entity IDs for a list of named +entities. > #### Example > @@ -124,16 +126,17 @@ Modify a batch of documents, using pre-computed entity IDs for a list of named e > entity_linker.set_annotations([doc1, doc2], kb_ids, tensors) > ``` -| Name | Type | Description | -| ---------- | -------- | --------------------------------------------------------------------------------------------------- | -| `docs` | iterable | The documents to modify. | -| `kb_ids` | iterable | The knowledge base identifiers for the entities in the docs, predicted by `EntityLinker.predict`. | -| `tensors` | iterable | The token representations used to predict the identifiers. | +| Name | Type | Description | +| --------- | -------- | ------------------------------------------------------------------------------------------------- | +| `docs` | iterable | The documents to modify. | +| `kb_ids` | iterable | The knowledge base identifiers for the entities in the docs, predicted by `EntityLinker.predict`. | +| `tensors` | iterable | The token representations used to predict the identifiers. | ## EntityLinker.update {#update tag="method"} Learn from a batch of documents and gold-standard information, updating both the -pipe's entity linking model and context encoder. Delegates to [`predict`](/api/entitylinker#predict) and +pipe's entity linking model and context encoder. Delegates to +[`predict`](/api/entitylinker#predict) and [`get_loss`](/api/entitylinker#get_loss). > #### Example @@ -145,18 +148,18 @@ pipe's entity linking model and context encoder. Delegates to [`predict`](/api/e > entity_linker.update([doc1, doc2], [gold1, gold2], losses=losses, sgd=optimizer) > ``` -| Name | Type | Description | -| -------- | -------- | ------------------------------------------------------------------------------------------------------------- | -| `docs` | iterable | A batch of documents to learn from. 
| -| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. | -| `drop` | float | The dropout rate, used both for the EL model and the context encoder. | -| `sgd` | callable | The optimizer for the EL model. Should take two arguments `weights` and `gradient`, and an optional ID. | -| `losses` | dict | Optional record of the loss during training. The value keyed by the model's name is updated. | +| Name | Type | Description | +| -------- | -------- | ------------------------------------------------------------------------------------------------------- | +| `docs` | iterable | A batch of documents to learn from. | +| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. | +| `drop` | float | The dropout rate, used both for the EL model and the context encoder. | +| `sgd` | callable | The optimizer for the EL model. Should take two arguments `weights` and `gradient`, and an optional ID. | +| `losses` | dict | Optional record of the loss during training. The value keyed by the model's name is updated. | ## EntityLinker.get_loss {#get_loss tag="method"} -Find the loss and gradient of loss for the entities in a batch of documents and their -predicted scores. +Find the loss and gradient of loss for the entities in a batch of documents and +their predicted scores. > #### Example > @@ -166,17 +169,18 @@ predicted scores. > loss, d_loss = entity_linker.get_loss(docs, [gold1, gold2], kb_ids, tensors) > ``` -| Name | Type | Description | -| --------------- | -------- | ------------------------------------------------------------ | -| `docs` | iterable | The batch of documents. | -| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. | -| `kb_ids` | iterable | KB identifiers representing the model's predictions. | -| `tensors` | iterable | The token representations used to predict the identifiers | -| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. | +| Name | Type | Description | +| ----------- | -------- | ------------------------------------------------------------ | +| `docs` | iterable | The batch of documents. | +| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. | +| `kb_ids` | iterable | KB identifiers representing the model's predictions. | +| `tensors` | iterable | The token representations used to predict the identifiers | +| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. | ## EntityLinker.set_kb {#set_kb tag="method"} -Define the knowledge base (KB) used for disambiguating named entities to KB identifiers. +Define the knowledge base (KB) used for disambiguating named entities to KB +identifiers. > #### Example > @@ -185,15 +189,16 @@ Define the knowledge base (KB) used for disambiguating named entities to KB iden > entity_linker.set_kb(kb) > ``` -| Name | Type | Description | -| --------------- | --------------- | ------------------------------------------------------------ | -| `kb` | `KnowledgeBase` | The [`KnowledgeBase`](/api/kb). | +| Name | Type | Description | +| ---- | --------------- | ------------------------------- | +| `kb` | `KnowledgeBase` | The [`KnowledgeBase`](/api/kb). | ## EntityLinker.begin_training {#begin_training tag="method"} Initialize the pipe for training, using data examples if available. If no model -has been initialized yet, the model is added. -Before calling this method, a knowledge base should have been defined with [`set_kb`](/api/entitylinker#set_kb). 
+has been initialized yet, the model is added. Before calling this method, a +knowledge base should have been defined with +[`set_kb`](/api/entitylinker#set_kb). > #### Example > @@ -204,12 +209,12 @@ Before calling this method, a knowledge base should have been defined with [`set > optimizer = entity_linker.begin_training(pipeline=nlp.pipeline) > ``` -| Name | Type | Description | -| ------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `gold_tuples` | iterable | Optional gold-standard annotations from which to construct [`GoldParse`](/api/goldparse) objects. | -| `pipeline` | list | Optional list of pipeline components that this component is part of. | -| `sgd` | callable | An optional optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. Will be created via [`EntityLinker`](/api/entitylinker#create_optimizer) if not set. | -| **RETURNS** | callable | An optimizer. | +| Name | Type | Description | +| ------------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `gold_tuples` | iterable | Optional gold-standard annotations from which to construct [`GoldParse`](/api/goldparse) objects. | +| `pipeline` | list | Optional list of pipeline components that this component is part of. | +| `sgd` | callable | An optional optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. Will be created via [`EntityLinker`](/api/entitylinker#create_optimizer) if not set. | +| **RETURNS** | callable | An optimizer. | ## EntityLinker.create_optimizer {#create_optimizer tag="method"} @@ -242,7 +247,6 @@ Modify the pipe's EL model, to use the given parameter values. | -------- | ---- | ---------------------------------------------------------------------------------------------------------- | | `params` | dict | The parameter values to use in the model. At the end of the context, the original parameters are restored. | - ## EntityLinker.to_disk {#to_disk tag="method"} Serialize the pipe to disk. @@ -270,11 +274,11 @@ Load the pipe from disk. Modifies the object in place and returns it. > entity_linker.from_disk("/path/to/entity_linker") > ``` -| Name | Type | Description | -| ----------- | ------------------ | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. | +| Name | Type | Description | +| ----------- | ---------------- | -------------------------------------------------------------------------- | +| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. | ## Serialization fields {#serialization-fields} @@ -294,4 +298,3 @@ serialization by passing in the string names via the `exclude` argument. | `cfg` | The config file. You usually don't want to exclude this. | | `model` | The binary model data. 
You usually don't want to exclude this. | | `kb` | The knowledge base. You usually don't want to exclude this. | - diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 46e8b44ee..9a2766c07 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -58,7 +58,7 @@ and all pipeline components are applied to the `Doc` in order. Both > > ```python > ner = EntityRecognizer(nlp.vocab) -> doc = nlp(u"This is a sentence.") +> doc = nlp("This is a sentence.") > # This usually happens under the hood > processed = ner(doc) > ``` @@ -119,11 +119,11 @@ Modify a batch of documents, using pre-computed scores. > ner.set_annotations([doc1, doc2], scores, tensors) > ``` -| Name | Type | Description | -| -------- | -------- | ---------------------------------------------------------- | -| `docs` | iterable | The documents to modify. | -| `scores` | - | The scores to set, produced by `EntityRecognizer.predict`. | -| `tensors`| iterable | The token representations used to predict the scores. | +| Name | Type | Description | +| --------- | -------- | ---------------------------------------------------------- | +| `docs` | iterable | The documents to modify. | +| `scores` | - | The scores to set, produced by `EntityRecognizer.predict`. | +| `tensors` | iterable | The token representations used to predict the scores. | ## EntityRecognizer.update {#update tag="method"} diff --git a/website/docs/api/goldparse.md b/website/docs/api/goldparse.md index db7d07795..2dd24316f 100644 --- a/website/docs/api/goldparse.md +++ b/website/docs/api/goldparse.md @@ -23,7 +23,7 @@ gradient for those labels will be zero. | `deps` | iterable | A sequence of strings, representing the syntactic relation types. | | `entities` | iterable | A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. If BILUO tag strings, you can specify missing values by setting the tag to None. | | `cats` | dict | Labels for text classification. Each key in the dictionary may be a string or an int, or a `(start_char, end_char, label)` tuple, indicating that the label is applied to only part of the document (usually a sentence). | -| `links` | dict | Labels for entity linking. A dict with `(start_char, end_char)` keys, and the values being dicts with `kb_id:value` entries, representing external KB IDs mapped to either 1.0 (positive) or 0.0 (negative). | +| `links` | dict | Labels for entity linking. A dict with `(start_char, end_char)` keys, and the values being dicts with `kb_id:value` entries, representing external KB IDs mapped to either 1.0 (positive) or 0.0 (negative). | | **RETURNS** | `GoldParse` | The newly constructed object. | ## GoldParse.\_\_len\_\_ {#len tag="method"} @@ -69,7 +69,7 @@ Convert a list of Doc objects into the > ```python > from spacy.gold import docs_to_json > -> doc = nlp(u"I like London") +> doc = nlp("I like London") > json_data = docs_to_json([doc]) > ``` @@ -150,7 +150,7 @@ single-token entity. > ```python > from spacy.gold import biluo_tags_from_offsets > -> doc = nlp(u"I like London.") +> doc = nlp("I like London.") > entities = [(7, 13, "LOC")] > tags = biluo_tags_from_offsets(doc, entities) > assert tags == ["O", "O", "U-LOC", "O"] @@ -172,7 +172,7 @@ entity offsets. 
> ```python > from spacy.gold import offsets_from_biluo_tags > -> doc = nlp(u"I like London.") +> doc = nlp("I like London.") > tags = ["O", "O", "U-LOC", "O"] > entities = offsets_from_biluo_tags(doc, tags) > assert entities == [(7, 13, "LOC")] @@ -195,7 +195,7 @@ token-based tags, e.g. to overwrite the `doc.ents`. > ```python > from spacy.gold import spans_from_biluo_tags > -> doc = nlp(u"I like London.") +> doc = nlp("I like London.") > tags = ["O", "O", "U-LOC", "O"] > doc.ents = spans_from_biluo_tags(doc, tags) > ``` diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 9a89d01cc..254ad8fb1 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -45,7 +45,7 @@ contain arbitrary whitespace. Alignment into the original string is preserved. > #### Example > > ```python -> doc = nlp(u"An example sentence. Another sentence.") +> doc = nlp("An example sentence. Another sentence.") > assert (doc[0].text, doc[0].head.tag_) == ("An", "NN") > ``` @@ -61,8 +61,8 @@ Pipeline components to prevent from being loaded can now be added as a list to `disable`, instead of specifying one keyword argument per component. ```diff -- doc = nlp(u"I don't want parsed", parse=False) -+ doc = nlp(u"I don't want parsed", disable=["parser"]) +- doc = nlp("I don't want parsed", parse=False) ++ doc = nlp("I don't want parsed", disable=["parser"]) ``` </Infobox> @@ -86,7 +86,7 @@ multiprocessing. > #### Example > > ```python -> texts = [u"One document.", u"...", u"Lots of documents"] +> texts = ["One document.", "...", "Lots of documents"] > for doc in nlp.pipe(texts, batch_size=50): > assert doc.is_parsed > ``` diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md index 7bc2691e5..fd71d16cf 100644 --- a/website/docs/api/lemmatizer.md +++ b/website/docs/api/lemmatizer.md @@ -37,8 +37,8 @@ Lemmatize a string. > from spacy.lemmatizer import Lemmatizer > from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES > lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES) -> lemmas = lemmatizer(u"ducks", u"NOUN") -> assert lemmas == [u"duck"] +> lemmas = lemmatizer("ducks", "NOUN") +> assert lemmas == ["duck"] > ``` | Name | Type | Description | @@ -58,9 +58,9 @@ variable, set on the individual `Language` class. > #### Example > > ```python -> lookup = {u"going": u"go"} +> lookup = {"going": "go"} > lemmatizer = Lemmatizer(lookup=lookup) -> assert lemmatizer.lookup(u"going") == u"go" +> assert lemmatizer.lookup("going") == "go" > ``` | Name | Type | Description | diff --git a/website/docs/api/lexeme.md b/website/docs/api/lexeme.md index 018dc72d8..398b71708 100644 --- a/website/docs/api/lexeme.md +++ b/website/docs/api/lexeme.md @@ -27,7 +27,7 @@ Change the value of a boolean flag. > > ```python > COOL_FLAG = nlp.vocab.add_flag(lambda text: False) -> nlp.vocab[u'spaCy'].set_flag(COOL_FLAG, True) +> nlp.vocab["spaCy"].set_flag(COOL_FLAG, True) > ``` | Name | Type | Description | @@ -42,9 +42,9 @@ Check the value of a boolean flag. > #### Example > > ```python -> is_my_library = lambda text: text in [u"spaCy", u"Thinc"] +> is_my_library = lambda text: text in ["spaCy", "Thinc"] > MY_LIBRARY = nlp.vocab.add_flag(is_my_library) -> assert nlp.vocab[u"spaCy"].check_flag(MY_LIBRARY) == True +> assert nlp.vocab["spaCy"].check_flag(MY_LIBRARY) == True > ``` | Name | Type | Description | @@ -59,8 +59,8 @@ Compute a semantic similarity estimate. Defaults to cosine over vectors. 
> #### Example > > ```python -> apple = nlp.vocab[u"apple"] -> orange = nlp.vocab[u"orange"] +> apple = nlp.vocab["apple"] +> orange = nlp.vocab["orange"] > apple_orange = apple.similarity(orange) > orange_apple = orange.similarity(apple) > assert apple_orange == orange_apple @@ -78,7 +78,7 @@ A boolean value indicating whether a word vector is associated with the lexeme. > #### Example > > ```python -> apple = nlp.vocab[u"apple"] +> apple = nlp.vocab["apple"] > assert apple.has_vector > ``` @@ -93,7 +93,7 @@ A real-valued meaning representation. > #### Example > > ```python -> apple = nlp.vocab[u"apple"] +> apple = nlp.vocab["apple"] > assert apple.vector.dtype == "float32" > assert apple.vector.shape == (300,) > ``` @@ -109,8 +109,8 @@ The L2 norm of the lexeme's vector representation. > #### Example > > ```python -> apple = nlp.vocab[u"apple"] -> pasta = nlp.vocab[u"pasta"] +> apple = nlp.vocab["apple"] +> pasta = nlp.vocab["pasta"] > apple.vector_norm # 7.1346845626831055 > pasta.vector_norm # 7.759851932525635 > assert apple.vector_norm != pasta.vector_norm diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index fb0ba1617..84d9ed888 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -50,7 +50,7 @@ Find all token sequences matching the supplied patterns on the `Doc`. > matcher = Matcher(nlp.vocab) > pattern = [{"LOWER": "hello"}, {"LOWER": "world"}] > matcher.add("HelloWorld", None, pattern) -> doc = nlp(u'hello world!') +> doc = nlp("hello world!") > matches = matcher(doc) > ``` @@ -147,7 +147,7 @@ overwritten. > matcher = Matcher(nlp.vocab) > matcher.add("HelloWorld", on_match, [{"LOWER": "hello"}, {"LOWER": "world"}]) > matcher.add("GoogleMaps", on_match, [{"ORTH": "Google"}, {"ORTH": "Maps"}]) -> doc = nlp(u"HELLO WORLD on Google Maps.") +> doc = nlp("HELLO WORLD on Google Maps.") > matches = matcher(doc) > ``` diff --git a/website/docs/api/phrasematcher.md b/website/docs/api/phrasematcher.md index c61fa575d..36a412e34 100644 --- a/website/docs/api/phrasematcher.md +++ b/website/docs/api/phrasematcher.md @@ -59,8 +59,8 @@ Find all token sequences matching the supplied patterns on the `Doc`. > from spacy.matcher import PhraseMatcher > > matcher = PhraseMatcher(nlp.vocab) -> matcher.add("OBAMA", None, nlp(u"Barack Obama")) -> doc = nlp(u"Barack Obama lifts America one last time in emotional farewell") +> matcher.add("OBAMA", None, nlp("Barack Obama")) +> doc = nlp("Barack Obama lifts America one last time in emotional farewell") > matches = matcher(doc) > ``` @@ -99,7 +99,7 @@ patterns. > ```python > matcher = PhraseMatcher(nlp.vocab) > assert len(matcher) == 0 -> matcher.add("OBAMA", None, nlp(u"Barack Obama")) +> matcher.add("OBAMA", None, nlp("Barack Obama")) > assert len(matcher) == 1 > ``` @@ -116,7 +116,7 @@ Check whether the matcher contains rules for a match ID. > ```python > matcher = PhraseMatcher(nlp.vocab) > assert "OBAMA" not in matcher -> matcher.add("OBAMA", None, nlp(u"Barack Obama")) +> matcher.add("OBAMA", None, nlp("Barack Obama")) > assert "OBAMA" in matcher > ``` @@ -140,10 +140,10 @@ overwritten. 
> print('Matched!', matches) > > matcher = PhraseMatcher(nlp.vocab) -> matcher.add("OBAMA", on_match, nlp(u"Barack Obama")) -> matcher.add("HEALTH", on_match, nlp(u"health care reform"), -> nlp(u"healthcare reform")) -> doc = nlp(u"Barack Obama urges Congress to find courage to defend his healthcare reforms") +> matcher.add("OBAMA", on_match, nlp("Barack Obama")) +> matcher.add("HEALTH", on_match, nlp("health care reform"), +> nlp("healthcare reform")) +> doc = nlp("Barack Obama urges Congress to find courage to defend his healthcare reforms") > matches = matcher(doc) > ``` diff --git a/website/docs/api/pipeline-functions.md b/website/docs/api/pipeline-functions.md index 63b3cd164..6e2b473b1 100644 --- a/website/docs/api/pipeline-functions.md +++ b/website/docs/api/pipeline-functions.md @@ -17,13 +17,13 @@ the processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe). > #### Example > > ```python -> texts = [t.text for t in nlp(u"I have a blue car")] +> texts = [t.text for t in nlp("I have a blue car")] > assert texts == ["I", "have", "a", "blue", "car"] > > merge_nps = nlp.create_pipe("merge_noun_chunks") > nlp.add_pipe(merge_nps) > -> texts = [t.text for t in nlp(u"I have a blue car")] +> texts = [t.text for t in nlp("I have a blue car")] > assert texts == ["I", "have", "a blue car"] > ``` @@ -50,13 +50,13 @@ the processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe). > #### Example > > ```python -> texts = [t.text for t in nlp(u"I like David Bowie")] +> texts = [t.text for t in nlp("I like David Bowie")] > assert texts == ["I", "like", "David", "Bowie"] > > merge_ents = nlp.create_pipe("merge_entities") > nlp.add_pipe(merge_ents) > -> texts = [t.text for t in nlp(u"I like David Bowie")] +> texts = [t.text for t in nlp("I like David Bowie")] > assert texts == ["I", "like", "David Bowie"] > ``` diff --git a/website/docs/api/sentencizer.md b/website/docs/api/sentencizer.md index 26d205c24..237cd6a8a 100644 --- a/website/docs/api/sentencizer.md +++ b/website/docs/api/sentencizer.md @@ -59,7 +59,7 @@ the component has been added to the pipeline using > nlp = English() > sentencizer = nlp.create_pipe("sentencizer") > nlp.add_pipe(sentencizer) -> doc = nlp(u"This is a sentence. This is another sentence.") +> doc = nlp("This is a sentence. This is another sentence.") > assert list(doc.sents) == 2 > ``` diff --git a/website/docs/api/span.md b/website/docs/api/span.md index 79be81ef8..7e3ce19d0 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -13,13 +13,13 @@ Create a Span object from the slice `doc[start : end]`. > #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > span = doc[1:4] -> assert [t.text for t in span] == [u"it", u"back", u"!"] +> assert [t.text for t in span] == ["it", "back", "!"] > ``` | Name | Type | Description | -| ----------- | ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------| +| ----------- | ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------- | | `doc` | `Doc` | The parent document. | | `start` | int | The index of the first token of the span. | | `end` | int | The index of the first token after the span. | @@ -35,7 +35,7 @@ Get a `Token` object. > #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! 
He pleaded.") > span = doc[1:4] > assert span[1].text == "back" > ``` @@ -50,9 +50,9 @@ Get a `Span` object. > #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > span = doc[1:4] -> assert span[1:3].text == u"back!" +> assert span[1:3].text == "back!" > ``` | Name | Type | Description | @@ -67,9 +67,9 @@ Iterate over `Token` objects. > #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > span = doc[1:4] -> assert [t.text for t in span] == [u"it", u"back", u"!"] +> assert [t.text for t in span] == ["it", "back", "!"] > ``` | Name | Type | Description | @@ -83,7 +83,7 @@ Get the number of tokens in the span. > #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > span = doc[1:4] > assert len(span) == 3 > ``` @@ -102,9 +102,9 @@ For details, see the documentation on > > ```python > from spacy.tokens import Span -> city_getter = lambda span: any(city in span.text for city in (u"New York", u"Paris", u"Berlin")) +> city_getter = lambda span: any(city in span.text for city in ("New York", "Paris", "Berlin")) > Span.set_extension("has_city", getter=city_getter) -> doc = nlp(u"I like New York in Autumn") +> doc = nlp("I like New York in Autumn") > assert doc[1:4]._.has_city > ``` @@ -180,7 +180,7 @@ using an average of word vectors. > #### Example > > ```python -> doc = nlp(u"green apples and red oranges") +> doc = nlp("green apples and red oranges") > green_apples = doc[:2] > red_oranges = doc[3:] > apples_oranges = green_apples.similarity(red_oranges) @@ -202,7 +202,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor. > #### Example > > ```python -> doc = nlp(u"I like New York in Autumn") +> doc = nlp("I like New York in Autumn") > span = doc[1:4] > matrix = span.get_lca_matrix() > # array([[0, 0, 0], [0, 1, 2], [0, 2, 2]], dtype=int32) @@ -222,7 +222,7 @@ shape `(N, M)`, where `N` is the length of the document. The values will be > > ```python > from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA -> doc = nlp(u"I like New York in Autumn.") +> doc = nlp("I like New York in Autumn.") > span = doc[2:3] > # All strings mapped to integers, for easy export to numpy > np_array = span.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA]) @@ -248,11 +248,11 @@ Retokenize the document, such that the span is merged into a single token. > #### Example > > ```python -> doc = nlp(u"I like New York in Autumn.") +> doc = nlp("I like New York in Autumn.") > span = doc[2:4] > span.merge() > assert len(doc) == 6 -> assert doc[2].text == u"New York" +> assert doc[2].text == "New York" > ``` | Name | Type | Description | @@ -268,12 +268,12 @@ if the entity recognizer has been applied. > #### Example > > ```python -> doc = nlp(u"Mr. Best flew to New York on Saturday morning.") +> doc = nlp("Mr. Best flew to New York on Saturday morning.") > span = doc[0:6] > ents = list(span.ents) > assert ents[0].label == 346 > assert ents[0].label_ == "PERSON" -> assert ents[0].text == u"Mr. Best" +> assert ents[0].text == "Mr. Best" > ``` | Name | Type | Description | @@ -287,10 +287,10 @@ Create a new `Doc` object corresponding to the `Span`, with a copy of the data. 
> #### Example > > ```python -> doc = nlp(u"I like New York in Autumn.") +> doc = nlp("I like New York in Autumn.") > span = doc[2:4] > doc2 = span.as_doc() -> assert doc2.text == u"New York" +> assert doc2.text == "New York" > ``` | Name | Type | Description | @@ -306,12 +306,12 @@ taken. > #### Example > > ```python -> doc = nlp(u"I like New York in Autumn.") +> doc = nlp("I like New York in Autumn.") > i, like, new, york, in_, autumn, dot = range(len(doc)) -> assert doc[new].head.text == u"York" -> assert doc[york].head.text == u"like" +> assert doc[new].head.text == "York" +> assert doc[york].head.text == "like" > new_york = doc[new:york+1] -> assert new_york.root.text == u"York" +> assert new_york.root.text == "York" > ``` | Name | Type | Description | @@ -325,9 +325,9 @@ A tuple of tokens coordinated to `span.root`. > #### Example > > ```python -> doc = nlp(u"I like apples and oranges") +> doc = nlp("I like apples and oranges") > apples_conjuncts = doc[2:3].conjuncts -> assert [t.text for t in apples_conjuncts] == [u"oranges"] +> assert [t.text for t in apples_conjuncts] == ["oranges"] > ``` | Name | Type | Description | @@ -341,9 +341,9 @@ Tokens that are to the left of the span, whose heads are within the span. > #### Example > > ```python -> doc = nlp(u"I like New York in Autumn.") +> doc = nlp("I like New York in Autumn.") > lefts = [t.text for t in doc[3:7].lefts] -> assert lefts == [u"New"] +> assert lefts == ["New"] > ``` | Name | Type | Description | @@ -357,9 +357,9 @@ Tokens that are to the right of the span, whose heads are within the span. > #### Example > > ```python -> doc = nlp(u"I like New York in Autumn.") +> doc = nlp("I like New York in Autumn.") > rights = [t.text for t in doc[2:4].rights] -> assert rights == [u"in"] +> assert rights == ["in"] > ``` | Name | Type | Description | @@ -374,7 +374,7 @@ the span. > #### Example > > ```python -> doc = nlp(u"I like New York in Autumn.") +> doc = nlp("I like New York in Autumn.") > assert doc[3:7].n_lefts == 1 > ``` @@ -390,7 +390,7 @@ the span. > #### Example > > ```python -> doc = nlp(u"I like New York in Autumn.") +> doc = nlp("I like New York in Autumn.") > assert doc[2:4].n_rights == 1 > ``` @@ -405,9 +405,9 @@ Tokens within the span and tokens which descend from them. > #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > subtree = [t.text for t in doc[:3].subtree] -> assert subtree == [u"Give", u"it", u"back", u"!"] +> assert subtree == ["Give", "it", "back", "!"] > ``` | Name | Type | Description | @@ -421,7 +421,7 @@ A boolean value indicating whether a word vector is associated with the object. > #### Example > > ```python -> doc = nlp(u"I like apples") +> doc = nlp("I like apples") > assert doc[1:].has_vector > ``` @@ -437,7 +437,7 @@ vectors. > #### Example > > ```python -> doc = nlp(u"I like apples") +> doc = nlp("I like apples") > assert doc[1:].vector.dtype == "float32" > assert doc[1:].vector.shape == (300,) > ``` @@ -453,7 +453,7 @@ The L2 norm of the span's vector representation. > #### Example > > ```python -> doc = nlp(u"I like apples") +> doc = nlp("I like apples") > doc[1:].vector_norm # 4.800883928527915 > doc[2:].vector_norm # 6.895897646384268 > assert doc[1:].vector_norm != doc[2:].vector_norm diff --git a/website/docs/api/stringstore.md b/website/docs/api/stringstore.md index 40d27a62a..268f19125 100644 --- a/website/docs/api/stringstore.md +++ b/website/docs/api/stringstore.md @@ -16,7 +16,7 @@ Create the `StringStore`. 
> > ```python > from spacy.strings import StringStore -> stringstore = StringStore([u"apple", u"orange"]) +> stringstore = StringStore(["apple", "orange"]) > ``` | Name | Type | Description | @@ -31,7 +31,7 @@ Get the number of strings in the store. > #### Example > > ```python -> stringstore = StringStore([u"apple", u"orange"]) +> stringstore = StringStore(["apple", "orange"]) > assert len(stringstore) == 2 > ``` @@ -46,10 +46,10 @@ Retrieve a string from a given hash, or vice versa. > #### Example > > ```python -> stringstore = StringStore([u"apple", u"orange"]) -> apple_hash = stringstore[u"apple"] +> stringstore = StringStore(["apple", "orange"]) +> apple_hash = stringstore["apple"] > assert apple_hash == 8566208034543834098 -> assert stringstore[apple_hash] == u"apple" +> assert stringstore[apple_hash] == "apple" > ``` | Name | Type | Description | @@ -64,9 +64,9 @@ Check whether a string is in the store. > #### Example > > ```python -> stringstore = StringStore([u"apple", u"orange"]) -> assert u"apple" in stringstore -> assert not u"cherry" in stringstore +> stringstore = StringStore(["apple", "orange"]) +> assert "apple" in stringstore +> assert not "cherry" in stringstore > ``` | Name | Type | Description | @@ -82,9 +82,9 @@ store will always include an empty string `''` at position `0`. > #### Example > > ```python -> stringstore = StringStore([u"apple", u"orange"]) +> stringstore = StringStore(["apple", "orange"]) > all_strings = [s for s in stringstore] -> assert all_strings == [u"apple", u"orange"] +> assert all_strings == ["apple", "orange"] > ``` | Name | Type | Description | @@ -98,12 +98,12 @@ Add a string to the `StringStore`. > #### Example > > ```python -> stringstore = StringStore([u"apple", u"orange"]) -> banana_hash = stringstore.add(u"banana") +> stringstore = StringStore(["apple", "orange"]) +> banana_hash = stringstore.add("banana") > assert len(stringstore) == 3 > assert banana_hash == 2525716904149915114 -> assert stringstore[banana_hash] == u"banana" -> assert stringstore[u"banana"] == banana_hash +> assert stringstore[banana_hash] == "banana" +> assert stringstore["banana"] == banana_hash > ``` | Name | Type | Description | @@ -182,7 +182,7 @@ Get a 64-bit hash for a given string. > > ```python > from spacy.strings import hash_string -> assert hash_string(u"apple") == 8566208034543834098 +> assert hash_string("apple") == 8566208034543834098 > ``` | Name | Type | Description | diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index fc6fc67a6..bd3382f89 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -57,7 +57,7 @@ and all pipeline components are applied to the `Doc` in order. Both > > ```python > tagger = Tagger(nlp.vocab) -> doc = nlp(u"This is a sentence.") +> doc = nlp("This is a sentence.") > # This usually happens under the hood > processed = tagger(doc) > ``` @@ -117,12 +117,11 @@ Modify a batch of documents, using pre-computed scores. > tagger.set_annotations([doc1, doc2], scores, tensors) > ``` -| Name | Type | Description | -| -------- | -------- | ----------------------------------------------------- | -| `docs` | iterable | The documents to modify. | -| `scores` | - | The scores to set, produced by `Tagger.predict`. | -| `tensors`| iterable | The token representations used to predict the scores. | - +| Name | Type | Description | +| --------- | -------- | ----------------------------------------------------- | +| `docs` | iterable | The documents to modify. 
| +| `scores` | - | The scores to set, produced by `Tagger.predict`. | +| `tensors` | iterable | The token representations used to predict the scores. | ## Tagger.update {#update tag="method"} diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index f7158541b..1a0280265 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -75,7 +75,7 @@ delegate to the [`predict`](/api/textcategorizer#predict) and > > ```python > textcat = TextCategorizer(nlp.vocab) -> doc = nlp(u"This is a sentence.") +> doc = nlp("This is a sentence.") > # This usually happens under the hood > processed = textcat(doc) > ``` @@ -136,11 +136,11 @@ Modify a batch of documents, using pre-computed scores. > textcat.set_annotations([doc1, doc2], scores, tensors) > ``` -| Name | Type | Description | -| -------- | -------- | --------------------------------------------------------- | -| `docs` | iterable | The documents to modify. | -| `scores` | - | The scores to set, produced by `TextCategorizer.predict`. | -| `tensors`| iterable | The token representations used to predict the scores. | +| Name | Type | Description | +| --------- | -------- | --------------------------------------------------------- | +| `docs` | iterable | The documents to modify. | +| `scores` | - | The scores to set, produced by `TextCategorizer.predict`. | +| `tensors` | iterable | The token representations used to predict the scores. | ## TextCategorizer.update {#update tag="method"} diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 8da13454b..8d7ee5928 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -12,9 +12,9 @@ Construct a `Token` object. > #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > token = doc[0] -> assert token.text == u"Give" +> assert token.text == "Give" > ``` | Name | Type | Description | @@ -31,7 +31,7 @@ The number of unicode characters in the token, i.e. `token.text`. > #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > token = doc[0] > assert len(token) == 4 > ``` @@ -50,9 +50,9 @@ For details, see the documentation on > > ```python > from spacy.tokens import Token -> fruit_getter = lambda token: token.text in (u"apple", u"pear", u"banana") +> fruit_getter = lambda token: token.text in ("apple", "pear", "banana") > Token.set_extension("is_fruit", getter=fruit_getter) -> doc = nlp(u"I have an apple") +> doc = nlp("I have an apple") > assert doc[3]._.is_fruit > ``` @@ -128,7 +128,7 @@ Check the value of a boolean flag. > > ```python > from spacy.attrs import IS_TITLE -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > token = doc[0] > assert token.check_flag(IS_TITLE) == True > ``` @@ -145,7 +145,7 @@ Compute a semantic similarity estimate. Defaults to cosine over vectors. > #### Example > > ```python -> apples, _, oranges = nlp(u"apples and oranges") +> apples, _, oranges = nlp("apples and oranges") > apples_oranges = apples.similarity(oranges) > oranges_apples = oranges.similarity(apples) > assert apples_oranges == oranges_apples @@ -163,9 +163,9 @@ Get a neighboring token. > #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > give_nbor = doc[0].nbor() -> assert give_nbor.text == u"it" +> assert give_nbor.text == "it" > ``` | Name | Type | Description | @@ -181,7 +181,7 @@ dependency tree. 
> #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > give = doc[0] > it = doc[1] > assert give.is_ancestor(it) @@ -199,11 +199,11 @@ The rightmost token of this token's syntactic descendants. > #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > it_ancestors = doc[1].ancestors -> assert [t.text for t in it_ancestors] == [u"Give"] +> assert [t.text for t in it_ancestors] == ["Give"] > he_ancestors = doc[4].ancestors -> assert [t.text for t in he_ancestors] == [u"pleaded"] +> assert [t.text for t in he_ancestors] == ["pleaded"] > ``` | Name | Type | Description | @@ -217,9 +217,9 @@ A tuple of coordinated tokens, not including the token itself. > #### Example > > ```python -> doc = nlp(u"I like apples and oranges") +> doc = nlp("I like apples and oranges") > apples_conjuncts = doc[2].conjuncts -> assert [t.text for t in apples_conjuncts] == [u"oranges"] +> assert [t.text for t in apples_conjuncts] == ["oranges"] > ``` | Name | Type | Description | @@ -233,9 +233,9 @@ A sequence of the token's immediate syntactic children. > #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > give_children = doc[0].children -> assert [t.text for t in give_children] == [u"it", u"back", u"!"] +> assert [t.text for t in give_children] == ["it", "back", "!"] > ``` | Name | Type | Description | @@ -249,9 +249,9 @@ The leftward immediate children of the word, in the syntactic dependency parse. > #### Example > > ```python -> doc = nlp(u"I like New York in Autumn.") +> doc = nlp("I like New York in Autumn.") > lefts = [t.text for t in doc[3].lefts] -> assert lefts == [u'New'] +> assert lefts == ["New"] > ``` | Name | Type | Description | @@ -265,9 +265,9 @@ The rightward immediate children of the word, in the syntactic dependency parse. > #### Example > > ```python -> doc = nlp(u"I like New York in Autumn.") +> doc = nlp("I like New York in Autumn.") > rights = [t.text for t in doc[3].rights] -> assert rights == [u"in"] +> assert rights == ["in"] > ``` | Name | Type | Description | @@ -282,7 +282,7 @@ dependency parse. > #### Example > > ```python -> doc = nlp(u"I like New York in Autumn.") +> doc = nlp("I like New York in Autumn.") > assert doc[3].n_lefts == 1 > ``` @@ -298,7 +298,7 @@ dependency parse. > #### Example > > ```python -> doc = nlp(u"I like New York in Autumn.") +> doc = nlp("I like New York in Autumn.") > assert doc[3].n_rights == 1 > ``` @@ -313,9 +313,9 @@ A sequence containing the token and all the token's syntactic descendants. > #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > give_subtree = doc[0].subtree -> assert [t.text for t in give_subtree] == [u"Give", u"it", u"back", u"!"] +> assert [t.text for t in give_subtree] == ["Give", "it", "back", "!"] > ``` | Name | Type | Description | @@ -330,7 +330,7 @@ unknown. Defaults to `True` for the first token in the `Doc`. > #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > assert doc[4].is_sent_start > assert not doc[5].is_sent_start > ``` @@ -361,7 +361,7 @@ A boolean value indicating whether a word vector is associated with the token. > #### Example > > ```python -> doc = nlp(u"I like apples") +> doc = nlp("I like apples") > apples = doc[2] > assert apples.has_vector > ``` @@ -377,7 +377,7 @@ A real-valued meaning representation. 
> #### Example > > ```python -> doc = nlp(u"I like apples") +> doc = nlp("I like apples") > apples = doc[2] > assert apples.vector.dtype == "float32" > assert apples.vector.shape == (300,) @@ -394,7 +394,7 @@ The L2 norm of the token's vector representation. > #### Example > > ```python -> doc = nlp(u"I like apples and pasta") +> doc = nlp("I like apples and pasta") > apples = doc[2] > pasta = doc[4] > apples.vector_norm # 6.89589786529541 @@ -425,7 +425,7 @@ The L2 norm of the token's vector representation. | `i` | int | The index of the token within the parent document. | | `ent_type` | int | Named entity type. | | `ent_type_` | unicode | Named entity type. | -| `ent_iob` | int | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. | +| `ent_iob` | int | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. | | `ent_iob_` | unicode | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. | | `ent_kb_id` <Tag variant="new">2.2</Tag> | int | Knowledge base ID that refers to the named entity this token is a part of, if any. | | `ent_kb_id_` <Tag variant="new">2.2</Tag> | unicode | Knowledge base ID that refers to the named entity this token is a part of, if any. | diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index ce1ba9a21..63c1e87ea 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -5,7 +5,9 @@ tag: class source: spacy/tokenizer.pyx --- -Segment text, and create `Doc` objects with the discovered segment boundaries. For a deeper understanding, see the docs on [how spaCy's tokenizer works](/usage/linguistic-features#how-tokenizer-works). +Segment text, and create `Doc` objects with the discovered segment boundaries. +For a deeper understanding, see the docs on +[how spaCy's tokenizer works](/usage/linguistic-features#how-tokenizer-works). ## Tokenizer.\_\_init\_\_ {#init tag="method"} @@ -49,7 +51,7 @@ Tokenize a string. > #### Example > > ```python -> tokens = tokenizer(u"This is a sentence") +> tokens = tokenizer("This is a sentence") > assert len(tokens) == 4 > ``` @@ -65,7 +67,7 @@ Tokenize a stream of texts. > #### Example > > ```python -> texts = [u"One document.", u"...", u"Lots of documents"] +> texts = ["One document.", "...", "Lots of documents"] > for doc in tokenizer.pipe(texts, batch_size=50): > pass > ``` @@ -109,8 +111,9 @@ if no suffix rules match. Add a special-case tokenization rule. This mechanism is also used to add custom tokenizer exceptions to the language data. See the usage guide on -[adding languages](/usage/adding-languages#tokenizer-exceptions) and [linguistic features](/usage/linguistic-features#special-cases) for more -details and examples. +[adding languages](/usage/adding-languages#tokenizer-exceptions) and +[linguistic features](/usage/linguistic-features#special-cases) for more details +and examples. 
> #### Example > diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index e9bf48869..0a8f638b2 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -112,10 +112,10 @@ list of available terms, see > #### Example > > ```python -> spacy.explain(u"NORP") +> spacy.explain("NORP") > # Nationalities or religious or political groups > -> doc = nlp(u"Hello world") +> doc = nlp("Hello world") > for word in doc: > print(word.text, word.tag_, spacy.explain(word.tag_)) > # Hello UH interjection @@ -181,8 +181,8 @@ browser. Will run a simple web server. > import spacy > from spacy import displacy > nlp = spacy.load("en_core_web_sm") -> doc1 = nlp(u"This is a sentence.") -> doc2 = nlp(u"This is another sentence.") +> doc1 = nlp("This is a sentence.") +> doc2 = nlp("This is another sentence.") > displacy.serve([doc1, doc2], style="dep") > ``` @@ -192,7 +192,7 @@ browser. Will run a simple web server. | `style` | unicode | Visualization style, `'dep'` or `'ent'`. | `'dep'` | | `page` | bool | Render markup as full HTML page. | `True` | | `minify` | bool | Minify HTML markup. | `False` | -| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` | +| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` | | `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` | | `port` | int | Port to serve visualization. | `5000` | | `host` | unicode | Host to serve visualization. | `'0.0.0.0'` | @@ -207,7 +207,7 @@ Render a dependency parse tree or named entity visualization. > import spacy > from spacy import displacy > nlp = spacy.load("en_core_web_sm") -> doc = nlp(u"This is a sentence.") +> doc = nlp("This is a sentence.") > html = displacy.render(doc, style="dep") > ``` @@ -218,7 +218,7 @@ Render a dependency parse tree or named entity visualization. | `page` | bool | Render markup as full HTML page. | `False` | | `minify` | bool | Minify HTML markup. | `False` | | `jupyter` | bool | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None`. | `None` | -| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` | +| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` | | `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` | | **RETURNS** | unicode | Rendered HTML markup. | @@ -262,16 +262,18 @@ If a setting is not present in the options, the default value will be used. > displacy.serve(doc, style="ent", options=options) > ``` -| Name | Type | Description | Default | -| -------- | ---- | ------------------------------------------------------------------------------------- | ------- | -| `ents` | list | Entity types to highlight (`None` for all types). | `None` | -| `colors` | dict | Color overrides. Entity types in uppercase should be mapped to color names or values. 
| `{}` | +| Name | Type | Description | Default | +| --------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------ | +| `ents` | list | Entity types to highlight (`None` for all types). | `None` | +| `colors` | dict | Color overrides. Entity types in uppercase should be mapped to color names or values. | `{}` | | `template` <Tag variant="new">2.2</Tag> | unicode | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. | see [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) | By default, displaCy comes with colors for all [entity types supported by spaCy](/api/annotation#named-entities). If you're using custom entity types, you can use the `colors` setting to add your own -colors for them. Your application or model package can also expose a [`spacy_displacy_colors` entry point](/usage/saving-loading#entry-points-displacy) to add custom labels and their colors automatically. +colors for them. Your application or model package can also expose a +[`spacy_displacy_colors` entry point](/usage/saving-loading#entry-points-displacy) +to add custom labels and their colors automatically. ## Utility functions {#util source="spacy/util.py"} @@ -649,11 +651,11 @@ for batching. Larger `bufsize` means less bias. > shuffled = itershuffle(values) > ``` -| Name | Type | Description | -| ---------- | -------- | ------------------------------------- | -| `iterable` | iterable | Iterator to shuffle. | -| `bufsize` | int | Items to hold back (default: 1000). | -| **YIELDS** | iterable | The shuffled iterator. | +| Name | Type | Description | +| ---------- | -------- | ----------------------------------- | +| `iterable` | iterable | Iterator to shuffle. | +| `bufsize` | int | Items to hold back (default: 1000). | +| **YIELDS** | iterable | The shuffled iterator. | ### util.filter_spans {#util.filter_spans tag="function" new="2.1.4"} diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md index ffc1fc083..bfe0e5f3f 100644 --- a/website/docs/api/vectors.md +++ b/website/docs/api/vectors.md @@ -26,7 +26,7 @@ you can add vectors to later. > empty_vectors = Vectors(shape=(10000, 300)) > > data = numpy.zeros((3, 300), dtype='f') -> keys = [u"cat", u"dog", u"rat"] +> keys = ["cat", "dog", "rat"] > vectors = Vectors(data=data, keys=keys) > ``` @@ -45,9 +45,9 @@ raised. > #### Example > > ```python -> cat_id = nlp.vocab.strings[u"cat"] +> cat_id = nlp.vocab.strings["cat"] > cat_vector = nlp.vocab.vectors[cat_id] -> assert cat_vector == nlp.vocab[u"cat"].vector +> assert cat_vector == nlp.vocab["cat"].vector > ``` | Name | Type | Description | @@ -62,7 +62,7 @@ Set a vector for the given key. > #### Example > > ```python -> cat_id = nlp.vocab.strings[u"cat"] +> cat_id = nlp.vocab.strings["cat"] > vector = numpy.random.uniform(-1, 1, (300,)) > nlp.vocab.vectors[cat_id] = vector > ``` @@ -109,7 +109,7 @@ Check whether a key has been mapped to a vector entry in the table. > #### Example > > ```python -> cat_id = nlp.vocab.strings[u"cat"] +> cat_id = nlp.vocab.strings["cat"] > nlp.vectors.add(cat_id, numpy.random.uniform(-1, 1, (300,))) > assert cat_id in vectors > ``` @@ -132,9 +132,9 @@ mapping separately. 
If you need to manage the strings, you should use the > > ```python > vector = numpy.random.uniform(-1, 1, (300,)) -> cat_id = nlp.vocab.strings[u"cat"] +> cat_id = nlp.vocab.strings["cat"] > nlp.vocab.vectors.add(cat_id, vector=vector) -> nlp.vocab.vectors.add(u"dog", row=0) +> nlp.vocab.vectors.add("dog", row=0) > ``` | Name | Type | Description | @@ -218,8 +218,8 @@ Look up one or more keys by row, or vice versa. > #### Example > > ```python -> row = nlp.vocab.vectors.find(key=u"cat") -> rows = nlp.vocab.vectors.find(keys=[u"cat", u"dog"]) +> row = nlp.vocab.vectors.find(key="cat") +> rows = nlp.vocab.vectors.find(keys=["cat", "dog"]) > key = nlp.vocab.vectors.find(row=256) > keys = nlp.vocab.vectors.find(rows=[18, 256, 985]) > ``` @@ -241,7 +241,7 @@ vector table. > > ```python > vectors = Vectors(shape(1, 300)) -> vectors.add(u"cat", numpy.random.uniform(-1, 1, (300,))) +> vectors.add("cat", numpy.random.uniform(-1, 1, (300,))) > rows, dims = vectors.shape > assert rows == 1 > assert dims == 300 @@ -276,7 +276,7 @@ If a table is full, it can be resized using > > ```python > vectors = Vectors(shape=(1, 300)) -> vectors.add(u"cat", numpy.random.uniform(-1, 1, (300,))) +> vectors.add("cat", numpy.random.uniform(-1, 1, (300,))) > assert vectors.is_full > ``` diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md index 22bfe324e..78e5f7541 100644 --- a/website/docs/api/vocab.md +++ b/website/docs/api/vocab.md @@ -18,7 +18,7 @@ Create the vocabulary. > > ```python > from spacy.vocab import Vocab -> vocab = Vocab(strings=[u"hello", u"world"]) +> vocab = Vocab(strings=["hello", "world"]) > ``` | Name | Type | Description | @@ -36,7 +36,7 @@ Get the current number of lexemes in the vocabulary. > #### Example > > ```python -> doc = nlp(u"This is a sentence.") +> doc = nlp("This is a sentence.") > assert len(nlp.vocab) > 0 > ``` @@ -52,8 +52,8 @@ unicode string is given, a new lexeme is created and stored. > #### Example > > ```python -> apple = nlp.vocab.strings[u"apple"] -> assert nlp.vocab[apple] == nlp.vocab[u"apple"] +> apple = nlp.vocab.strings["apple"] +> assert nlp.vocab[apple] == nlp.vocab["apple"] > ``` | Name | Type | Description | @@ -84,8 +84,8 @@ given string, you need to look it up in > #### Example > > ```python -> apple = nlp.vocab.strings[u"apple"] -> oov = nlp.vocab.strings[u"dskfodkfos"] +> apple = nlp.vocab.strings["apple"] +> oov = nlp.vocab.strings["dskfodkfos"] > assert apple in nlp.vocab > assert oov not in nlp.vocab > ``` @@ -106,11 +106,11 @@ using `token.check_flag(flag_id)`. > > ```python > def is_my_product(text): -> products = [u"spaCy", u"Thinc", u"displaCy"] +> products = ["spaCy", "Thinc", "displaCy"] > return text in products > > MY_PRODUCT = nlp.vocab.add_flag(is_my_product) -> doc = nlp(u"I like spaCy") +> doc = nlp("I like spaCy") > assert doc[2].check_flag(MY_PRODUCT) == True > ``` @@ -170,7 +170,7 @@ or hash value. If no vectors data is loaded, a `ValueError` is raised. > #### Example > > ```python -> nlp.vocab.get_vector(u"apple") +> nlp.vocab.get_vector("apple") > ``` | Name | Type | Description | @@ -186,7 +186,7 @@ or hash value. > #### Example > > ```python -> nlp.vocab.set_vector(u"apple", array([...])) +> nlp.vocab.set_vector("apple", array([...])) > ``` | Name | Type | Description | @@ -202,8 +202,8 @@ Words can be looked up by string or hash value. 
> #### Example > > ```python -> if nlp.vocab.has_vector(u"apple"): -> vector = nlp.vocab.get_vector(u"apple") +> if nlp.vocab.has_vector("apple"): +> vector = nlp.vocab.get_vector("apple") > ``` | Name | Type | Description | @@ -282,9 +282,9 @@ Load state from a binary string. > #### Example > > ```python -> apple_id = nlp.vocab.strings[u"apple"] +> apple_id = nlp.vocab.strings["apple"] > assert type(apple_id) == int -> PERSON = nlp.vocab.strings[u"PERSON"] +> PERSON = nlp.vocab.strings["PERSON"] > assert type(PERSON) == int > ``` diff --git a/website/docs/usage/101/_named-entities.md b/website/docs/usage/101/_named-entities.md index a282ec370..1ecaf9fe7 100644 --- a/website/docs/usage/101/_named-entities.md +++ b/website/docs/usage/101/_named-entities.md @@ -1,5 +1,5 @@ A named entity is a "real-world object" that's assigned a name – for example, a -person, a country, a product or a book title. spaCy can **recognize** +person, a country, a product or a book title. spaCy can **recognize** [various types](/api/annotation#named-entities) of named entities in a document, by asking the model for a **prediction**. Because models are statistical and strongly depend on the examples they were trained on, this doesn't always work @@ -12,7 +12,7 @@ Named entities are available as the `ents` property of a `Doc`: import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion") +doc = nlp("Apple is looking at buying U.K. startup for $1 billion") for ent in doc.ents: print(ent.text, ent.start_char, ent.end_char, ent.label_) @@ -23,10 +23,10 @@ for ent in doc.ents: > - **End:** Index of end of entity in the `Doc`. > - **Label:** Entity label, i.e. type. -| Text | Start | End | Label | Description | -| ----------- | :---: | :-: | ------- | ---------------------------------------------------- | -| Apple | 0 | 5 | `ORG` | Companies, agencies, institutions. | -| U.K. | 27 | 31 | `GPE` | Geopolitical entity, i.e. countries, cities, states. | +| Text | Start | End | Label | Description | +| ----------- | :---: | :-: | ------- | ---------------------------------------------------- | +| Apple | 0 | 5 | `ORG` | Companies, agencies, institutions. | +| U.K. | 27 | 31 | `GPE` | Geopolitical entity, i.e. countries, cities, states. | | \$1 billion | 44 | 54 | `MONEY` | Monetary values, including unit. | Using spaCy's built-in [displaCy visualizer](/usage/visualizers), here's what diff --git a/website/docs/usage/101/_pos-deps.md b/website/docs/usage/101/_pos-deps.md index d86ee123d..b0e2b33b8 100644 --- a/website/docs/usage/101/_pos-deps.md +++ b/website/docs/usage/101/_pos-deps.md @@ -15,8 +15,8 @@ need to add an underscore `_` to its name: ### {executable="true"} import spacy -nlp = spacy.load('en_core_web_sm') -doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion') +nlp = spacy.load("en_core_web_sm") +doc = nlp("Apple is looking at buying U.K. startup for $1 billion") for token in doc: print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, diff --git a/website/docs/usage/101/_tokenization.md b/website/docs/usage/101/_tokenization.md index e5f3d3080..764f1e62a 100644 --- a/website/docs/usage/101/_tokenization.md +++ b/website/docs/usage/101/_tokenization.md @@ -9,7 +9,7 @@ tokens, and we can iterate over them: import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion") +doc = nlp("Apple is looking at buying U.K. 
startup for $1 billion") for token in doc: print(token.text) ``` diff --git a/website/docs/usage/101/_vectors-similarity.md b/website/docs/usage/101/_vectors-similarity.md index 2001d1481..73c35950f 100644 --- a/website/docs/usage/101/_vectors-similarity.md +++ b/website/docs/usage/101/_vectors-similarity.md @@ -48,8 +48,8 @@ norm, which can be used to normalize vectors. ### {executable="true"} import spacy -nlp = spacy.load('en_core_web_md') -tokens = nlp(u'dog cat banana afskfsd') +nlp = spacy.load("en_core_web_md") +tokens = nlp("dog cat banana afskfsd") for token in tokens: print(token.text, token.has_vector, token.vector_norm, token.is_oov) @@ -88,8 +88,8 @@ definition of similarity. ### {executable="true"} import spacy -nlp = spacy.load('en_core_web_md') # make sure to use larger model! -tokens = nlp(u'dog cat banana') +nlp = spacy.load("en_core_web_md") # make sure to use larger model! +tokens = nlp("dog cat banana") for token1 in tokens: for token2 in tokens: diff --git a/website/docs/usage/adding-languages.md b/website/docs/usage/adding-languages.md index 6f8955326..d89891297 100644 --- a/website/docs/usage/adding-languages.md +++ b/website/docs/usage/adding-languages.md @@ -276,7 +276,7 @@ the lowercase spelling of a word exists, norms should always be in lowercase. > #### Norms vs. lemmas > > ```python -> doc = nlp(u"I'm gonna realise") +> doc = nlp("I'm gonna realise") > norms = [token.norm_ for token in doc] > lemmas = [token.lemma_ for token in doc] > assert norms == ["i", "am", "going", "to", "realize"] @@ -396,10 +396,10 @@ iterators: > #### Noun chunks example > > ```python -> doc = nlp(u"A phrase with another phrase occurs.") +> doc = nlp("A phrase with another phrase occurs.") > chunks = list(doc.noun_chunks) -> assert chunks[0].text == u"A phrase" -> assert chunks[1].text == u"another phrase" +> assert chunks[0].text == "A phrase" +> assert chunks[1].text == "another phrase" > ``` | Language | Code | Source | diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md index 1ffd0de0d..1d6c0574c 100644 --- a/website/docs/usage/index.md +++ b/website/docs/usage/index.md @@ -392,7 +392,7 @@ from is called `spacy`. So, when using spaCy, never call anything else `spacy`. <Accordion title="Pronoun lemma is returned as -PRON-" id="pron-lemma"> ```python -doc = nlp(u"They are") +doc = nlp("They are") print(doc[0].lemma_) # -PRON- ``` diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index fc1f159ce..a91135d70 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -69,7 +69,6 @@ of the two. The system works as follows: morphological information, without consulting the context of the token. The lemmatizer also accepts list-based exception files, acquired from [WordNet](https://wordnet.princeton.edu/). - ## Dependency Parsing {#dependency-parse model="parser"} @@ -93,7 +92,7 @@ get the noun chunks in a document, simply iterate over import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers") +doc = nlp("Autonomous cars shift insurance liability toward manufacturers") for chunk in doc.noun_chunks: print(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text) @@ -124,7 +123,7 @@ get the string value with `.dep_`. 
import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers") +doc = nlp("Autonomous cars shift insurance liability toward manufacturers") for token in doc: print(token.text, token.dep_, token.head.text, token.head.pos_, [child for child in token.children]) @@ -161,7 +160,7 @@ import spacy from spacy.symbols import nsubj, VERB nlp = spacy.load("en_core_web_sm") -doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers") +doc = nlp("Autonomous cars shift insurance liability toward manufacturers") # Finding a verb with a subject from below — good verbs = set() @@ -204,7 +203,7 @@ children. import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"bright red apples on the tree") +doc = nlp("bright red apples on the tree") print([token.text for token in doc[2].lefts]) # ['bright', 'red'] print([token.text for token in doc[2].rights]) # ['on'] print(doc[2].n_lefts) # 2 @@ -216,7 +215,7 @@ print(doc[2].n_rights) # 1 import spacy nlp = spacy.load("de_core_news_sm") -doc = nlp(u"schöne rote Äpfel auf dem Baum") +doc = nlp("schöne rote Äpfel auf dem Baum") print([token.text for token in doc[2].lefts]) # ['schöne', 'rote'] print([token.text for token in doc[2].rights]) # ['auf'] ``` @@ -240,7 +239,7 @@ sequence of tokens. You can walk up the tree with the import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"Credit and mortgage account holders must submit their requests") +doc = nlp("Credit and mortgage account holders must submit their requests") root = [token for token in doc if token.head == token][0] subject = list(root.lefts)[0] @@ -270,7 +269,7 @@ end-point of a range, don't forget to `+1`! import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"Credit and mortgage account holders must submit their requests") +doc = nlp("Credit and mortgage account holders must submit their requests") span = doc[doc[4].left_edge.i : doc[4].right_edge.i+1] with doc.retokenize() as retokenizer: retokenizer.merge(span) @@ -311,7 +310,7 @@ import spacy from spacy import displacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers") +doc = nlp("Autonomous cars shift insurance liability toward manufacturers") # Since this is an interactive Jupyter environment, we can use displacy.render here displacy.render(doc, style='dep') ``` @@ -336,7 +335,7 @@ the `nlp` object. ```python nlp = spacy.load("en_core_web_sm", disable=["parser"]) nlp = English().from_disk("/model", disable=["parser"]) -doc = nlp(u"I don't want parsed", disable=["parser"]) +doc = nlp("I don't want parsed", disable=["parser"]) ``` <Infobox title="Important note: disabling pipeline components" variant="warning"> @@ -350,10 +349,10 @@ Language class via [`from_disk`](/api/language#from_disk). ```diff + nlp = spacy.load("en_core_web_sm", disable=["parser"]) -+ doc = nlp(u"I don't want parsed", disable=["parser"]) ++ doc = nlp("I don't want parsed", disable=["parser"]) - nlp = spacy.load("en_core_web_sm", parser=False) -- doc = nlp(u"I don't want parsed", parse=False) +- doc = nlp("I don't want parsed", parse=False) ``` </Infobox> @@ -398,7 +397,7 @@ on a token, it will return an empty string. 
import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"San Francisco considers banning sidewalk delivery robots") +doc = nlp("San Francisco considers banning sidewalk delivery robots") # document level ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents] @@ -407,8 +406,8 @@ print(ents) # token level ent_san = [doc[0].text, doc[0].ent_iob_, doc[0].ent_type_] ent_francisco = [doc[1].text, doc[1].ent_iob_, doc[1].ent_type_] -print(ent_san) # [u'San', u'B', u'GPE'] -print(ent_francisco) # [u'Francisco', u'I', u'GPE'] +print(ent_san) # ['San', 'B', 'GPE'] +print(ent_francisco) # ['Francisco', 'I', 'GPE'] ``` | Text | ent_iob | ent_iob\_ | ent_type\_ | Description | @@ -435,18 +434,17 @@ import spacy from spacy.tokens import Span nlp = spacy.load("en_core_web_sm") -doc = nlp(u"FB is hiring a new Vice President of global policy") +doc = nlp("FB is hiring a new Vice President of global policy") ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents] print('Before', ents) # the model didn't recognise "FB" as an entity :( -ORG = doc.vocab.strings[u"ORG"] # get hash value of entity label -fb_ent = Span(doc, 0, 1, label=ORG) # create a Span for the new entity +fb_ent = Span(doc, 0, 1, label="ORG") # create a Span for the new entity doc.ents = list(doc.ents) + [fb_ent] ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents] print('After', ents) -# [(u'FB', 0, 2, 'ORG')] 🎉 +# [('FB', 0, 2, 'ORG')] 🎉 ``` Keep in mind that you need to create a `Span` with the start and end index of @@ -468,13 +466,13 @@ import spacy from spacy.attrs import ENT_IOB, ENT_TYPE nlp = spacy.load("en_core_web_sm") -doc = nlp.make_doc(u"London is a big city in the United Kingdom.") +doc = nlp.make_doc("London is a big city in the United Kingdom.") print("Before", doc.ents) # [] header = [ENT_IOB, ENT_TYPE] attr_array = numpy.zeros((len(doc), len(header))) attr_array[0, 0] = 3 # B -attr_array[0, 1] = doc.vocab.strings[u"GPE"] +attr_array[0, 1] = doc.vocab.strings["GPE"] doc.from_array(header, attr_array) print("After", doc.ents) # [London] ``` @@ -533,8 +531,8 @@ train_data = [ ``` ```python -doc = Doc(nlp.vocab, [u"rats", u"make", u"good", u"pets"]) -gold = GoldParse(doc, entities=[u"U-ANIMAL", u"O", u"O", u"O"]) +doc = Doc(nlp.vocab, ["rats", "make", "good", "pets"]) +gold = GoldParse(doc, entities=["U-ANIMAL", "O", "O", "O"]) ``` <Infobox> @@ -565,7 +563,7 @@ For more details and examples, see the import spacy from spacy import displacy -text = u"When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously." +text = "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously." nlp = spacy.load("en_core_web_sm") doc = nlp(text) @@ -578,29 +576,27 @@ import DisplacyEntHtml from 'images/displacy-ent2.html' ## Entity Linking {#entity-linking} -To ground the named entities into the "real-world", -spaCy provides functionality to perform entity linking, which resolves a textual entity -to a unique identifier from a knowledge base (KB). - -The default model assigns WikiData identifiers, but you can create your own -[`KnowledgeBase`](/api/kb) and [train a new Entity Linking model](/usage/training#entity-linker) using -that custom-made KB. +To ground the named entities into the "real-world", spaCy provides functionality +to perform entity linking, which resolves a textual entity to a unique +identifier from a knowledge base (KB). 
+The default model assigns WikiData identifiers, but you can create your own +[`KnowledgeBase`](/api/kb) and +[train a new Entity Linking model](/usage/training#entity-linker) using that +custom-made KB. -### Accessing entity identifiers {#accessing} - -The annotated KB identifier is accessible as either a hash value -or as a string, using the attributes -`ent.kb_id` and `ent.kb_id_` of a [`Span`](/api/span) object, -or the `ent_kb_id` and `ent_kb_id_` attributes of a [`Token`](/api/token) object. +### Accessing entity identifiers {#entity-linking-accessing} +The annotated KB identifier is accessible as either a hash value or as a string, +using the attributes `ent.kb_id` and `ent.kb_id_` of a [`Span`](/api/span) +object, or the `ent_kb_id` and `ent_kb_id_` attributes of a +[`Token`](/api/token) object. ```python -### {executable="true"} import spacy nlp = spacy.load("my_custom_el_model") -doc = nlp(u"Ada Lovelace was born in London") +doc = nlp("Ada Lovelace was born in London") # document level ents = [(e.text, e.label_, e.kb_id_) for e in doc.ents] @@ -615,14 +611,14 @@ print(ent_ada_1) # ['Lovelace', 'PERSON', 'Q7259'] print(ent_london_5) # ['London', 'GPE', 'Q84'] ``` -| Text | ent_type\_ | ent_kb_id\_ | -| --------- | ---------- | ------------ | -| Ada | `"PERSON"` | `"Q7259"` | -| Lovelace | `"PERSON"` | `"Q7259"` | -| was | `""` | `""` | -| born | `""` | `""` | -| in | `""` | `""` | -| London | `"GPE"` | `"Q84"` | +| Text | ent_type\_ | ent_kb_id\_ | +| -------- | ---------- | ----------- | +| Ada | `"PERSON"` | `"Q7259"` | +| Lovelace | `"PERSON"` | `"Q7259"` | +| was | - | - | +| born | - | - | +| in | - | - | +| London | `"GPE"` | `"Q84"` | ## Tokenization {#tokenization} @@ -692,53 +688,36 @@ this specific field. Here's how to add a special case rule to an existing ```python ### {executable="true"} import spacy -from spacy.symbols import ORTH, LEMMA, POS, TAG +from spacy.symbols import ORTH nlp = spacy.load("en_core_web_sm") -doc = nlp(u"gimme that") # phrase to tokenize +doc = nlp("gimme that") # phrase to tokenize print([w.text for w in doc]) # ['gimme', 'that'] -# add special case rule -special_case = [{ORTH: u"gim", LEMMA: u"give", POS: u"VERB"}, {ORTH: u"me"}] -nlp.tokenizer.add_special_case(u"gimme", special_case) +# Add special case rule +special_case = [{ORTH: "gim"}, {ORTH: "me"}] +nlp.tokenizer.add_special_case("gimme", special_case) -# check new tokenization -print([w.text for w in nlp(u"gimme that")]) # ['gim', 'me', 'that'] - -# Pronoun lemma is returned as -PRON-! -print([w.lemma_ for w in nlp(u"gimme that")]) # ['give', '-PRON-', 'that'] +# Check new tokenization +print([w.text for w in nlp("gimme that")]) # ['gim', 'me', 'that'] ``` -<Infobox title="Why -PRON-?" variant="warning"> - -For details on spaCy's custom pronoun lemma `-PRON-`, -[see here](/usage/#pron-lemma). - -</Infobox> - The special case doesn't have to match an entire whitespace-delimited substring. 
The tokenizer will incrementally split off punctuation, and keep looking up the remaining substring: ```python -assert "gimme" not in [w.text for w in nlp(u"gimme!")] -assert "gimme" not in [w.text for w in nlp(u'("...gimme...?")')] +assert "gimme" not in [w.text for w in nlp("gimme!")] +assert "gimme" not in [w.text for w in nlp('("...gimme...?")')] ``` The special case rules have precedence over the punctuation splitting: ```python -special_case = [{ORTH: u"...gimme...?", LEMMA: u"give", TAG: u"VB"}] -nlp.tokenizer.add_special_case(u"...gimme...?", special_case) -assert len(nlp(u"...gimme...?")) == 1 +nlp.tokenizer.add_special_case("...gimme...?", [{ORTH: "...gimme...?"}]) +assert len(nlp("...gimme...?")) == 1 ``` -Because the special-case rules allow you to set arbitrary token attributes, such -as the part-of-speech, lemma, etc, they make a good mechanism for arbitrary -fix-up rules. Having this logic live in the tokenizer isn't very satisfying from -a design perspective, however, so the API may eventually be exposed on the -[`Language`](/api/language) class itself. - ### How spaCy's tokenizer works {#how-tokenizer-works} spaCy introduces a novel tokenization algorithm, that gives a better balance @@ -838,7 +817,7 @@ def custom_tokenizer(nlp): nlp = spacy.load("en_core_web_sm") nlp.tokenizer = custom_tokenizer(nlp) -doc = nlp(u"hello-world.") +doc = nlp("hello-world.") print([t.text for t in doc]) ``` @@ -955,7 +934,7 @@ class WhitespaceTokenizer(object): nlp = spacy.load("en_core_web_sm") nlp.tokenizer = WhitespaceTokenizer(nlp.vocab) -doc = nlp(u"What's happened to me? he thought. It wasn't a dream.") +doc = nlp("What's happened to me? he thought. It wasn't a dream.") print([t.text for t in doc]) ``` @@ -980,7 +959,7 @@ from spacy.tokens import Doc from spacy.lang.en import English nlp = English() -doc = Doc(nlp.vocab, words=[u"Hello", u",", u"world", u"!"], +doc = Doc(nlp.vocab, words=["Hello", ",", "world", "!"], spaces=[False, True, False, False]) print([(t.text, t.text_with_ws, t.whitespace_) for t in doc]) ``` @@ -997,8 +976,8 @@ from spacy.tokens import Doc from spacy.lang.en import English nlp = English() -bad_spaces = Doc(nlp.vocab, words=[u"Hello", u",", u"world", u"!"]) -good_spaces = Doc(nlp.vocab, words=[u"Hello", u",", u"world", u"!"], +bad_spaces = Doc(nlp.vocab, words=["Hello", ",", "world", "!"]) +good_spaces = Doc(nlp.vocab, words=["Hello", ",", "world", "!"], spaces=[False, True, False, False]) print(bad_spaces.text) # 'Hello , world !' @@ -1280,7 +1259,7 @@ that yields [`Span`](/api/span) objects. import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"This is a sentence. This is another sentence.") +doc = nlp("This is a sentence. This is another sentence.") for sent in doc.sents: print(sent.text) ``` @@ -1300,7 +1279,7 @@ from spacy.lang.en import English nlp = English() # just the language with no model sentencizer = nlp.create_pipe("sentencizer") nlp.add_pipe(sentencizer) -doc = nlp(u"This is a sentence. This is another sentence.") +doc = nlp("This is a sentence. This is another sentence.") for sent in doc.sents: print(sent.text) ``` @@ -1336,7 +1315,7 @@ take advantage of dependency-based sentence segmentation. ### {executable="true"} import spacy -text = u"this is a sentence...hello...and another sentence." +text = "this is a sentence...hello...and another sentence." 
nlp = spacy.load("en_core_web_sm") doc = nlp(text) diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md index 5df4ab458..a8a478949 100644 --- a/website/docs/usage/models.md +++ b/website/docs/usage/models.md @@ -120,7 +120,7 @@ python -m spacy download en_core_web_sm ```python import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"This is a sentence.") +doc = nlp("This is a sentence.") ``` <Infobox title="Important note" variant="warning"> @@ -197,7 +197,7 @@ nlp = spacy.load("en_core_web_sm") # load model package "en_core_web_s nlp = spacy.load("/path/to/en_core_web_sm") # load package from a directory nlp = spacy.load("en") # load model with shortcut link "en" -doc = nlp(u"This is a sentence.") +doc = nlp("This is a sentence.") ``` <Infobox title="Tip: Preview model info"> @@ -269,7 +269,7 @@ also `import` it and then call its `load()` method with no arguments: import en_core_web_sm nlp = en_core_web_sm.load() -doc = nlp(u"This is a sentence.") +doc = nlp("This is a sentence.") ``` How you choose to load your models ultimately depends on personal preference. diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 51a57d7f5..dcd182965 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -20,7 +20,7 @@ component** on the `Doc`, in order. It then returns the processed `Doc` that you can work with. ```python -doc = nlp(u"This is a text") +doc = nlp("This is a text") ``` When processing large volumes of text, the statistical models are usually more @@ -29,7 +29,7 @@ efficient if you let them work on batches of texts. spaCy's processed `Doc` objects. The batching is done internally. ```diff -texts = [u"This is a text", u"These are lots of texts", u"..."] +texts = ["This is a text", "These are lots of texts", "..."] - docs = [nlp(text) for text in texts] + docs = list(nlp.pipe(texts)) ``` @@ -172,7 +172,7 @@ which is then processed by the component next in the pipeline. ```python ### The pipeline under the hood -doc = nlp.make_doc(u"This is a sentence") # create a Doc from raw text +doc = nlp.make_doc("This is a sentence") # create a Doc from raw text for name, proc in nlp.pipeline: # iterate over components in order doc = proc(doc) # apply each component ``` @@ -263,12 +263,12 @@ blocks. ### Disable for block # 1. Use as a contextmanager with nlp.disable_pipes("tagger", "parser"): - doc = nlp(u"I won't be tagged and parsed") -doc = nlp(u"I will be tagged and parsed") + doc = nlp("I won't be tagged and parsed") +doc = nlp("I will be tagged and parsed") # 2. Restore manually disabled = nlp.disable_pipes("ner") -doc = nlp(u"I won't have named entities") +doc = nlp("I won't have named entities") disabled.restore() ``` @@ -295,11 +295,11 @@ initializing a Language class via [`from_disk`](/api/language#from_disk). 
```diff - nlp = spacy.load('en', tagger=False, entity=False) -- doc = nlp(u"I don't want parsed", parse=False) +- doc = nlp("I don't want parsed", parse=False) + nlp = spacy.load("en", disable=["ner"]) + nlp.remove_pipe("parser") -+ doc = nlp(u"I don't want parsed") ++ doc = nlp("I don't want parsed") ``` </Infobox> @@ -376,7 +376,7 @@ def my_component(doc): nlp = spacy.load("en_core_web_sm") nlp.add_pipe(my_component, name="print_info", last=True) print(nlp.pipe_names) # ['tagger', 'parser', 'ner', 'print_info'] -doc = nlp(u"This is a sentence.") +doc = nlp("This is a sentence.") ``` @@ -426,14 +426,14 @@ class EntityMatcher(object): return doc nlp = spacy.load("en_core_web_sm") -terms = (u"cat", u"dog", u"tree kangaroo", u"giant sea spider") +terms = ("cat", "dog", "tree kangaroo", "giant sea spider") entity_matcher = EntityMatcher(nlp, terms, "ANIMAL") nlp.add_pipe(entity_matcher, after="ner") print(nlp.pipe_names) # The components in the pipeline -doc = nlp(u"This is a text about Barack Obama and a tree kangaroo") +doc = nlp("This is a text about Barack Obama and a tree kangaroo") print([(ent.text, ent.label_) for ent in doc.ents]) ``` @@ -471,7 +471,7 @@ def custom_sentencizer(doc): nlp = spacy.load("en_core_web_sm") nlp.add_pipe(custom_sentencizer, before="parser") # Insert before the parser -doc = nlp(u"This is. A sentence. | This is. Another sentence.") +doc = nlp("This is. A sentence. | This is. Another sentence.") for sent in doc.sents: print(sent.text) ``` @@ -517,7 +517,7 @@ config parameters are passed all the way down from components with custom settings: ```python -nlp = spacy.load("your_custom_model", terms=(u"tree kangaroo"), label="ANIMAL") +nlp = spacy.load("your_custom_model", terms=["tree kangaroo"], label="ANIMAL") ``` <Infobox title="Important note" variant="warning"> @@ -617,7 +617,7 @@ raise an `AttributeError`. ### Example from spacy.tokens import Doc, Span, Token -fruits = [u"apple", u"pear", u"banana", u"orange", u"strawberry"] +fruits = ["apple", "pear", "banana", "orange", "strawberry"] is_fruit_getter = lambda token: token.text in fruits has_fruit_getter = lambda obj: any([t.text in fruits for t in obj]) @@ -629,7 +629,7 @@ Span.set_extension("has_fruit", getter=has_fruit_getter) > #### Usage example > > ```python -> doc = nlp(u"I have an apple and a melon") +> doc = nlp("I have an apple and a melon") > assert doc[3]._.is_fruit # get Token attributes > assert not doc[0]._.is_fruit > assert doc._.has_fruit # get Doc attributes diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index 1d67625a5..4c398ecd0 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -90,7 +90,7 @@ the pattern is not going to produce any results. When developing complex patterns, make sure to check examples against spaCy's tokenization: ```python -doc = nlp(u"A complex-example,!") +doc = nlp("A complex-example,!") print([token.text for token in doc]) ``` @@ -113,7 +113,7 @@ matcher = Matcher(nlp.vocab) pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}] matcher.add("HelloWorld", None, pattern) -doc = nlp(u"Hello, world! Hello world!") +doc = nlp("Hello, world! 
Hello world!") matches = matcher(doc) for match_id, start, end in matches: string_id = nlp.vocab.strings[match_id] # Get string representation @@ -447,7 +447,7 @@ def add_event_ent(matcher, doc, i, matches): pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}] matcher.add("GoogleIO", add_event_ent, pattern) -doc = nlp(u"This is a text about Google I/O") +doc = nlp("This is a text about Google I/O") matches = matcher(doc) ``` @@ -539,7 +539,7 @@ class BadHTMLMerger(object): nlp = spacy.load("en_core_web_sm") html_merger = BadHTMLMerger(nlp) nlp.add_pipe(html_merger, last=True) # Add component to the pipeline -doc = nlp(u"Hello<br>world! <br/> This is a test.") +doc = nlp("Hello<br>world! <br/> This is a test.") for token in doc: print(token.text, token._.bad_html) @@ -617,7 +617,7 @@ def collect_sents(matcher, doc, i, matches): pattern = [{"LOWER": "facebook"}, {"LEMMA": "be"}, {"POS": "ADV", "OP": "*"}, {"POS": "ADJ"}] matcher.add("FacebookIs", collect_sents, pattern) # add pattern -doc = nlp(u"I'd say that Facebook is evil. – Facebook is pretty cool, right?") +doc = nlp("I'd say that Facebook is evil. – Facebook is pretty cool, right?") matches = matcher(doc) # Serve visualization of sentences containing match with displaCy @@ -673,7 +673,7 @@ pattern = [{"ORTH": "("}, {"SHAPE": "ddd"}, {"ORTH": ")"}, {"SHAPE": "ddd"}, {"ORTH": "-", "OP": "?"}, {"SHAPE": "ddd"}] matcher.add("PHONE_NUMBER", None, pattern) -doc = nlp(u"Call me at (123) 456 789 or (123) 456 789!") +doc = nlp("Call me at (123) 456 789 or (123) 456 789!") print([t.text for t in doc]) matches = matcher(doc) for match_id, start, end in matches: @@ -719,8 +719,8 @@ from spacy.matcher import Matcher nlp = English() # We only want the tokenizer, so no need to load a model matcher = Matcher(nlp.vocab) -pos_emoji = [u"😀", u"😃", u"😂", u"🤣", u"😊", u"😍"] # Positive emoji -neg_emoji = [u"😞", u"😠", u"😩", u"😢", u"😭", u"😒"] # Negative emoji +pos_emoji = ["😀", "😃", "😂", "🤣", "😊", "😍"] # Positive emoji +neg_emoji = ["😞", "😠", "😩", "😢", "😭", "😒"] # Negative emoji # Add patterns to match one or more emoji tokens pos_patterns = [[{"ORTH": emoji}] for emoji in pos_emoji] @@ -740,7 +740,7 @@ matcher.add("SAD", label_sentiment, *neg_patterns) # Add negative pattern # Add pattern for valid hashtag, i.e. 
'#' plus any ASCII token matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ASCII": True}]) -doc = nlp(u"Hello world 😀 #MondayMotivation") +doc = nlp("Hello world 😀 #MondayMotivation") matches = matcher(doc) for match_id, start, end in matches: string_id = doc.vocab.strings[match_id] # Look up string ID @@ -797,7 +797,7 @@ matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ASCII": True}]) # Register token extension Token.set_extension("is_hashtag", default=False) -doc = nlp(u"Hello world 😀 #MondayMotivation") +doc = nlp("Hello world 😀 #MondayMotivation") matches = matcher(doc) hashtags = [] for match_id, start, end in matches: @@ -838,13 +838,13 @@ from spacy.matcher import PhraseMatcher nlp = spacy.load('en_core_web_sm') matcher = PhraseMatcher(nlp.vocab) -terms = [u"Barack Obama", u"Angela Merkel", u"Washington, D.C."] +terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."] # Only run nlp.make_doc to speed things up patterns = [nlp.make_doc(text) for text in terms] matcher.add("TerminologyList", None, *patterns) -doc = nlp(u"German Chancellor Angela Merkel and US President Barack Obama " - u"converse in the Oval Office inside the White House in Washington, D.C.") +doc = nlp("German Chancellor Angela Merkel and US President Barack Obama " + "converse in the Oval Office inside the White House in Washington, D.C.") matches = matcher(doc) for match_id, start, end in matches: span = doc[start:end] @@ -853,8 +853,8 @@ for match_id, start, end in matches: Since spaCy is used for processing both the patterns and the text to be matched, you won't have to worry about specific tokenization – for example, you can -simply pass in `nlp(u"Washington, D.C.")` and won't have to write a complex -token pattern covering the exact tokenization of the term. +simply pass in `nlp("Washington, D.C.")` and won't have to write a complex token +pattern covering the exact tokenization of the term. <Infobox title="Important note on creating patterns" variant="warning"> @@ -889,10 +889,10 @@ from spacy.matcher import PhraseMatcher nlp = English() matcher = PhraseMatcher(nlp.vocab, attr="LOWER") -patterns = [nlp.make_doc(name) for name in [u"Angela Merkel", u"Barack Obama"]] +patterns = [nlp.make_doc(name) for name in ["Angela Merkel", "Barack Obama"]] matcher.add("Names", None, *patterns) -doc = nlp(u"angela merkel and us president barack Obama") +doc = nlp("angela merkel and us president barack Obama") for match_id, start, end in matcher(doc): print("Matched based on lowercase token text:", doc[start:end]) ``` @@ -924,9 +924,9 @@ from spacy.matcher import PhraseMatcher nlp = English() matcher = PhraseMatcher(nlp.vocab, attr="SHAPE") -matcher.add("IP", None, nlp(u"127.0.0.1"), nlp(u"127.127.0.0")) +matcher.add("IP", None, nlp("127.0.0.1"), nlp("127.127.0.0")) -doc = nlp(u"Often the router will have an IP address such as 192.168.1.1 or 192.168.2.1.") +doc = nlp("Often the router will have an IP address such as 192.168.1.1 or 192.168.2.1.") for match_id, start, end in matcher(doc): print("Matched based on token shape:", doc[start:end]) ``` @@ -982,7 +982,7 @@ patterns = [{"label": "ORG", "pattern": "Apple"}, ruler.add_patterns(patterns) nlp.add_pipe(ruler) -doc = nlp(u"Apple is opening its first big office in San Francisco.") +doc = nlp("Apple is opening its first big office in San Francisco.") print([(ent.text, ent.label_) for ent in doc.ents]) ``` @@ -1006,7 +1006,7 @@ patterns = [{"label": "ORG", "pattern": "MyCorp Inc."}] ruler.add_patterns(patterns) nlp.add_pipe(ruler) -doc = nlp(u"MyCorp Inc. 
is a company in the U.S.") +doc = nlp("MyCorp Inc. is a company in the U.S.") print([(ent.text, ent.label_) for ent in doc.ents]) ``` diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index 1ad4824fa..d592277aa 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -64,7 +64,7 @@ _then_ loads in the binary data. You can read more about this process > #### Example > > ```python -> doc = nlp(u"This is a text.") +> doc = nlp("This is a text.") > data = pickle.dumps(doc) > ``` @@ -84,8 +84,8 @@ the _same_ `Vocab` object, it will only be included once. ```python ### Pickling objects with shared data {highlight="8-9"} -doc1 = nlp(u"Hello world") -doc2 = nlp(u"This is a test") +doc1 = nlp("Hello world") +doc2 = nlp("This is a test") doc1_data = pickle.dumps(doc1) doc2_data = pickle.dumps(doc2) @@ -347,7 +347,7 @@ spaCy is now able to create the pipeline component `'snek'`: >>> nlp = English() >>> snek = nlp.create_pipe("snek") # this now works! 🐍🎉 >>> nlp.add_pipe(snek) ->>> doc = nlp(u"I am snek") +>>> doc = nlp("I am snek") --..,_ _,.--. `'.'. .'`__ o `;__. '.'. .'.'` '---'` ` @@ -497,8 +497,8 @@ If you're training a named entity recognition model for a custom domain, you may end up training different labels that don't have pre-defined colors in the [`displacy` visualizer](/usage/visualizers#ent). The `spacy_displacy_colors` entry point lets you define a dictionary of entity labels mapped to their color -values. It's added to the pre-defined colors and can also overwrite -existing values. +values. It's added to the pre-defined colors and can also overwrite existing +values. > #### Domain-specific NER labels > @@ -528,8 +528,8 @@ setup( ``` After installing the package, the the custom colors will be used when -visualizing text with `displacy`. Whenever the label `SNEK` is assigned, it -will be displayed in `#3dff74`. +visualizing text with `displacy`. Whenever the label `SNEK` is assigned, it will +be displayed in `#3dff74`. import DisplaCyEntSnekHtml from 'images/displacy-ent-snek.html' diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md index 081b6d896..12d789410 100644 --- a/website/docs/usage/spacy-101.md +++ b/website/docs/usage/spacy-101.md @@ -179,7 +179,7 @@ processed `Doc`: import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion") +doc = nlp("Apple is looking at buying U.K. startup for $1 billion") for token in doc: print(token.text, token.pos_, token.dep_) ``` @@ -240,8 +240,8 @@ of a model, see the usage guides on <Infobox title="📖 Entity Linking"> -To learn more about entity linking in spaCy, and how to **train and update** -the entity linker predictions, see the usage guides on +To learn more about entity linking in spaCy, and how to **train and update** the +entity linker predictions, see the usage guides on [entity linking](/usage/linguistic-features#entity-linking) and [training the entity linker](/usage/training#entity-linker). @@ -307,8 +307,8 @@ its hash, or a hash to get its string: import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"I love coffee") -print(doc.vocab.strings[u"coffee"]) # 3197928453018144401 +doc = nlp("I love coffee") +print(doc.vocab.strings["coffee"]) # 3197928453018144401 print(doc.vocab.strings[3197928453018144401]) # 'coffee' ``` @@ -331,7 +331,7 @@ ever change. Its hash value will also always be the same. 
import spacy
 
 nlp = spacy.load("en_core_web_sm")
-doc = nlp(u"I love coffee")
+doc = nlp("I love coffee")
 for word in doc:
     lexeme = doc.vocab[word.text]
     print(lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_,
@@ -372,14 +372,14 @@ from spacy.tokens import Doc
 from spacy.vocab import Vocab
 
 nlp = spacy.load("en_core_web_sm")
-doc = nlp(u"I love coffee")  # Original Doc
-print(doc.vocab.strings[u"coffee"])  # 3197928453018144401
+doc = nlp("I love coffee")  # Original Doc
+print(doc.vocab.strings["coffee"])  # 3197928453018144401
 print(doc.vocab.strings[3197928453018144401])  # 'coffee' 👍
 
 empty_doc = Doc(Vocab())  # New Doc with empty Vocab
 # empty_doc.vocab.strings[3197928453018144401] will raise an error :(
 
-empty_doc.vocab.strings.add(u"coffee")  # Add "coffee" and generate hash
+empty_doc.vocab.strings.add("coffee")  # Add "coffee" and generate hash
 print(empty_doc.vocab.strings[3197928453018144401])  # 'coffee' 👍
 
 new_doc = Doc(doc.vocab)  # Create new doc with first doc's vocab
@@ -396,20 +396,24 @@ it.
 
 ## Knowledge Base {#kb}
 
 To support the entity linking task, spaCy stores external knowledge in a
-[`KnowledgeBase`](/api/kb). The knowledge base (KB) uses the `Vocab` to store its
-data efficiently.
+[`KnowledgeBase`](/api/kb). The knowledge base (KB) uses the `Vocab` to store
+its data efficiently.
 
 > - **Mention**: A textual occurrence of a named entity, e.g. 'Miss Lovelace'.
-> - **KB ID**: A unique identifier refering to a particular real-world concept, e.g. 'Q7259'.
-> - **Alias**: A plausible synonym or description for a certain KB ID, e.g. 'Ada Lovelace'.
-> - **Prior probability**: The probability of a certain mention resolving to a certain KB ID,
-prior to knowing anything about the context in which the mention is used.
-> - **Entity vector**: A pretrained word vector capturing the entity description.
-
-A knowledge base is created by first adding all entities to it. Next, for each
-potential mention or alias, a list of relevant KB IDs and their prior probabilities
-is added. The sum of these prior probabilities should never exceed 1 for any given alias.
+> - **KB ID**: A unique identifier referring to a particular real-world concept,
+>   e.g. 'Q7259'.
+> - **Alias**: A plausible synonym or description for a certain KB ID, e.g. 'Ada
+>   Lovelace'.
+> - **Prior probability**: The probability of a certain mention resolving to a
+>   certain KB ID, prior to knowing anything about the context in which the
+>   mention is used.
+> - **Entity vector**: A pretrained word vector capturing the entity
+>   description.
+A knowledge base is created by first adding all entities to it. Next, for each
+potential mention or alias, a list of relevant KB IDs and their prior
+probabilities is added. The sum of these prior probabilities should never exceed
+1 for any given alias.
 
 ```python
 ### {executable="true"}
@@ -436,10 +440,10 @@ print("Number of aliases in KB:", kb.get_size_aliases())  # 2
 
 ### Candidate generation
 
-Given a textual entity, the Knowledge Base can provide a list of plausible candidates or
-entity identifiers. The [`EntityLinker`](/api/entitylinker) will take this list of candidates
-as input, and disambiguate the mention to the most probable identifier, given the
-document context.
+Given a textual entity, the Knowledge Base can provide a list of plausible
+candidates or entity identifiers. The [`EntityLinker`](/api/entitylinker) will
+take this list of candidates as input, and disambiguate the mention to the most
+probable identifier, given the document context.
```python ### {executable="true"} @@ -520,11 +524,11 @@ python -m spacy download de_core_news_sm import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"Hello, world. Here are two sentences.") +doc = nlp("Hello, world. Here are two sentences.") print([t.text for t in doc]) nlp_de = spacy.load("de_core_news_sm") -doc_de = nlp_de(u"Ich bin ein Berliner.") +doc_de = nlp_de("Ich bin ein Berliner.") print([t.text for t in doc_de]) ``` @@ -543,8 +547,8 @@ print([t.text for t in doc_de]) import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"Peach emoji is where it has always been. Peach is the superior " - u"emoji. It's outranking eggplant 🍑 ") +doc = nlp("Peach emoji is where it has always been. Peach is the superior " + "emoji. It's outranking eggplant 🍑 ") print(doc[0].text) # 'Peach' print(doc[1].text) # 'emoji' print(doc[-1].text) # '🍑' @@ -572,7 +576,7 @@ print(sentences[1].text) # 'Peach is the superior emoji.' import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion") +doc = nlp("Apple is looking at buying U.K. startup for $1 billion") apple = doc[0] print("Fine-grained POS tag", apple.pos_, apple.pos) print("Coarse-grained POS tag", apple.tag_, apple.tag) @@ -600,20 +604,20 @@ print("Like an email address?", billion.like_email) import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"I love coffee") +doc = nlp("I love coffee") -coffee_hash = nlp.vocab.strings[u"coffee"] # 3197928453018144401 +coffee_hash = nlp.vocab.strings["coffee"] # 3197928453018144401 coffee_text = nlp.vocab.strings[coffee_hash] # 'coffee' print(coffee_hash, coffee_text) print(doc[2].orth, coffee_hash) # 3197928453018144401 print(doc[2].text, coffee_text) # 'coffee' -beer_hash = doc.vocab.strings.add(u"beer") # 3073001599257881079 +beer_hash = doc.vocab.strings.add("beer") # 3073001599257881079 beer_text = doc.vocab.strings[beer_hash] # 'beer' print(beer_hash, beer_text) -unicorn_hash = doc.vocab.strings.add(u"🦄 ") # 18234233413267120783 -unicorn_text = doc.vocab.strings[unicorn_hash] # '🦄 ' +unicorn_hash = doc.vocab.strings.add("🦄") # 18234233413267120783 +unicorn_text = doc.vocab.strings[unicorn_hash] # '🦄' print(unicorn_hash, unicorn_text) ``` @@ -629,19 +633,17 @@ print(unicorn_hash, unicorn_text) ```python ### {executable="true"} import spacy - -nlp = spacy.load("en_core_web_sm") -doc = nlp(u"San Francisco considers banning sidewalk delivery robots") -for ent in doc.ents: - print(ent.text, ent.start_char, ent.end_char, ent.label_) - from spacy.tokens import Span -doc = nlp(u"FB is hiring a new VP of global policy") -doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings[u"ORG"])] +nlp = spacy.load("en_core_web_sm") +doc = nlp("San Francisco considers banning sidewalk delivery robots") for ent in doc.ents: print(ent.text, ent.start_char, ent.end_char, ent.label_) +doc = nlp("FB is hiring a new VP of global policy") +doc.ents = [Span(doc, 0, 1, label="ORG")] +for ent in doc.ents: + print(ent.text, ent.start_char, ent.end_char, ent.label_) ``` <Infobox> @@ -657,7 +659,7 @@ import spacy import random nlp = spacy.load("en_core_web_sm") -train_data = [(u"Uber blew through $1 million", {"entities": [(0, 4, "ORG")]})] +train_data = [("Uber blew through $1 million", {"entities": [(0, 4, "ORG")]})] other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"] with nlp.disable_pipes(*other_pipes): @@ -685,11 +687,11 @@ nlp.to_disk("/model") ```python from spacy import displacy -doc_dep = nlp(u"This is a sentence.") +doc_dep = nlp("This is a 
sentence.") displacy.serve(doc_dep, style="dep") -doc_ent = nlp(u"When Sebastian Thrun started working on self-driving cars at Google " - u"in 2007, few people outside of the company took him seriously.") +doc_ent = nlp("When Sebastian Thrun started working on self-driving cars at Google " + "in 2007, few people outside of the company took him seriously.") displacy.serve(doc_ent, style="ent") ``` @@ -707,7 +709,7 @@ displacy.serve(doc_ent, style="ent") import spacy nlp = spacy.load("en_core_web_md") -doc = nlp(u"Apple and banana are similar. Pasta and hippo aren't.") +doc = nlp("Apple and banana are similar. Pasta and hippo aren't.") apple = doc[0] banana = doc[2] @@ -769,7 +771,7 @@ pattern2 = [[{"ORTH": emoji, "OP": "+"}] for emoji in ["😀", "😂", "🤣", " matcher.add("GoogleIO", None, pattern1) # Match "Google I/O" or "Google i/o" matcher.add("HAPPY", set_sentiment, *pattern2) # Match one or more happy emoji -doc = nlp(u"A text about Google I/O 😀😀") +doc = nlp("A text about Google I/O 😀😀") matches = matcher(doc) for match_id, start, end in matches: @@ -789,7 +791,7 @@ print("Sentiment", doc.sentiment) ### Minibatched stream processing {#lightning-tour-minibatched} ```python -texts = [u"One document.", u"...", u"Lots of documents"] +texts = ["One document.", "...", "Lots of documents"] # .pipe streams input, and produces streaming output iter_texts = (texts[i % 3] for i in range(100000000)) for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50)): @@ -805,8 +807,8 @@ for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50)): import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"When Sebastian Thrun started working on self-driving cars at Google " - u"in 2007, few people outside of the company took him seriously.") +doc = nlp("When Sebastian Thrun started working on self-driving cars at Google " + "in 2007, few people outside of the company took him seriously.") dep_labels = [] for token in doc: @@ -831,7 +833,7 @@ import spacy from spacy.attrs import ORTH, LIKE_URL nlp = spacy.load("en_core_web_sm") -doc = nlp(u"Check out https://spacy.io") +doc = nlp("Check out https://spacy.io") for token in doc: print(token.text, token.orth, token.like_url) @@ -877,7 +879,7 @@ def put_spans_around_tokens(doc): nlp = spacy.load("en_core_web_sm") -doc = nlp(u"This is a test.\\n\\nHello world.") +doc = nlp("This is a test.\\n\\nHello world.") html = put_spans_around_tokens(doc) print(html) ``` diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 9489615bc..f84fd0ed4 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -298,10 +298,10 @@ imports. It also makes it easier to structure and load your training data. 
```python ### Simple training loop TRAIN_DATA = [ - (u"Uber blew through $1 million a week", {"entities": [(0, 4, "ORG")]}), - (u"Google rebrands its business apps", {"entities": [(0, 6, "ORG")]})] + ("Uber blew through $1 million a week", {"entities": [(0, 4, "ORG")]}), + ("Google rebrands its business apps", {"entities": [(0, 6, "ORG")]})] -nlp = spacy.blank('en') +nlp = spacy.blank("en") optimizer = nlp.begin_training() for i in range(20): random.shuffle(TRAIN_DATA) @@ -498,7 +498,7 @@ like this:  ```python -doc = nlp(u"find a hotel with good wifi") +doc = nlp("find a hotel with good wifi") print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != '-']) # [('find', 'ROOT', 'find'), ('hotel', 'PLACE', 'find'), # ('good', 'QUALITY', 'wifi'), ('wifi', 'ATTRIBUTE', 'hotel')] diff --git a/website/docs/usage/v2-1.md b/website/docs/usage/v2-1.md index d3c9fb504..4a8ef5a37 100644 --- a/website/docs/usage/v2-1.md +++ b/website/docs/usage/v2-1.md @@ -99,8 +99,8 @@ flexibility. > > ```python > matcher = PhraseMatcher(nlp.vocab, attr="POS") -> matcher.add("PATTERN", None, nlp(u"I love cats")) -> doc = nlp(u"You like dogs") +> matcher.add("PATTERN", None, nlp("I love cats")) +> doc = nlp("You like dogs") > matches = matcher(doc) > ``` @@ -122,9 +122,9 @@ or `POS` for finding sequences of the same part-of-speech tags. > #### Example > > ```python -> doc = nlp(u"I like David Bowie") +> doc = nlp("I like David Bowie") > with doc.retokenize() as retokenizer: -> attrs = {"LEMMA": u"David Bowie"} +> attrs = {"LEMMA": "David Bowie"} > retokenizer.merge(doc[2:4], attrs=attrs) > ``` diff --git a/website/docs/usage/v2.md b/website/docs/usage/v2.md index 9e54106c7..d7011fb2d 100644 --- a/website/docs/usage/v2.md +++ b/website/docs/usage/v2.md @@ -156,7 +156,7 @@ spaCy or plug in your own machine learning models. > for itn in range(100): > for doc, gold in train_data: > nlp.update([doc], [gold]) -> doc = nlp(u"This is a text.") +> doc = nlp("This is a text.") > print(doc.cats) > ``` @@ -179,13 +179,13 @@ network to assign position-sensitive vectors to each word in the document. > #### Example > > ```python -> doc = nlp(u"I love coffee") -> assert doc.vocab.strings[u"coffee"] == 3197928453018144401 -> assert doc.vocab.strings[3197928453018144401] == u"coffee" +> doc = nlp("I love coffee") +> assert doc.vocab.strings["coffee"] == 3197928453018144401 +> assert doc.vocab.strings[3197928453018144401] == "coffee" > -> beer_hash = doc.vocab.strings.add(u"beer") -> assert doc.vocab.strings[u"beer"] == beer_hash -> assert doc.vocab.strings[beer_hash] == u"beer" +> beer_hash = doc.vocab.strings.add("beer") +> assert doc.vocab.strings["beer"] == beer_hash +> assert doc.vocab.strings[beer_hash] == "beer" > ``` The [`StringStore`](/api/stringstore) now resolves all strings to hash values @@ -275,7 +275,7 @@ language, you can import the class directly, e.g. > > ```python > from spacy import displacy -> doc = nlp(u"This is a sentence about Facebook.") +> doc = nlp("This is a sentence about Facebook.") > displacy.serve(doc, style="dep") # run the web server > html = displacy.render(doc, style="ent") # generate HTML > ``` @@ -322,7 +322,7 @@ lookup-based lemmatization – and **many new languages**! 
> matcher.add('HEARTS', None, [{"ORTH": "❤️", "OP": '+'}]) > > phrasematcher = PhraseMatcher(nlp.vocab) -> phrasematcher.add("OBAMA", None, nlp(u"Barack Obama")) +> phrasematcher.add("OBAMA", None, nlp("Barack Obama")) > ``` Patterns can now be added to the matcher by calling @@ -477,12 +477,12 @@ to the `disable` keyword argument on load, or by using [`disable_pipes`](/api/language#disable_pipes) as a method or context manager: ```diff -- nlp = spacy.load("en", tagger=False, entity=False) -- doc = nlp(u"I don't want parsed", parse=False) +- nlp = spacy.load("en_core_web_sm", tagger=False, entity=False) +- doc = nlp("I don't want parsed", parse=False) -+ nlp = spacy.load("en", disable=["tagger", "ner"]) ++ nlp = spacy.load("en_core_web_sm", disable=["tagger", "ner"]) + with nlp.disable_pipes("parser"): -+ doc = nlp(u"I don't want parsed") ++ doc = nlp("I don't want parsed") ``` To add spaCy's built-in pipeline components to your pipeline, you can still @@ -539,7 +539,7 @@ This means that your application can – and should – only pass around `Doc` objects and refer to them as the single source of truth. ```diff -- doc = nlp(u"This is a regular doc") +- doc = nlp("This is a regular doc") - doc_array = doc.to_array(["ORTH", "POS"]) - doc_with_meta = {"doc_array": doc_array, "meta": get_doc_meta(doc_array)} @@ -556,11 +556,11 @@ utilities that interact with the pipeline, consider moving this logic into its own extension module. ```diff -- doc = nlp(u"Doc with a standard pipeline") +- doc = nlp("Doc with a standard pipeline") - meta = get_meta(doc) + nlp.add_pipe(meta_component) -+ doc = nlp(u"Doc with a custom pipeline that assigns meta") ++ doc = nlp("Doc with a custom pipeline that assigns meta") + meta = doc._.meta ``` @@ -572,12 +572,12 @@ to call [`StringStore.add`](/api/stringstore#add) explicitly. You can also now be sure that the string-to-hash mapping will always match across vocabularies. ```diff -- nlp.vocab.strings[u"coffee"] # 3672 -- other_nlp.vocab.strings[u"coffee"] # 40259 +- nlp.vocab.strings["coffee"] # 3672 +- other_nlp.vocab.strings["coffee"] # 40259 -+ nlp.vocab.strings.add(u"coffee") -+ nlp.vocab.strings[u"coffee"] # 3197928453018144401 -+ other_nlp.vocab.strings[u"coffee"] # 3197928453018144401 ++ nlp.vocab.strings.add("coffee") ++ nlp.vocab.strings["coffee"] # 3197928453018144401 ++ other_nlp.vocab.strings["coffee"] # 3197928453018144401 ``` ### Adding patterns and callbacks to the matcher {#migrating-matcher} diff --git a/website/docs/usage/vectors-similarity.md b/website/docs/usage/vectors-similarity.md index f7c9d1cd9..53648f66e 100644 --- a/website/docs/usage/vectors-similarity.md +++ b/website/docs/usage/vectors-similarity.md @@ -74,8 +74,8 @@ path to [`spacy.load()`](/api/top-level#spacy.load). 
```python nlp_latin = spacy.load("/tmp/la_vectors_wiki_lg") -doc1 = nlp_latin(u"Caecilius est in horto") -doc2 = nlp_latin(u"servus est in atrio") +doc1 = nlp_latin("Caecilius est in horto") +doc2 = nlp_latin("servus est in atrio") doc1.similarity(doc2) ``` @@ -168,10 +168,9 @@ vectors to the vocabulary, you can use the ### Adding vectors from spacy.vocab import Vocab -vector_data = {u"dog": numpy.random.uniform(-1, 1, (300,)), - u"cat": numpy.random.uniform(-1, 1, (300,)), - u"orange": numpy.random.uniform(-1, 1, (300,))} - +vector_data = {"dog": numpy.random.uniform(-1, 1, (300,)), + "cat": numpy.random.uniform(-1, 1, (300,)), + "orange": numpy.random.uniform(-1, 1, (300,))} vocab = Vocab() for word, vector in vector_data.items(): vocab.set_vector(word, vector) @@ -241,7 +240,7 @@ import cupy.cuda from spacy.vectors import Vectors vector_table = numpy.zeros((3, 300), dtype="f") -vectors = Vectors([u"dog", u"cat", u"orange"], vector_table) +vectors = Vectors(["dog", "cat", "orange"], vector_table) with cupy.cuda.Device(0): vectors.data = cupy.asarray(vectors.data) ``` @@ -252,6 +251,6 @@ import torch from spacy.vectors import Vectors vector_table = numpy.zeros((3, 300), dtype="f") -vectors = Vectors([u"dog", u"cat", u"orange"], vector_table) +vectors = Vectors(["dog", "cat", "orange"], vector_table) vectors.data = torch.Tensor(vectors.data).cuda(0) ``` diff --git a/website/docs/usage/visualizers.md b/website/docs/usage/visualizers.md index 6172d2f48..dd0b0eb50 100644 --- a/website/docs/usage/visualizers.md +++ b/website/docs/usage/visualizers.md @@ -48,7 +48,7 @@ import spacy from spacy import displacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"This is a sentence.") +doc = nlp("This is a sentence.") displacy.serve(doc, style="dep") ``` @@ -101,7 +101,7 @@ import spacy from spacy import displacy nlp = spacy.load("en_core_web_sm") -text = u"""In ancient Rome, some neighbors live in three adjacent houses. In the center is the house of Senex, who lives there with wife Domina, son Hero, and several slaves, including head slave Hysterium and the musical's main character Pseudolus. A slave belonging to Hero, Pseudolus wishes to buy, win, or steal his freedom. One of the neighboring houses is owned by Marcus Lycus, who is a buyer and seller of beautiful women; the other belongs to the ancient Erronius, who is abroad searching for his long-lost children (stolen in infancy by pirates). One day, Senex and Domina go on a trip and leave Pseudolus in charge of Hero. Hero confides in Pseudolus that he is in love with the lovely Philia, one of the courtesans in the House of Lycus (albeit still a virgin).""" +text = """In ancient Rome, some neighbors live in three adjacent houses. In the center is the house of Senex, who lives there with wife Domina, son Hero, and several slaves, including head slave Hysterium and the musical's main character Pseudolus. A slave belonging to Hero, Pseudolus wishes to buy, win, or steal his freedom. One of the neighboring houses is owned by Marcus Lycus, who is a buyer and seller of beautiful women; the other belongs to the ancient Erronius, who is abroad searching for his long-lost children (stolen in infancy by pirates). One day, Senex and Domina go on a trip and leave Pseudolus in charge of Hero. Hero confides in Pseudolus that he is in love with the lovely Philia, one of the courtesans in the House of Lycus (albeit still a virgin).""" doc = nlp(text) sentence_spans = list(doc.sents) displacy.serve(sentence_spans, style="dep") @@ -117,7 +117,7 @@ text. 
import spacy from spacy import displacy -text = u"When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously." +text = "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously." nlp = spacy.load("en_core_web_sm") doc = nlp(text) @@ -168,7 +168,7 @@ add a headline to each visualization, you can add a `title` to its `user_data`. User data is never touched or modified by spaCy. ```python -doc = nlp(u"This is a sentence about Google.") +doc = nlp("This is a sentence about Google.") doc.user_data["title"] = "This is a title" displacy.serve(doc, style="ent") ``` @@ -193,7 +193,7 @@ import spacy from spacy import displacy # In[2]: -doc = nlp(u"Rats are various medium-sized, long-tailed rodents.") +doc = nlp("Rats are various medium-sized, long-tailed rodents.") displacy.render(doc, style="dep") # In[3]: @@ -209,7 +209,6 @@ rendering if auto-detection fails. </Infobox> -  Internally, displaCy imports `display` and `HTML` from `IPython.core.display` @@ -236,8 +235,8 @@ import spacy from spacy import displacy nlp = spacy.load("en_core_web_sm") -doc1 = nlp(u"This is a sentence.") -doc2 = nlp(u"This is another sentence.") +doc1 = nlp("This is a sentence.") +doc2 = nlp("This is another sentence.") html = displacy.render([doc1, doc2], style="dep", page=True) ``` @@ -281,7 +280,7 @@ from spacy import displacy from pathlib import Path nlp = spacy.load("en_core_web_sm") -sentences = [u"This is an example.", u"This is another one."] +sentences = ["This is an example.", "This is another one."] for sent in sentences: doc = nlp(sent) svg = displacy.render(doc, style="dep", jupyter=False) diff --git a/website/meta/universe.json b/website/meta/universe.json index 2997f9300..f9dae7ead 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -119,14 +119,14 @@ "emoji = Emoji(nlp)", "nlp.add_pipe(emoji, first=True)", "", - "doc = nlp(u'This is a test 😻 👍🏿')", + "doc = nlp('This is a test 😻 👍🏿')", "assert doc._.has_emoji == True", "assert doc[2:5]._.has_emoji == True", "assert doc[0]._.is_emoji == False", "assert doc[4]._.is_emoji == True", - "assert doc[5]._.emoji_desc == u'thumbs up dark skin tone'", + "assert doc[5]._.emoji_desc == 'thumbs up dark skin tone'", "assert len(doc._.emoji) == 2", - "assert doc._.emoji[1] == (u'👍🏿', 5, u'thumbs up dark skin tone')" + "assert doc._.emoji[1] == ('👍🏿', 5, 'thumbs up dark skin tone')" ], "author": "Ines Montani", "author_links": { @@ -747,8 +747,8 @@ "s2v = Sense2VecComponent('/path/to/reddit_vectors-1.1.0')", "nlp.add_pipe(s2v)", "", - "doc = nlp(u\"A sentence about natural language processing.\")", - "assert doc[3].text == u'natural language processing'", + "doc = nlp(\"A sentence about natural language processing.\")", + "assert doc[3].text == 'natural language processing'", "freq = doc[3]._.s2v_freq", "vector = doc[3]._.s2v_vec", "most_similar = doc[3]._.s2v_most_similar(3)", @@ -1297,7 +1297,7 @@ "", "nlp = spacy.load('en')", "nlp.add_pipe(BeneparComponent('benepar_en'))", - "doc = nlp(u'The time for action is now. It's never too late to do something.')", + "doc = nlp('The time for action is now. It's never too late to do something.')", "sent = list(doc.sents)[0]", "print(sent._.parse_string)", "# (S (NP (NP (DT The) (NN time)) (PP (IN for) (NP (NN action)))) (VP (VBZ is) (ADVP (RB now))) (. 
.))", diff --git a/website/src/widgets/quickstart-models.js b/website/src/widgets/quickstart-models.js index 83bb4527b..d116fae0a 100644 --- a/website/src/widgets/quickstart-models.js +++ b/website/src/widgets/quickstart-models.js @@ -65,7 +65,7 @@ const QuickstartInstall = ({ id, title, description, defaultLang, children }) => nlp = {pkg}.load() </QS> <QS lang={code} config="example" prompt="python"> - doc = nlp(u"{exampleText}") + doc = nlp("{exampleText}") </QS> <QS lang={code} config="example" prompt="python"> print([ From 25b2b3ff4527f1fba5a2df0756cbe4f843b469a7 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Thu, 12 Sep 2019 16:26:27 +0200 Subject: [PATCH 149/207] Remove LEMMA from exception examples [ci skip] --- website/docs/api/tokenizer.md | 4 ++-- website/docs/api/top-level.md | 4 ++-- website/docs/usage/linguistic-features.md | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index 63c1e87ea..d6ab73f14 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -118,8 +118,8 @@ and examples. > #### Example > > ```python -> from spacy.attrs import ORTH, LEMMA -> case = [{ORTH: "do"}, {ORTH: "n't", LEMMA: "not"}] +> from spacy.attrs import ORTH, NORM +> case = [{ORTH: "do"}, {ORTH: "n't", NORM: "not"}] > tokenizer.add_special_case("don't", case) > ``` diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 0a8f638b2..50ba0e3d9 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -514,9 +514,9 @@ an error if key doesn't match `ORTH` values. > > ```python > BASE = {"a.": [{ORTH: "a."}], ":)": [{ORTH: ":)"}]} -> NEW = {"a.": [{ORTH: "a.", LEMMA: "all"}]} +> NEW = {"a.": [{ORTH: "a.", NORM: "all"}]} > exceptions = util.update_exc(BASE, NEW) -> # {"a.": [{ORTH: "a.", LEMMA: "all"}], ":)": [{ORTH: ":)"}]} +> # {"a.": [{ORTH: "a.", NORM: "all"}], ":)": [{ORTH: ":)"}]} > ``` | Name | Type | Description | diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index a91135d70..7549a3985 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -649,7 +649,7 @@ import Tokenization101 from 'usage/101/\_tokenization.md' data in [`spacy/lang`](https://github.com/explosion/spaCy/tree/master/spacy/lang). The tokenizer exceptions define special cases like "don't" in English, which needs -to be split into two tokens: `{ORTH: "do"}` and `{ORTH: "n't", LEMMA: "not"}`. +to be split into two tokens: `{ORTH: "do"}` and `{ORTH: "n't", NORM: "not"}`. The prefixes, suffixes and infixes mostly define punctuation rules – for example, when to split off periods (at the end of a sentence), and when to leave tokens containing periods intact (abbreviations like "U.S."). From ff51fba96a22185bade05f807add56dc1530adbc Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Thu, 12 Sep 2019 16:26:33 +0200 Subject: [PATCH 150/207] Update lemmaitzer docs [ci skip] --- website/docs/api/lemmatizer.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md index fd71d16cf..8b6d9dcf6 100644 --- a/website/docs/api/lemmatizer.md +++ b/website/docs/api/lemmatizer.md @@ -35,8 +35,8 @@ Lemmatize a string. 
> > ```python > from spacy.lemmatizer import Lemmatizer -> from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES -> lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES) +> rules = {"noun": [["s", ""]]} +> lemmatizer = Lemmatizer(index={}, exceptions={}, rules=rules) > lemmas = lemmatizer("ducks", "NOUN") > assert lemmas == ["duck"] > ``` From 7d782aa97b6a027a574bef491cb50cd3c3e2707d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 12 Sep 2019 16:48:10 +0200 Subject: [PATCH 151/207] Add more docstrings for MorphAnalysis --- spacy/tokens/morphanalysis.pyx | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx index 17b11d84f..e09870741 100644 --- a/spacy/tokens/morphanalysis.pyx +++ b/spacy/tokens/morphanalysis.pyx @@ -20,6 +20,7 @@ cdef class MorphAnalysis: @classmethod def from_id(cls, Vocab vocab, hash_t key): + """Create a morphological analysis from a given ID.""" cdef MorphAnalysis morph = MorphAnalysis.__new__(MorphAnalysis, vocab) morph.vocab = vocab morph.key = key @@ -31,15 +32,18 @@ cdef class MorphAnalysis: return morph def __contains__(self, feature): + """Test whether the morphological analysis contains some feature.""" cdef attr_t feat_id = get_string_id(feature) return check_feature(&self.c, feat_id) def __iter__(self): + """Iterate over the features in the analysis.""" cdef attr_t feature for feature in list_features(&self.c): yield self.vocab.strings[feature] def __len__(self): + """The number of features in the analysis.""" return self.c.length def __str__(self): @@ -52,10 +56,14 @@ cdef class MorphAnalysis: return self.key def get(self, unicode field): + """Retrieve a feature by field.""" cdef int field_id = self.vocab.morphology._feat_map.attr2field[field] return self.vocab.strings[get_field(&self.c, field_id)] def to_json(self): + """Produce a json serializable representation, which will be a list of + strings. + """ return tag_to_json(&self.c) @property From 7d8df6915865b76d6ad44f6c8bbd7e6ddcd1fa06 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann <polm@dampfkraft.com> Date: Fri, 13 Sep 2019 00:26:11 +0900 Subject: [PATCH 152/207] Bloom-filter backed Lookup Tables (#4268) * Improve load_language_data helper * WIP: Add Lookups implementation * Start moving lemma data over to JSON * WIP: move data over for more languages * Convert more languages * Fix lemmatizer fixtures in tests * Finish conversion * Auto-format JSON files * Fix test for now * Make sure tables are stored on instance * Update docstrings * Update docstrings and errors * Update test * Add Lookups.__len__ * Add serialization methods * Add Lookups.remove_table * Use msgpack for serialization to disk * Fix file exists check * Try using OrderedDict for everything * Update .flake8 [ci skip] * Try fixing serialization * Update test_lookups.py * Update test_serialize_vocab_strings.py * Lookups / Tables now work This implements the stubs in the Lookups/Table classes. Currently this is in Cython but with no type declarations, so that could be improved. * Add lookups to setup.py * Actually add lookups pyx The previous commit added the old py file... 
* Lookups work-in-progress * Move from pyx back to py * Add string based lookups, fix serialization * Update tests, language/lemmatizer to work with string lookups There are some outstanding issues here: - a pickling-related test fails due to the bloom filter - some custom lemmatizers (fr/nl at least) have issues More generally, there's a question of how to deal with the case where you have a string but want to use the lookup table. Currently the table allows access by string or id, but that's getting pretty awkward. * Change lemmatizer lookup method to pass (orth, string) * Fix token lookup * Fix French lookup * Fix lt lemmatizer test * Fix Dutch lemmatizer * Fix lemmatizer lookup test This was using a normal dict instead of a Table, so checks for the string instead of an integer key failed. * Make uk/nl/ru lemmatizer lookup methods consistent The mentioned tokenizers all have their own implementation of the `lookup` method, which accesses a `Lookups` table. The way that was called in `token.pyx` was changed so this should be updated to have the same arguments as `lookup` in `lemmatizer.py` (specificially (orth/id, string)). Prior to this change tests weren't failing, but there would probably be issues with normal use of a model. More tests should proably be added. Additionally, the language-specific `lookup` implementations seem like they might not be needed, since they handle things like lower-casing that aren't actually language specific. * Make recently added Greek method compatible * Remove redundant class/method Leftovers from a merge not cleaned up adequately. --- spacy/lang/el/lemmatizer/__init__.py | 6 +- spacy/lang/fr/lemmatizer/__init__.py | 8 +- spacy/lang/nl/lemmatizer/__init__.py | 17 ++-- spacy/lang/ru/lemmatizer.py | 2 +- spacy/lang/uk/lemmatizer.py | 2 +- spacy/language.py | 1 + spacy/lemmatizer.py | 6 +- spacy/lookups.py | 99 +++++++++++++++++++---- spacy/morphology.pyx | 2 +- spacy/tests/doc/test_creation.py | 4 +- spacy/tests/lang/lt/test_lemmatizer.py | 2 +- spacy/tests/lang/nl/test_lemmatizer.py | 4 +- spacy/tests/vocab_vectors/test_lookups.py | 20 ++--- spacy/tokens/token.pyx | 4 +- spacy/vocab.pyx | 2 +- 15 files changed, 126 insertions(+), 53 deletions(-) diff --git a/spacy/lang/el/lemmatizer/__init__.py b/spacy/lang/el/lemmatizer/__init__.py index c0ce5c2ad..bc5c00bd8 100644 --- a/spacy/lang/el/lemmatizer/__init__.py +++ b/spacy/lang/el/lemmatizer/__init__.py @@ -46,9 +46,9 @@ class GreekLemmatizer(object): ) return lemmas - def lookup(self, string): - if string in self.lookup_table: - return self.lookup_table[string] + def lookup(self, orth, string): + if orth in self.lookup_table: + return self.lookup_table[orth] return string diff --git a/spacy/lang/fr/lemmatizer/__init__.py b/spacy/lang/fr/lemmatizer/__init__.py index a0a0d2021..879f2c80c 100644 --- a/spacy/lang/fr/lemmatizer/__init__.py +++ b/spacy/lang/fr/lemmatizer/__init__.py @@ -52,7 +52,7 @@ class FrenchLemmatizer(object): elif univ_pos in (SCONJ, "SCONJ", "sconj"): univ_pos = "sconj" else: - return [self.lookup(string)] + return [self.lookup(None, string)] # See Issue #435 for example of where this logic is requied. 
if self.is_base_form(univ_pos, morphology): return list(set([string.lower()])) @@ -114,9 +114,9 @@ class FrenchLemmatizer(object): def punct(self, string, morphology=None): return self(string, "punct", morphology) - def lookup(self, string): - if string in self.lookup_table: - return self.lookup_table[string][0] + def lookup(self, orth, string): + if orth is not None and orth in self.lookup_table: + return self.lookup_table[orth][0] return string diff --git a/spacy/lang/nl/lemmatizer/__init__.py b/spacy/lang/nl/lemmatizer/__init__.py index 1e5d9aa1f..db345c088 100644 --- a/spacy/lang/nl/lemmatizer/__init__.py +++ b/spacy/lang/nl/lemmatizer/__init__.py @@ -62,25 +62,25 @@ class DutchLemmatizer(object): # are not lemmatized. They are lowercased, however. return [string] # if string in self.lemma_index.get(univ_pos) - lemma_index = self.index.get(univ_pos, {}) + lemma_index = self.index.get_string(univ_pos, {}) # string is already lemma if string in lemma_index: return [string] - exceptions = self.exc.get(univ_pos, {}) + exceptions = self.exc.get_string(univ_pos, {}) # string is irregular token contained in exceptions index. try: lemma = exceptions[string] return [lemma[0]] except KeyError: pass - # string corresponds to key in lookup table + # string corresponds to key in lookup table lookup_table = self.lookup_table - looked_up_lemma = lookup_table.get(string) + looked_up_lemma = lookup_table.get_string(string) if looked_up_lemma and looked_up_lemma in lemma_index: return [looked_up_lemma] forms, is_known = lemmatize( - string, lemma_index, exceptions, self.rules.get(univ_pos, []) + string, lemma_index, exceptions, self.rules.get_string(univ_pos, []) ) # Back-off through remaining return value candidates. @@ -103,9 +103,12 @@ class DutchLemmatizer(object): # Overrides parent method so that a lowercased version of the string is # used to search the lookup table. This is necessary because our lookup # table consists entirely of lowercase keys. 
- def lookup(self, string): + def lookup(self, orth, string): string = string.lower() - return self.lookup_table.get(string, string) + if orth is not None: + return self.lookup_table.get(orth, string) + else: + return self.lookup_table.get_string(string, string) def noun(self, string, morphology=None): return self(string, "noun", morphology) diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py index 300d61c52..9fc600eb8 100644 --- a/spacy/lang/ru/lemmatizer.py +++ b/spacy/lang/ru/lemmatizer.py @@ -115,7 +115,7 @@ class RussianLemmatizer(Lemmatizer): def pron(self, string, morphology=None): return self(string, "pron", morphology) - def lookup(self, string): + def lookup(self, orth, string): analyses = self._morph.parse(string) if len(analyses) == 1: return analyses[0].normal_form diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py index ab56c824d..ea2c32ee3 100644 --- a/spacy/lang/uk/lemmatizer.py +++ b/spacy/lang/uk/lemmatizer.py @@ -112,7 +112,7 @@ class UkrainianLemmatizer(Lemmatizer): def pron(self, string, morphology=None): return self(string, "pron", morphology) - def lookup(self, string): + def lookup(self, orth, string): analyses = self._morph.parse(string) if len(analyses) == 1: return analyses[0].normal_form diff --git a/spacy/language.py b/spacy/language.py index 7292e3bf6..f966a6630 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -32,6 +32,7 @@ from .lang.tokenizer_exceptions import TOKEN_MATCH from .lang.tag_map import TAG_MAP from .lang.lex_attrs import LEX_ATTRS, is_stop from .errors import Errors, Warnings, deprecation_warning +from .strings import hash_string from . import util from . import about diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index d14f5292e..cfedd7a9d 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -93,9 +93,9 @@ class Lemmatizer(object): def punct(self, string, morphology=None): return self(string, "punct", morphology) - def lookup(self, string): - if string in self.lookup_table: - return self.lookup_table[string] + def lookup(self, orth, string): + if orth in self.lookup_table: + return self.lookup_table[orth] return string diff --git a/spacy/lookups.py b/spacy/lookups.py index a6fa7abff..b3b67ae7b 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -1,4 +1,4 @@ -# coding: utf8 +# coding: utf-8 from __future__ import unicode_literals import srsly @@ -6,7 +6,12 @@ from collections import OrderedDict from .errors import Errors from .util import SimpleFrozenDict, ensure_path +from .strings import hash_string +from . import util + +import srsly +from preshed.bloom import BloomFilter class Lookups(object): """Container for large lookup tables and dictionaries, e.g. lemmatization @@ -14,10 +19,6 @@ class Lookups(object): so they can be accessed before the pipeline components are applied (e.g. in the tokenizer and lemmatizer), as well as within the pipeline components via doc.vocab.lookups. - - Important note: At the moment, this class only performs a very basic - dictionary lookup. We're planning to replace this with a more efficient - implementation. See #3971 for details. """ def __init__(self): @@ -54,8 +55,7 @@ class Lookups(object): """ if name in self.tables: raise ValueError(Errors.E158.format(name=name)) - table = Table(name=name) - table.update(data) + table = Table(name=name, data=data) self._tables[name] = table return table @@ -100,10 +100,9 @@ class Lookups(object): bytes_data (bytes): The data to load. RETURNS (Lookups): The loaded Lookups. 
""" - self._tables = OrderedDict() - msg = srsly.msgpack_loads(bytes_data) - for key, value in msg.items(): - self._tables[key] = Table.from_dict(value) + for key, value in srsly.msgpack_loads(bytes_data).items(): + self._tables[key] = Table(key) + self._tables[key].update_raw(value) return self def to_disk(self, path, **kwargs): @@ -137,8 +136,10 @@ class Lookups(object): class Table(OrderedDict): - """A table in the lookups. Subclass of OrderedDict that implements a - slightly more consistent and unified API. + """A table in the lookups. Subclass of builtin dict that implements a + slightly more consistent and unified API. + + Includes a Bloom filter to speed up missed lookups. """ @classmethod @@ -153,15 +154,81 @@ class Table(OrderedDict): self.update(data) return self - def __init__(self, name=None): + def __init__(self, name=None, data=None): """Initialize a new table. name (unicode): Optional table name for reference. + data (dict): Initial data, used to hint Bloom Filter. RETURNS (Table): The newly created object. """ OrderedDict.__init__(self) self.name = name + # assume a default size of 1M items + size = 1E6 + if data and len(data) > 0: + size = len(data) - def set(self, key, value): - """Set new key/value pair. Same as table[key] = value.""" + self.bloom = BloomFilter.from_error_rate(size) + + if data: + self.update(data) + + def set(self, key, value): + """Set new key/value pair, where key is an integer. Same as + table[key] = value. + """ self[key] = value + + def __setitem__(self, key, value): + OrderedDict.__setitem__(self, key, value) + self.bloom.add(key) + + def set_string(self, key, value): + """Set new key/value pair, where key is a string to be hashed. + """ + hkey = hash_string(key) + self.set(hkey, value) + + def update(self, data): + """Add entries in a dict-like to the table, where keys are strings to + be hashed. + """ + for key, val in data.items(): + self.set_string(key, val) + + def update_raw(self, data): + """Add entries in a dict-like to the table, where keys are ints. + """ + for key, val in data.items(): + self.set(key, val) + + def get(self, key, default=None): + return OrderedDict.get(self, key, default) + + def get_string(self, key, default=None): + hkey = hash_string(key) + return OrderedDict.get(self, hkey, default) + + def __contains__(self, key): + # This can give a false positive, so we need to check it after + if key not in self.bloom: + return False + return OrderedDict.__contains__(self, key) + + def contains_string(self, key): + hkey = hash_string(key) + return self.__contains__(hkey) + + def to_bytes(self): + # TODO: serialize bloom too. For now just reconstruct it. 
+ return srsly.msgpack_dumps({'name': self.name, 'dict': dict(self.items())}) + + def from_bytes(self, data): + loaded = srsly.msgpack_loads(data) + self.name = loaded['name'] + for key, val in loaded['dict'].items(): + self[key] = val + self.bloom.add(key) + + return self + diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index bf7aaced0..8cc27fb7d 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -273,7 +273,7 @@ cdef class Morphology: """ if token.lemma == 0: orth_str = self.strings[token.lex.orth] - lemma = self.lemmatizer.lookup(orth_str) + lemma = self.lemmatizer.lookup(token.lex.orth, orth_str) token.lemma = self.strings.add(lemma) cdef int assign_tag(self, TokenC* token, tag_str) except -1: diff --git a/spacy/tests/doc/test_creation.py b/spacy/tests/doc/test_creation.py index ce42b39b9..b222f6bf0 100644 --- a/spacy/tests/doc/test_creation.py +++ b/spacy/tests/doc/test_creation.py @@ -5,11 +5,13 @@ import pytest from spacy.vocab import Vocab from spacy.tokens import Doc from spacy.lemmatizer import Lemmatizer +from spacy.lookups import Table @pytest.fixture def lemmatizer(): - return Lemmatizer(lookup={"dogs": "dog", "boxen": "box", "mice": "mouse"}) + lookup = Table(data={"dogs": "dog", "boxen": "box", "mice": "mouse"}) + return Lemmatizer(lookup=lookup) @pytest.fixture diff --git a/spacy/tests/lang/lt/test_lemmatizer.py b/spacy/tests/lang/lt/test_lemmatizer.py index 9b2969849..5c3ed34f8 100644 --- a/spacy/tests/lang/lt/test_lemmatizer.py +++ b/spacy/tests/lang/lt/test_lemmatizer.py @@ -17,4 +17,4 @@ TEST_CASES = [ @pytest.mark.parametrize("tokens,lemmas", TEST_CASES) def test_lt_lemmatizer(lt_lemmatizer, tokens, lemmas): - assert lemmas == [lt_lemmatizer.lookup(token) for token in tokens] + assert lemmas == [lt_lemmatizer.lookup_table.get_string(token, token) for token in tokens] diff --git a/spacy/tests/lang/nl/test_lemmatizer.py b/spacy/tests/lang/nl/test_lemmatizer.py index dae9091b7..93dd1e5e3 100644 --- a/spacy/tests/lang/nl/test_lemmatizer.py +++ b/spacy/tests/lang/nl/test_lemmatizer.py @@ -133,11 +133,11 @@ def test_nl_lemmatizer_pronoun_lemmas(nl_lemmatizer, text, lemma): # Using the lemma lookup table only @pytest.mark.parametrize("text,lemma", noun_irreg_lemmatization_cases) def test_nl_lemmatizer_lookup_noun(nl_lemmatizer, text, lemma): - lemma_pred = nl_lemmatizer.lookup(text) + lemma_pred = nl_lemmatizer.lookup(None, text) assert lemma_pred in (lemma, text) @pytest.mark.parametrize("text,lemma", verb_irreg_lemmatization_cases) def test_nl_lemmatizer_lookup_verb(nl_lemmatizer, text, lemma): - lemma_pred = nl_lemmatizer.lookup(text) + lemma_pred = nl_lemmatizer.lookup(None, text) assert lemma_pred in (lemma, text) diff --git a/spacy/tests/vocab_vectors/test_lookups.py b/spacy/tests/vocab_vectors/test_lookups.py index 16ffe83fc..7cdf8ff68 100644 --- a/spacy/tests/vocab_vectors/test_lookups.py +++ b/spacy/tests/vocab_vectors/test_lookups.py @@ -19,9 +19,9 @@ def test_lookups_api(): table = lookups.get_table(table_name) assert table.name == table_name assert len(table) == 2 - assert table.get("hello") == "world" - table.set("a", "b") - assert table.get("a") == "b" + assert table.get_string("hello") == "world" + table.set_string("a", "b") + assert table.get_string("a") == "b" table = lookups.get_table(table_name) assert len(table) == 3 with pytest.raises(KeyError): @@ -50,10 +50,10 @@ def test_lookups_to_from_bytes(): assert "table2" in new_lookups table1 = new_lookups.get_table("table1") assert len(table1) == 2 - assert table1.get("foo") == "bar" + 
assert table1.get_string("foo") == "bar" table2 = new_lookups.get_table("table2") assert len(table2) == 3 - assert table2.get("b") == 2 + assert table2.get_string("b") == 2 assert new_lookups.to_bytes() == lookups_bytes @@ -72,10 +72,11 @@ def test_lookups_to_from_disk(): assert "table2" in new_lookups table1 = new_lookups.get_table("table1") assert len(table1) == 2 - assert table1.get("foo") == "bar" + assert table1.get_string("foo") == "bar" table2 = new_lookups.get_table("table2") assert len(table2) == 3 - assert table2.get("b") == 2 + assert table2.get_string("b") == 2 + # This fails on Python 3.5 @@ -93,10 +94,9 @@ def test_lookups_to_from_bytes_via_vocab(): assert table_name in new_vocab.lookups table = new_vocab.lookups.get_table(table_name) assert len(table) == 2 - assert table.get("hello") == "world" + assert table.get_string("hello") == "world" assert new_vocab.to_bytes() == vocab_bytes - # This fails on Python 3.5 @pytest.mark.xfail def test_lookups_to_from_disk_via_vocab(): @@ -113,4 +113,4 @@ def test_lookups_to_from_disk_via_vocab(): assert table_name in new_vocab.lookups table = new_vocab.lookups.get_table(table_name) assert len(table) == 2 - assert table.get("hello") == "world" + assert table.get_string("hello") == "world" diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index c7a44f5ca..dfe42d2bd 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -335,7 +335,7 @@ cdef class Token: """ def __get__(self): if self.c.lemma == 0: - lemma_ = self.vocab.morphology.lemmatizer.lookup(self.orth_) + lemma_ = self.vocab.morphology.lemmatizer.lookup(self.orth, self.orth_) return self.vocab.strings[lemma_] else: return self.c.lemma @@ -862,7 +862,7 @@ cdef class Token: """ def __get__(self): if self.c.lemma == 0: - return self.vocab.morphology.lemmatizer.lookup(self.orth_) + return self.vocab.morphology.lemmatizer.lookup(self.orth, self.orth_) else: return self.vocab.strings[self.c.lemma] diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 7e360d409..021da02fc 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -18,10 +18,10 @@ from .structs cimport SerializedLexemeC from .compat import copy_reg, basestring_ from .errors import Errors from .lemmatizer import Lemmatizer -from .lookups import Lookups from .attrs import intify_attrs, NORM from .vectors import Vectors from ._ml import link_vectors_to_models +from .lookups import Lookups from . 
import util From 03809b82b77819354096c85e00ee2ebf4bb7c21c Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Thu, 12 Sep 2019 18:01:46 +0200 Subject: [PATCH 153/207] Support label schemes in model directory --- website/src/components/table.js | 5 ++-- website/src/styles/grid.module.sass | 2 +- website/src/templates/models.js | 46 ++++++++++++++++++++++++++++- 3 files changed, 49 insertions(+), 4 deletions(-) diff --git a/website/src/components/table.js b/website/src/components/table.js index 3c345b046..78646f4c3 100644 --- a/website/src/components/table.js +++ b/website/src/components/table.js @@ -45,9 +45,10 @@ function isFootRow(children) { export const Table = props => <table className={classes.root} {...props} /> export const Th = props => <th className={classes.th} {...props} /> -export const Tr = ({ children, ...props }) => { +export const Tr = ({ evenodd = true, children, ...props }) => { const foot = isFootRow(children) - const trClasssNames = classNames(classes.tr, { + const trClasssNames = classNames({ + [classes.tr]: evenodd, [classes.footer]: foot, 'table-footer': foot, }) diff --git a/website/src/styles/grid.module.sass b/website/src/styles/grid.module.sass index 63ea3d160..482ad03cf 100644 --- a/website/src/styles/grid.module.sass +++ b/website/src/styles/grid.module.sass @@ -37,5 +37,5 @@ $flex-gap: 2rem .narrow grid-column-gap: $grid-gap-narrow -.spacing +.spacing:not(:empty) margin-bottom: var(--spacing-md) diff --git a/website/src/templates/models.js b/website/src/templates/models.js index 4713f4b34..29278e919 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -14,6 +14,7 @@ import Icon from '../components/icon' import Link from '../components/link' import Grid from '../components/grid' import Infobox from '../components/infobox' +import Accordion from '../components/accordion' import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util' const MODEL_META = { @@ -43,6 +44,12 @@ const MODEL_META = { compat: 'Latest compatible model version for your spaCy installation', } +const LABEL_SCHEME_META = { + tagger: 'Part-of-speech tags via Token.tag_', + parser: 'Dependency labels via Token.dep_', + ner: 'Named entity labels', +} + const MARKDOWN_COMPONENTS = { code: InlineCode, } @@ -140,6 +147,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl const licenseUrl = licenses[meta.license] ? licenses[meta.license].url : null const license = licenseUrl ? <Link to={licenseUrl}>{meta.license}</Link> : meta.license const hasInteractiveCode = size === 'sm' && hasExamples && !isError + const labels = meta.labels const rows = [ { label: 'Language', tag: langId, content: langName }, @@ -218,7 +226,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl )} </tbody> </Table> - <Grid cols={2} gutterBottom={hasInteractiveCode}> + <Grid cols={2} gutterBottom={hasInteractiveCode || labels}> {accuracy && accuracy.map(({ label, items }, i) => !items ? null : ( @@ -260,6 +268,42 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl ].join('\n')} </CodeBlock> )} + {labels && ( + <Accordion title="Label Scheme"> + <p> + The statistical components included in this model package assign the + following labels. The labels are specific to the corpus that the model was + trained on. To see the description of a label, you can use{' '} + <Link to="/api/top-level#spacy.explain"> + <InlineCode>spacy.explain</InlineCode> + </Link> + . 
+ </p> + <Table> + {Object.keys(labels).map(pipe => { + const labelNames = labels[pipe] || [] + const help = LABEL_SCHEME_META[pipe] + return ( + <Tr key={pipe} evenodd={false}> + <Td nowrap> + <Label> + {pipe} {help && <Help>{help}</Help>} + </Label> + </Td> + <Td> + {labelNames.map((label, i) => ( + <> + {i > 0 && ', '} + <InlineCode key={label}>{label}</InlineCode> + </> + ))} + </Td> + </Tr> + ) + })} + </Table> + </Accordion> + )} </Section> ) } From 29a9e636eb8b0f22f91eee85b4a10b8cdade4ed2 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann <polm@dampfkraft.com> Date: Fri, 13 Sep 2019 23:28:12 +0900 Subject: [PATCH 154/207] Fix half-width space handling in JA (#4284) (closes #4262) Before this patch, half-width spaces between words were simply lost in Japanese text. This wasn't immediately noticeable because much Japanese text never uses spaces at all. --- spacy/lang/ja/__init__.py | 23 ++++++++++++++++++++--- spacy/lang/ja/tag_map.py | 4 +++- spacy/tests/lang/ja/test_tokenizer.py | 6 ++++++ 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 3a6074bba..791b1ec33 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -37,6 +37,11 @@ def resolve_pos(token): in the sentence. This function adds information to the POS tag to resolve ambiguous mappings. """ + + # this is only used for consecutive ascii spaces + if token.pos == '空白': + return '空白' + # TODO: This is a first take. The rules here are crude approximations. # For many of these, full dependencies are needed to properly resolve # PoS mappings. @@ -54,6 +59,7 @@ def detailed_tokens(tokenizer, text): node = tokenizer.parseToNode(text) node = node.next # first node is beginning of sentence and empty, skip it words = [] + spaces = [] while node.posid != 0: surface = node.surface base = surface # a default value. Updated if available later. @@ -64,8 +70,20 @@ def detailed_tokens(tokenizer, text): # dictionary base = parts[7] words.append(ShortUnitWord(surface, base, pos)) + + # The way MeCab stores spaces is that the rlength of the next token is + # the length of that token plus any preceding whitespace, **in bytes**. + # also note that this is only for half-width / ascii spaces. Full width + # spaces just become tokens. 
+ scount = node.next.rlength - node.next.length + spaces.append(bool(scount)) + while scount > 1: + words.append(ShortUnitWord(' ', ' ', '空白')) + spaces.append(False) + scount -= 1 + node = node.next - return words + return words, spaces class JapaneseTokenizer(DummyTokenizer): @@ -75,9 +93,8 @@ class JapaneseTokenizer(DummyTokenizer): self.tokenizer.parseToNode("") # see #2901 def __call__(self, text): - dtokens = detailed_tokens(self.tokenizer, text) + dtokens, spaces = detailed_tokens(self.tokenizer, text) words = [x.surface for x in dtokens] - spaces = [False] * len(words) doc = Doc(self.vocab, words=words, spaces=spaces) mecab_tags = [] for token, dtoken in zip(doc, dtokens): diff --git a/spacy/lang/ja/tag_map.py b/spacy/lang/ja/tag_map.py index 6b114eb10..4ff0a35ee 100644 --- a/spacy/lang/ja/tag_map.py +++ b/spacy/lang/ja/tag_map.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from ...symbols import POS, PUNCT, INTJ, X, ADJ, AUX, ADP, PART, SCONJ, NOUN -from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET +from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET, SPACE TAG_MAP = { @@ -21,6 +21,8 @@ TAG_MAP = { "感動詞,一般,*,*": {POS: INTJ}, # this is specifically for unicode full-width space "空白,*,*,*": {POS: X}, + # This is used when sequential half-width spaces are present + "空白": {POS: SPACE}, "形状詞,一般,*,*": {POS: ADJ}, "形状詞,タリ,*,*": {POS: ADJ}, "形状詞,助動詞語幹,*,*": {POS: ADJ}, diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index c95e7bc40..38ca37bc9 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -47,3 +47,9 @@ def test_ja_tokenizer_tags(ja_tokenizer, text, expected_tags): def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos): pos = [token.pos_ for token in ja_tokenizer(text)] assert pos == expected_pos + +def test_extra_spaces(ja_tokenizer): + # note: three spaces after "I" + tokens = ja_tokenizer("I like cheese.") + assert tokens[1].orth_ == ' ' + assert tokens[2].orth_ == ' ' From bcbb9f511934a8078d78fd03386dd11a1aac56d1 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sat, 14 Sep 2019 12:57:45 +0200 Subject: [PATCH 155/207] Update README.md [ci skip] --- README.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 27a49f465..aa2d059c1 100644 --- a/README.md +++ b/README.md @@ -49,9 +49,12 @@ It's commercial open-source software, released under the MIT license. ## 💬 Where to ask questions The spaCy project is maintained by [@honnibal](https://github.com/honnibal) -and [@ines](https://github.com/ines). Please understand that we won't be able -to provide individual support via email. We also believe that help is much more -valuable if it's shared publicly, so that more people can benefit from it. +and [@ines](https://github.com/ines), along with core contributors +[@svlandeg](https://github.com/svlandeg) and +[@adrianeboyd](https://github.com/adrianeboyd). Please understand that we won't +be able to provide individual support via email. We also believe that help is +much more valuable if it's shared publicly, so that more people can benefit +from it. 
| Type | Platforms | | ------------------------ | ------------------------------------------------------ | From 3126dd0904018672113c1167605d39cffa007b11 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sat, 14 Sep 2019 12:58:06 +0200 Subject: [PATCH 156/207] Tidy up and auto-format [ci skip] --- spacy/lang/ja/__init__.py | 6 +++--- spacy/tests/lang/ja/test_tokenizer.py | 5 +++-- spacy/tests/lang/lt/test_lemmatizer.py | 4 +++- spacy/tests/vocab_vectors/test_lookups.py | 2 +- website/docs/api/annotation.md | 4 ++-- 5 files changed, 12 insertions(+), 9 deletions(-) diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 791b1ec33..056a6893b 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -39,8 +39,8 @@ def resolve_pos(token): """ # this is only used for consecutive ascii spaces - if token.pos == '空白': - return '空白' + if token.pos == "空白": + return "空白" # TODO: This is a first take. The rules here are crude approximations. # For many of these, full dependencies are needed to properly resolve @@ -78,7 +78,7 @@ def detailed_tokens(tokenizer, text): scount = node.next.rlength - node.next.length spaces.append(bool(scount)) while scount > 1: - words.append(ShortUnitWord(' ', ' ', '空白')) + words.append(ShortUnitWord(" ", " ", "空白")) spaces.append(False) scount -= 1 diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index 38ca37bc9..ad8bfaa00 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -48,8 +48,9 @@ def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos): pos = [token.pos_ for token in ja_tokenizer(text)] assert pos == expected_pos + def test_extra_spaces(ja_tokenizer): # note: three spaces after "I" tokens = ja_tokenizer("I like cheese.") - assert tokens[1].orth_ == ' ' - assert tokens[2].orth_ == ' ' + assert tokens[1].orth_ == " " + assert tokens[2].orth_ == " " diff --git a/spacy/tests/lang/lt/test_lemmatizer.py b/spacy/tests/lang/lt/test_lemmatizer.py index 5c3ed34f8..b98d63935 100644 --- a/spacy/tests/lang/lt/test_lemmatizer.py +++ b/spacy/tests/lang/lt/test_lemmatizer.py @@ -17,4 +17,6 @@ TEST_CASES = [ @pytest.mark.parametrize("tokens,lemmas", TEST_CASES) def test_lt_lemmatizer(lt_lemmatizer, tokens, lemmas): - assert lemmas == [lt_lemmatizer.lookup_table.get_string(token, token) for token in tokens] + assert lemmas == [ + lt_lemmatizer.lookup_table.get_string(token, token) for token in tokens + ] diff --git a/spacy/tests/vocab_vectors/test_lookups.py b/spacy/tests/vocab_vectors/test_lookups.py index 7cdf8ff68..daab5e585 100644 --- a/spacy/tests/vocab_vectors/test_lookups.py +++ b/spacy/tests/vocab_vectors/test_lookups.py @@ -78,7 +78,6 @@ def test_lookups_to_from_disk(): assert table2.get_string("b") == 2 - # This fails on Python 3.5 @pytest.mark.xfail def test_lookups_to_from_bytes_via_vocab(): @@ -97,6 +96,7 @@ def test_lookups_to_from_bytes_via_vocab(): assert table.get_string("hello") == "world" assert new_vocab.to_bytes() == vocab_bytes + # This fails on Python 3.5 @pytest.mark.xfail def test_lookups_to_from_disk_via_vocab(): diff --git a/website/docs/api/annotation.md b/website/docs/api/annotation.md index 2c52d197a..f44019752 100644 --- a/website/docs/api/annotation.md +++ b/website/docs/api/annotation.md @@ -80,8 +80,8 @@ training corpus and can be defined in the respective language data's <Accordion title="Universal Part-of-speech Tags" id="pos-universal"> -spaCy also maps all language-specific part-of-speech tags to a small, 
fixed set -of word type tags following the +spaCy maps all language-specific part-of-speech tags to a small, fixed set of +word type tags following the [Universal Dependencies scheme](http://universaldependencies.org/u/pos/). The universal tags don't code for any morphological features and only cover the word type. They're available as the [`Token.pos`](/api/token#attributes) and From bbf7337eafae2a9b75d3de45cf5167de07f07753 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sat, 14 Sep 2019 15:32:15 +0200 Subject: [PATCH 157/207] Update adding languages docs [ci skip] --- website/docs/usage/adding-languages.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/website/docs/usage/adding-languages.md b/website/docs/usage/adding-languages.md index d89891297..94d75ea31 100644 --- a/website/docs/usage/adding-languages.md +++ b/website/docs/usage/adding-languages.md @@ -448,6 +448,18 @@ resources = { } ``` +> #### Lookups example +> +> ```python +> table = nlp.vocab.lookups.get_table("my_table") +> value = table.get("some_key") +> ``` + +If your language needs other large dictionaries and resources, you can also add +those files here. The data will become available via a [`Lookups`](/api/lookups) +table in `nlp.vocab.lookups`, and you'll be able to access it from the tokenizer +or a custom pipeline component (via `doc.vocab.lookups`). + ### Tag map {#tag-map} Most treebanks define a custom part-of-speech tag scheme, striking a balance From 5c8b5e68ec6fccb221be29402e033abbc3201b8d Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sat, 14 Sep 2019 16:23:37 +0200 Subject: [PATCH 158/207] Fix docs consistency [ci skip] --- website/docs/api/language.md | 1 + website/docs/usage/linguistic-features.md | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 254ad8fb1..c44339ff5 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -140,6 +140,7 @@ Evaluate a model's pipeline components. | `batch_size` | int | The batch size to use. | | `scorer` | `Scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. | | `component_cfg` <Tag variant="new">2.1</Tag> | dict | Config parameters for specific pipeline components, keyed by component name. | +| **RETURNS** | Scorer | The scorer containing the evaluation scores. | ## Language.begin_training {#begin_training tag="method"} diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 7549a3985..4128fa73f 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -576,11 +576,11 @@ import DisplacyEntHtml from 'images/displacy-ent2.html' ## Entity Linking {#entity-linking} -To ground the named entities into the "real-world", spaCy provides functionality +To ground the named entities into the "real world", spaCy provides functionality to perform entity linking, which resolves a textual entity to a unique -identifier from a knowledge base (KB). - -The default model assigns WikiData identifiers, but you can create your own +identifier from a knowledge base (KB). The +[processing scripts](https://github.com/explosion/spaCy/tree/master/bin/wiki_entity_linking) +we provide use WikiData identifiers, but you can create your own [`KnowledgeBase`](/api/kb) and [train a new Entity Linking model](/usage/training#entity-linker) using that custom-made KB. 
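As a point of reference for the entity linking docs touched above, here is a minimal sketch of what building a custom `KnowledgeBase` might look like with the API those pages reference. The entity IDs, frequencies and vectors are made-up illustration values, and the keyword arguments are assumed from the `KnowledgeBase` API as it stood around these patches.

```python
import spacy
from spacy.kb import KnowledgeBase

nlp = spacy.load("en_core_web_sm")
# Each entity gets a dense vector; the length is fixed when the KB is created.
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
# Register entities by their KB identifier (e.g. a WikiData QID).
kb.add_entity(entity="Q1004791", freq=6, entity_vector=[0.0, 3.0, 5.0])
kb.add_entity(entity="Q42", freq=342, entity_vector=[1.0, 9.0, -3.0])
# Aliases map surface forms to candidate entities with prior probabilities.
kb.add_alias(alias="Douglas", entities=["Q1004791", "Q42"], probabilities=[0.6, 0.3])
candidates = kb.get_candidates("Douglas")
print([(c.entity_, c.prior_prob) for c in candidates])
```

A KB built this way can then be used to train an entity linking model, as described in the training docs linked above.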
From fe87ccc8d14b895e1e3e6df6a54f031d153237ed Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sat, 14 Sep 2019 16:23:50 +0200 Subject: [PATCH 159/207] Update languages.json [ci skip] --- website/meta/languages.json | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/website/meta/languages.json b/website/meta/languages.json index 9fe9706e4..376851cef 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -65,19 +65,14 @@ "example": "Αυτή είναι μια πρόταση.", "has_examples": true }, - { - "code": "xx", - "name": "Multi-language", - "models": ["xx_ent_wiki_sm"], - "example": "This is a sentence about Facebook." - }, { "code": "sv", "name": "Swedish", "has_examples": true }, { "code": "fi", "name": "Finnish", "has_examples": true }, { "code": "nb", "name": "Norwegian Bokmål", "example": "Dette er en setning.", - "has_examples": true + "has_examples": true, + "models": ["nb_core_news_sm", "nb_core_news_md"] }, { "code": "da", "name": "Danish", "example": "Dette er en sætning.", "has_examples": true }, { "code": "hu", "name": "Hungarian", "example": "Ez egy mondat.", "has_examples": true }, @@ -127,7 +122,7 @@ { "code": "bg", "name": "Bulgarian" }, { "code": "cs", "name": "Czech" }, { "code": "is", "name": "Icelandic" }, - { "code": "lt", "name": "Lithuanian" }, + { "code": "lt", "name": "Lithuanian", "has_examples": true, "models": ["lt_core_news_sm"] }, { "code": "lv", "name": "Latvian" }, { "code": "sr", "name": "Serbian" }, { "code": "sk", "name": "Slovak" }, @@ -182,10 +177,15 @@ "code": "vi", "name": "Vietnamese", "dependencies": [{ "name": "Pyvi", "url": "https://github.com/trungtv/pyvi" }] + }, + { + "code": "xx", + "name": "Multi-language", + "models": ["xx_ent_wiki_sm"], + "example": "This is a sentence about Facebook." } ], "licenses": [ - { "id": "CC BY 4.0", "url": "https://creativecommons.org/licenses/by/4.0/" }, { "id": "CC BY 4.0", "url": "https://creativecommons.org/licenses/by/4.0/" }, { "id": "CC BY-SA", "url": "https://creativecommons.org/licenses/by-sa/3.0/" }, { "id": "CC BY-SA 3.0", "url": "https://creativecommons.org/licenses/by-sa/3.0/" }, From 86befc80bfd0cd2e4ba257168ca4c678a08c11f2 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sat, 14 Sep 2019 16:41:48 +0200 Subject: [PATCH 160/207] WIP: Add v2.2 page [ci skip] --- website/docs/usage/v2-2.md | 292 +++++++++++++++++++++++++++++++++++++ website/meta/sidebars.json | 1 + 2 files changed, 293 insertions(+) create mode 100644 website/docs/usage/v2-2.md diff --git a/website/docs/usage/v2-2.md b/website/docs/usage/v2-2.md new file mode 100644 index 000000000..2109ae812 --- /dev/null +++ b/website/docs/usage/v2-2.md @@ -0,0 +1,292 @@ +--- +title: What's New in v2.2 +teaser: New features, backwards incompatibilities and migration guide +menu: + - ['New Features', 'features'] + - ['Backwards Incompatibilities', 'incompat'] +--- + +## New Features {#features hidden="true"} + +<!-- TODO: summary --> + +### Better pretrained models and more languages {#models} + +> #### Example +> +> ```python +> python -m spacy download nl_core_news_sm +> python -m spacy download nb_core_news_sm +> python -m spacy download nb_core_news_md +> python -m spacy download lt_core_news_sm +> ``` + +The new version also features new and re-trained models for all languages and +resolves a number of data bugs. The [Dutch model](/models/nl) has been retrained +with a new and custom-labelled NER corpus using the same extended label scheme +as the English models. 
It should now produce significantly better NER results +overall. We've also added new core models for [Norwegian](/models/nb) (MIT) and +[Lithuanian](/models/lt) (CC BY-SA). + +<Infobox> + +**Usage:** [Models directory](/models) **Benchmarks: ** +[Release notes](https://github.com/explosion/spaCy/releases/tag/v2.2.0) + +</Infobox> + +### Entity linking API {#entity-linking} + +> #### Example +> +> ```python +> nlp = spacy.load("my_custom_wikidata_model") +> doc = nlp("Ada Lovelace was born in London") +> print([(e.text, e.label_, e.kb_id_) for e in doc.ents]) +> # [('Ada Lovelace', 'PERSON', 'Q7259'), ('London', 'GPE', 'Q84')] +> ``` + +Entity linking lets you ground named entities into the "real world". We're +excited to now provide a built-in API for training entity linking models and +resolving textual entities to unique identifiers from a knowledge base. The +annotated KB identifier is accessible as either a hash value or as a string from +a `Span` or `Token` object. For more details on entity linking in spaCy, check +out +[Sofie's talk](https://www.youtube.com/watch?v=PW3RJM8tDGo&list=PLBmcuObd5An4UC6jvK_-eSl6jCvP1gwXc&index=6) +at spaCy IRL 2019. + +<Infobox> + +**API:** [`EntityLinker`](/api/entitylinker), +[`KnowledgeBase`](/api/knowledgebase) **Code: ** +[`bin/wiki_entity_linking`](https://github.com/explosion/spaCy/tree/master/bin/wiki_entity_linking) +**Usage: ** [Entity linking](/usage/linguistic-features#entity-linking), +[Training an entity linking model](/usage/training#entity-linker) + +</Infobox> + +### Serializable lookup table and dictionary API {#lookups} + +> #### Example +> +> ```python +> data = {"foo": "bar"} +> nlp.vocab.lookups.add_table("my_dict", data) +> +> def custom_component(doc): +> table = doc.vocab.lookups.get_table("my_dict") +> print(table.get("foo")) # look something up +> return doc +> ``` + +The new `Lookups` API lets you add large dictionaries and lookup tables to the +`Vocab` and access them from the tokenizer or custom components and extension +attributes. Internally, the tables use Bloom filters for efficient lookup +checks. They're also fully serializable out-of-the-box. All large data resources +included with spaCy now use this API and are additionally compressed at build +time. This allowed us to make the installed library roughly **15 times smaller +on disk**. + +<Infobox> + +**API:** [`Lookups`](/api/lookups) **Usage: ** +[Adding languages: Lemmatizer](/usage/adding-languages#lemmatizer) + +</Infobox> + +### Text classification scores and CLI training {#train-textcat-cli} + +> #### Example +> +> ```python +> scorer = nlp.evaluate(dev_data) +> print(scorer.textcat_scores, scorer.textcats_per_cat) +> ``` + +When training your models using the `spacy train` command, you can now also +include text categories in the JSON-formatted training data. The `Scorer` and +`nlp.evaluate` now report the text classification scores, calculated as the +F-score on positive label for binary exclusive tasks, the macro-averaged F-score +for 3+ exclusive labels or the macro-averaged AUC ROC score for multilabel +classification. 
+ +<Infobox> + +**API:** [`spacy train`](/api/cli#train), [`Scorer`](/api/scorer), +[`Language.evaluate`](/api/language#evaluate) + +</Infobox> + +### CLI command to debug and validate training data {#debug-data} + +> #### Example +> +> ```bash +> $ python -m spacy debug-data en train.json dev.json +> ``` + +The new `debug-data` command lets you analyze and validate your training and +development data, get useful stats, and find problems like invalid entity +annotations, cyclic dependencies, low data labels and more. If you're training a +model with `spacy train` and the results seem surprising or confusing, +`debug-data` may help you track down the problems and improve your training +data. + +<Accordion title="Example output"> + +``` +=========================== Data format validation =========================== +✔ Corpus is loadable + +=============================== Training stats =============================== +Training pipeline: tagger, parser, ner +Starting with blank model 'en' +18127 training docs +2939 evaluation docs +⚠ 34 training examples also in evaluation data + +============================== Vocab & Vectors ============================== +ℹ 2083156 total words in the data (56962 unique) +⚠ 13020 misaligned tokens in the training data +⚠ 2423 misaligned tokens in the dev data +10 most common words: 'the' (98429), ',' (91756), '.' (87073), 'to' (50058), +'of' (49559), 'and' (44416), 'a' (34010), 'in' (31424), 'that' (22792), 'is' +(18952) +ℹ No word vectors present in the model + +========================== Named Entity Recognition ========================== +ℹ 18 new labels, 0 existing labels +528978 missing values (tokens with '-' label) +New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL' +(10490), 'NORP' (9033), 'MONEY' (5164), 'PERCENT' (3761), 'ORDINAL' (2122), +'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC' +(1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338) +✔ Good amount of examples for all labels +✔ Examples without occurences available for all labels +✔ No entities consisting of or starting/ending with whitespace + +=========================== Part-of-speech Tagging =========================== +ℹ 49 labels in data (57 labels in tag map) +'NN' (266331), 'IN' (227365), 'DT' (185600), 'NNP' (164404), 'JJ' (119830), +'NNS' (110957), '.' (101482), ',' (92476), 'RB' (90090), 'PRP' (90081), 'VB' +(74538), 'VBD' (68199), 'CC' (62862), 'VBZ' (50712), 'VBP' (43420), 'VBN' +(42193), 'CD' (40326), 'VBG' (34764), 'TO' (31085), 'MD' (25863), 'PRP$' +(23335), 'HYPH' (13833), 'POS' (13427), 'UH' (13322), 'WP' (10423), 'WDT' +(9850), 'RP' (8230), 'WRB' (8201), ':' (8168), '''' (7392), '``' (6984), 'NNPS' +(5817), 'JJR' (5689), '$' (3710), 'EX' (3465), 'JJS' (3118), 'RBR' (2872), +'-RRB-' (2825), '-LRB-' (2788), 'PDT' (2078), 'XX' (1316), 'RBS' (1142), 'FW' +(794), 'NFP' (557), 'SYM' (440), 'WP$' (294), 'LS' (293), 'ADD' (191), 'AFX' +(24) +✔ All labels present in tag map for language 'en' + +============================= Dependency Parsing ============================= +ℹ Found 111703 sentences with an average length of 18.6 words. 
+ℹ Found 2251 nonprojective train sentences +ℹ Found 303 nonprojective dev sentences +ℹ 47 labels in train data +ℹ 211 labels in projectivized train data +'punct' (236796), 'prep' (188853), 'pobj' (182533), 'det' (172674), 'nsubj' +(169481), 'compound' (116142), 'ROOT' (111697), 'amod' (107945), 'dobj' (93540), +'aux' (86802), 'advmod' (86197), 'cc' (62679), 'conj' (59575), 'poss' (36449), +'ccomp' (36343), 'advcl' (29017), 'mark' (27990), 'nummod' (24582), 'relcl' +(21359), 'xcomp' (21081), 'attr' (18347), 'npadvmod' (17740), 'acomp' (17204), +'auxpass' (15639), 'appos' (15368), 'neg' (15266), 'nsubjpass' (13922), 'case' +(13408), 'acl' (12574), 'pcomp' (10340), 'nmod' (9736), 'intj' (9285), 'prt' +(8196), 'quantmod' (7403), 'dep' (4300), 'dative' (4091), 'agent' (3908), 'expl' +(3456), 'parataxis' (3099), 'oprd' (2326), 'predet' (1946), 'csubj' (1494), +'subtok' (1147), 'preconj' (692), 'meta' (469), 'csubjpass' (64), 'iobj' (1) +⚠ Low number of examples for label 'iobj' (1) +⚠ Low number of examples for 130 labels in the projectivized dependency +trees used for training. You may want to projectivize labels such as punct +before training in order to improve parser performance. +⚠ Projectivized labels with low numbers of examples: appos||attr: 12 +advmod||dobj: 13 prep||ccomp: 12 nsubjpass||ccomp: 15 pcomp||prep: 14 +amod||dobj: 9 attr||xcomp: 14 nmod||nsubj: 17 prep||advcl: 2 prep||prep: 5 +nsubj||conj: 12 advcl||advmod: 18 ccomp||advmod: 11 ccomp||pcomp: 5 acl||pobj: +10 npadvmod||acomp: 7 dobj||pcomp: 14 nsubjpass||pcomp: 1 nmod||pobj: 8 +amod||attr: 6 nmod||dobj: 12 aux||conj: 1 neg||conj: 1 dative||xcomp: 11 +pobj||dative: 3 xcomp||acomp: 19 advcl||pobj: 2 nsubj||advcl: 2 csubj||ccomp: 1 +advcl||acl: 1 relcl||nmod: 2 dobj||advcl: 10 advmod||advcl: 3 nmod||nsubjpass: 6 +amod||pobj: 5 cc||neg: 1 attr||ccomp: 16 advcl||xcomp: 3 nmod||attr: 4 +advcl||nsubjpass: 5 advcl||ccomp: 4 ccomp||conj: 1 punct||acl: 1 meta||acl: 1 +parataxis||acl: 1 prep||acl: 1 amod||nsubj: 7 ccomp||ccomp: 3 acomp||xcomp: 5 +dobj||acl: 5 prep||oprd: 6 advmod||acl: 2 dative||advcl: 1 pobj||agent: 5 +xcomp||amod: 1 dep||advcl: 1 prep||amod: 8 relcl||compound: 1 advcl||csubj: 3 +npadvmod||conj: 2 npadvmod||xcomp: 4 advmod||nsubj: 3 ccomp||amod: 7 +advcl||conj: 1 nmod||conj: 2 advmod||nsubjpass: 2 dep||xcomp: 2 appos||ccomp: 1 +advmod||dep: 1 advmod||advmod: 5 aux||xcomp: 8 dep||advmod: 1 dative||ccomp: 2 +prep||dep: 1 conj||conj: 1 dep||ccomp: 4 cc||ROOT: 1 prep||ROOT: 1 nsubj||pcomp: +3 advmod||prep: 2 relcl||dative: 1 acl||conj: 1 advcl||attr: 4 prep||npadvmod: 1 +nsubjpass||xcomp: 1 neg||advmod: 1 xcomp||oprd: 1 advcl||advcl: 1 dobj||dep: 3 +nsubjpass||parataxis: 1 attr||pcomp: 1 ccomp||parataxis: 1 advmod||attr: 1 +nmod||oprd: 1 appos||nmod: 2 advmod||relcl: 1 appos||npadvmod: 1 appos||conj: 1 +prep||expl: 1 nsubjpass||conj: 1 punct||pobj: 1 cc||pobj: 1 conj||pobj: 1 +punct||conj: 1 ccomp||dep: 1 oprd||xcomp: 3 ccomp||xcomp: 1 ccomp||nsubj: 1 +nmod||dep: 1 xcomp||ccomp: 1 acomp||advcl: 1 intj||advmod: 1 advmod||acomp: 2 +relcl||oprd: 1 advmod||prt: 1 advmod||pobj: 1 appos||nummod: 1 relcl||npadvmod: +3 mark||advcl: 1 aux||ccomp: 1 amod||nsubjpass: 1 npadvmod||advmod: 1 conj||dep: +1 nummod||pobj: 1 amod||npadvmod: 1 intj||pobj: 1 nummod||npadvmod: 1 +xcomp||xcomp: 1 aux||dep: 1 advcl||relcl: 1 +⚠ The following labels were found only in the train data: xcomp||amod, +advcl||relcl, prep||nsubjpass, acl||nsubj, nsubjpass||conj, xcomp||oprd, +advmod||conj, advmod||advmod, iobj, advmod||nsubjpass, dobj||conj, 
ccomp||amod, +meta||acl, xcomp||xcomp, prep||attr, prep||ccomp, advcl||acomp, acl||dobj, +advcl||advcl, pobj||agent, prep||advcl, nsubjpass||xcomp, prep||dep, +acomp||xcomp, aux||ccomp, ccomp||dep, conj||dep, relcl||compound, +nsubjpass||ccomp, nmod||dobj, advmod||advcl, advmod||acl, dobj||advcl, +dative||xcomp, prep||nsubj, ccomp||ccomp, nsubj||ccomp, xcomp||acomp, +prep||acomp, dep||advmod, acl||pobj, appos||dobj, npadvmod||acomp, cc||ROOT, +relcl||nsubj, nmod||pobj, acl||nsubjpass, ccomp||advmod, pcomp||prep, +amod||dobj, advmod||attr, advcl||csubj, appos||attr, dobj||pcomp, prep||ROOT, +relcl||pobj, advmod||pobj, amod||nsubj, ccomp||xcomp, prep||oprd, +npadvmod||advmod, appos||nummod, advcl||pobj, neg||advmod, acl||attr, +appos||nsubjpass, csubj||ccomp, amod||nsubjpass, intj||pobj, dep||advcl, +cc||neg, xcomp||ccomp, dative||ccomp, nmod||oprd, pobj||dative, prep||dobj, +dep||ccomp, relcl||attr, ccomp||nsubj, advcl||xcomp, nmod||dep, advcl||advmod, +ccomp||conj, pobj||prep, advmod||acomp, advmod||relcl, attr||pcomp, +ccomp||parataxis, oprd||xcomp, intj||advmod, nmod||nsubjpass, prep||npadvmod, +parataxis||acl, prep||pobj, advcl||dobj, amod||pobj, prep||acl, conj||pobj, +advmod||dep, punct||pobj, ccomp||acomp, acomp||advcl, nummod||npadvmod, +dobj||dep, npadvmod||xcomp, advcl||conj, relcl||npadvmod, punct||acl, +relcl||dobj, dobj||xcomp, nsubjpass||parataxis, dative||advcl, relcl||nmod, +advcl||ccomp, appos||npadvmod, ccomp||pcomp, prep||amod, mark||advcl, +prep||advmod, prep||xcomp, appos||nsubj, attr||ccomp, advmod||prt, dobj||ccomp, +aux||conj, advcl||nsubj, conj||conj, advmod||ccomp, advcl||nsubjpass, +attr||xcomp, nmod||conj, npadvmod||conj, relcl||dative, prep||expl, +nsubjpass||pcomp, advmod||xcomp, advmod||dobj, appos||pobj, nsubj||conj, +relcl||nsubjpass, advcl||attr, appos||ccomp, advmod||prep, prep||conj, +nmod||attr, punct||conj, neg||conj, dep||xcomp, aux||xcomp, dobj||acl, +nummod||pobj, amod||npadvmod, nsubj||pcomp, advcl||acl, appos||nmod, +relcl||oprd, prep||prep, cc||pobj, nmod||nsubj, amod||attr, aux||dep, +appos||conj, advmod||nsubj, nsubj||advcl, acl||conj +To train a parser, your data should include at least 20 instances of each label. +⚠ Multiple root labels (ROOT, nsubj, aux, npadvmod, prep) found in +training data. spaCy's parser uses a single root label ROOT so this distinction +will not be available. + +================================== Summary ================================== +✔ 5 checks passed +⚠ 8 warnings +``` + +</Accordion> + +<Infobox> + +**API:** [`spacy debug-data`](/api/cli#debug-data) + +</Infobox> + +## Backwards incompatibilities {#incompat} + +<Infobox title="Important note on models" variant="warning"> + +If you've been training **your own models**, you'll need to **retrain** them +with the new version. Also don't forget to upgrade all models to the latest +versions. Models for v2.0 or v2.1 aren't compatible with models for v2.2. To +check if all of your models are up to date, you can run the +[`spacy validate`](/api/cli#validate) command. 
+ +</Infobox> + +<!-- TODO: copy from release notes once they're ready --> diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index a05440e5a..7c6affe70 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -9,6 +9,7 @@ { "text": "Models & Languages", "url": "/usage/models" }, { "text": "Facts & Figures", "url": "/usage/facts-figures" }, { "text": "spaCy 101", "url": "/usage/spacy-101" }, + { "text": "New in v2.2", "url": "/usage/v2-2" }, { "text": "New in v2.1", "url": "/usage/v2-1" }, { "text": "New in v2.0", "url": "/usage/v2" } ] From 71948452340bfb9ccaa2cb3d8e12e0fb8ac9f0f9 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sun, 15 Sep 2019 17:00:17 +0200 Subject: [PATCH 161/207] Skip tests properly instead of xfailing them --- spacy/tests/vocab_vectors/test_lookups.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/spacy/tests/vocab_vectors/test_lookups.py b/spacy/tests/vocab_vectors/test_lookups.py index daab5e585..b395438b6 100644 --- a/spacy/tests/vocab_vectors/test_lookups.py +++ b/spacy/tests/vocab_vectors/test_lookups.py @@ -36,8 +36,7 @@ def test_lookups_api(): lookups.get_table(table_name) -# This fails on Python 3.5 -@pytest.mark.xfail +@pytest.mark.xfail(reason="This fails on Python 3.5") def test_lookups_to_from_bytes(): lookups = Lookups() lookups.add_table("table1", {"foo": "bar", "hello": "world"}) @@ -57,8 +56,7 @@ def test_lookups_to_from_bytes(): assert new_lookups.to_bytes() == lookups_bytes -# This fails on Python 3.5 -@pytest.mark.xfail +@pytest.mark.skip(reason="This fails on Python 3.5") def test_lookups_to_from_disk(): lookups = Lookups() lookups.add_table("table1", {"foo": "bar", "hello": "world"}) @@ -78,8 +76,7 @@ def test_lookups_to_from_disk(): assert table2.get_string("b") == 2 -# This fails on Python 3.5 -@pytest.mark.xfail +@pytest.mark.skip(reason="This fails on Python 3.5") def test_lookups_to_from_bytes_via_vocab(): table_name = "test" vocab = Vocab() @@ -97,8 +94,7 @@ def test_lookups_to_from_bytes_via_vocab(): assert new_vocab.to_bytes() == vocab_bytes -# This fails on Python 3.5 -@pytest.mark.xfail +@pytest.mark.skip(reason="This fails on Python 3.5") def test_lookups_to_from_disk_via_vocab(): table_name = "test" vocab = Vocab() From aa3c59a2f3b8d3d272c4a764d4496a79354f31b5 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sun, 15 Sep 2019 17:16:21 +0200 Subject: [PATCH 162/207] Include Norwegian NER entity types in glossary [ci skip] See https://github.com/ltgoslo/norne --- spacy/glossary.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/spacy/glossary.py b/spacy/glossary.py index ff38e7138..52abc7bb5 100644 --- a/spacy/glossary.py +++ b/spacy/glossary.py @@ -307,4 +307,10 @@ GLOSSARY = { # https://pdfs.semanticscholar.org/5744/578cc243d92287f47448870bb426c66cc941.pdf "PER": "Named person or family.", "MISC": "Miscellaneous entities, e.g. events, nationalities, products or works of art", + # https://github.com/ltgoslo/norne + "EVT": "Festivals, cultural events, sports events, weather phenomena, wars, etc.", + "PROD": "Product, i.e. artificially produced entities including speeches, radio shows, programming languages, contracts, laws and ideas", + "DRV": "Words (and phrases?) that are dervied from a name, but not a name in themselves, e.g. 'Oslo-mannen' ('the man from Oslo')", + "GPE_LOC": "Geo-political entity, with a locative sense, e.g. 'John lives in Spain'", + "GPE_ORG": "Geo-political entity, with an organisation sense, e.g. 
'Spain declined to meet with Belgium'", } From 88a9d87f6f5918d50c1bd3974661112068053b9e Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sun, 15 Sep 2019 18:04:44 +0200 Subject: [PATCH 163/207] Fix test --- spacy/tests/vocab_vectors/test_lookups.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/vocab_vectors/test_lookups.py b/spacy/tests/vocab_vectors/test_lookups.py index b395438b6..541e1b63e 100644 --- a/spacy/tests/vocab_vectors/test_lookups.py +++ b/spacy/tests/vocab_vectors/test_lookups.py @@ -36,7 +36,7 @@ def test_lookups_api(): lookups.get_table(table_name) -@pytest.mark.xfail(reason="This fails on Python 3.5") +@pytest.mark.skip(reason="This fails on Python 3.5") def test_lookups_to_from_bytes(): lookups = Lookups() lookups.add_table("table1", {"foo": "bar", "hello": "world"}) From bab9976d9ab15dfef77e4a59f0c715d55cbca069 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sun, 15 Sep 2019 22:08:13 +0200 Subject: [PATCH 164/207] =?UTF-8?q?=F0=9F=92=AB=20=20Adjust=20Table=20API?= =?UTF-8?q?=20and=20add=20docs=20(#4289)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Adjust Table API and add docs * Add attributes and update description [ci skip] * Use strings.get_string_id instead of hash_string * Fix table method calls * Make orth arg in Lemmatizer.lookup optional Fall back to string, which is now handled by Table.__contains__ out-of-the-box * Fix method name * Auto-format --- spacy/lang/el/lemmatizer/__init__.py | 7 +- spacy/lang/fr/lemmatizer/__init__.py | 4 +- spacy/lang/nl/lemmatizer/__init__.py | 12 +-- spacy/lang/ru/lemmatizer.py | 2 +- spacy/lang/uk/lemmatizer.py | 2 +- spacy/lemmatizer.py | 16 ++- spacy/lookups.py | 117 +++++++++++++--------- spacy/morphology.pyx | 4 +- spacy/tests/lang/lt/test_lemmatizer.py | 4 +- spacy/tests/lang/nl/test_lemmatizer.py | 4 +- spacy/tests/vocab_vectors/test_lookups.py | 57 +++++++++-- spacy/tokens/token.pyx | 4 +- website/docs/api/lemmatizer.md | 13 +-- website/docs/api/lookups.md | 73 +++++++++++--- 14 files changed, 215 insertions(+), 104 deletions(-) diff --git a/spacy/lang/el/lemmatizer/__init__.py b/spacy/lang/el/lemmatizer/__init__.py index bc5c00bd8..994bf9c16 100644 --- a/spacy/lang/el/lemmatizer/__init__.py +++ b/spacy/lang/el/lemmatizer/__init__.py @@ -46,9 +46,10 @@ class GreekLemmatizer(object): ) return lemmas - def lookup(self, orth, string): - if orth in self.lookup_table: - return self.lookup_table[orth] + def lookup(self, string, orth=None): + key = orth if orth is not None else string + if key in self.lookup_table: + return self.lookup_table[key] return string diff --git a/spacy/lang/fr/lemmatizer/__init__.py b/spacy/lang/fr/lemmatizer/__init__.py index 879f2c80c..dfd822188 100644 --- a/spacy/lang/fr/lemmatizer/__init__.py +++ b/spacy/lang/fr/lemmatizer/__init__.py @@ -52,7 +52,7 @@ class FrenchLemmatizer(object): elif univ_pos in (SCONJ, "SCONJ", "sconj"): univ_pos = "sconj" else: - return [self.lookup(None, string)] + return [self.lookup(string)] # See Issue #435 for example of where this logic is requied. 
if self.is_base_form(univ_pos, morphology): return list(set([string.lower()])) @@ -114,7 +114,7 @@ class FrenchLemmatizer(object): def punct(self, string, morphology=None): return self(string, "punct", morphology) - def lookup(self, orth, string): + def lookup(self, string, orth=None): if orth is not None and orth in self.lookup_table: return self.lookup_table[orth][0] return string diff --git a/spacy/lang/nl/lemmatizer/__init__.py b/spacy/lang/nl/lemmatizer/__init__.py index db345c088..ee4eaabb3 100644 --- a/spacy/lang/nl/lemmatizer/__init__.py +++ b/spacy/lang/nl/lemmatizer/__init__.py @@ -62,11 +62,11 @@ class DutchLemmatizer(object): # are not lemmatized. They are lowercased, however. return [string] # if string in self.lemma_index.get(univ_pos) - lemma_index = self.index.get_string(univ_pos, {}) + lemma_index = self.index.get(univ_pos, {}) # string is already lemma if string in lemma_index: return [string] - exceptions = self.exc.get_string(univ_pos, {}) + exceptions = self.exc.get(univ_pos, {}) # string is irregular token contained in exceptions index. try: lemma = exceptions[string] @@ -75,12 +75,12 @@ class DutchLemmatizer(object): pass # string corresponds to key in lookup table lookup_table = self.lookup_table - looked_up_lemma = lookup_table.get_string(string) + looked_up_lemma = lookup_table.get(string) if looked_up_lemma and looked_up_lemma in lemma_index: return [looked_up_lemma] forms, is_known = lemmatize( - string, lemma_index, exceptions, self.rules.get_string(univ_pos, []) + string, lemma_index, exceptions, self.rules.get(univ_pos, []) ) # Back-off through remaining return value candidates. @@ -103,12 +103,12 @@ class DutchLemmatizer(object): # Overrides parent method so that a lowercased version of the string is # used to search the lookup table. This is necessary because our lookup # table consists entirely of lowercase keys. 
- def lookup(self, orth, string): + def lookup(self, string, orth=None): string = string.lower() if orth is not None: return self.lookup_table.get(orth, string) else: - return self.lookup_table.get_string(string, string) + return self.lookup_table.get(string, string) def noun(self, string, morphology=None): return self(string, "noun", morphology) diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py index 9fc600eb8..638565b6c 100644 --- a/spacy/lang/ru/lemmatizer.py +++ b/spacy/lang/ru/lemmatizer.py @@ -115,7 +115,7 @@ class RussianLemmatizer(Lemmatizer): def pron(self, string, morphology=None): return self(string, "pron", morphology) - def lookup(self, orth, string): + def lookup(self, string, orth=None): analyses = self._morph.parse(string) if len(analyses) == 1: return analyses[0].normal_form diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py index ea2c32ee3..cf7591ea8 100644 --- a/spacy/lang/uk/lemmatizer.py +++ b/spacy/lang/uk/lemmatizer.py @@ -112,7 +112,7 @@ class UkrainianLemmatizer(Lemmatizer): def pron(self, string, morphology=None): return self(string, "pron", morphology) - def lookup(self, orth, string): + def lookup(self, string, orth=None): analyses = self._morph.parse(string) if len(analyses) == 1: return analyses[0].normal_form diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index cfedd7a9d..26c2227a0 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -93,9 +93,19 @@ class Lemmatizer(object): def punct(self, string, morphology=None): return self(string, "punct", morphology) - def lookup(self, orth, string): - if orth in self.lookup_table: - return self.lookup_table[orth] + def lookup(self, string, orth=None): + """Look up a lemma in the table, if available. If no lemma is found, + the original string is returned. + + string (unicode): The original string. + orth (int): Optional hash of the string to look up. If not set, the + string will be used and hashed. + RETURNS (unicode): The lemma if the string was found, otherwise the + original string. + """ + key = orth if orth is not None else string + if key in self.lookup_table: + return self.lookup_table[key] return string diff --git a/spacy/lookups.py b/spacy/lookups.py index b3b67ae7b..7d100520f 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -3,15 +3,19 @@ from __future__ import unicode_literals import srsly from collections import OrderedDict +from preshed.bloom import BloomFilter from .errors import Errors from .util import SimpleFrozenDict, ensure_path -from .strings import hash_string +from .compat import basestring_ +from .strings import get_string_id -from . import util -import srsly -from preshed.bloom import BloomFilter +def ensure_hash(key): + if isinstance(key, basestring_): + return get_string_id(key) + return key + class Lookups(object): """Container for large lookup tables and dictionaries, e.g. lemmatization @@ -102,7 +106,7 @@ class Lookups(object): """ for key, value in srsly.msgpack_loads(bytes_data).items(): self._tables[key] = Table(key) - self._tables[key].update_raw(value) + self._tables[key].update(value) return self def to_disk(self, path, **kwargs): @@ -137,7 +141,7 @@ class Lookups(object): class Table(OrderedDict): """A table in the lookups. Subclass of builtin dict that implements a - slightly more consistent and unified API. + slightly more consistent and unified API. Includes a Bloom filter to speed up missed lookups. 
""" @@ -163,72 +167,85 @@ class Table(OrderedDict): """ OrderedDict.__init__(self) self.name = name - # assume a default size of 1M items - size = 1E6 - if data and len(data) > 0: - size = len(data) - + # Assume a default size of 1M items + self.default_size = 1e6 + size = len(data) if data and len(data) > 0 else self.default_size self.bloom = BloomFilter.from_error_rate(size) - if data: self.update(data) - def set(self, key, value): - """Set new key/value pair, where key is an integer. Same as - table[key] = value. - """ - self[key] = value - def __setitem__(self, key, value): + """Set new key/value pair. String keys will be hashed. + + key (unicode / int): The key to set. + value: The value to set. + """ + key = ensure_hash(key) OrderedDict.__setitem__(self, key, value) self.bloom.add(key) - def set_string(self, key, value): - """Set new key/value pair, where key is a string to be hashed. - """ - hkey = hash_string(key) - self.set(hkey, value) + def set(self, key, value): + """Set new key/value pair. String keys will be hashed. + Same as table[key] = value. - def update(self, data): - """Add entries in a dict-like to the table, where keys are strings to - be hashed. + key (unicode / int): The key to set. + value: The value to set. """ - for key, val in data.items(): - self.set_string(key, val) + self[key] = value - def update_raw(self, data): - """Add entries in a dict-like to the table, where keys are ints. + def __getitem__(self, key): + """Get the value for a given key. String keys will be hashed. + + key (unicode / int): The key to get. + RETURNS: The value. """ - for key, val in data.items(): - self.set(key, val) + key = ensure_hash(key) + return OrderedDict.__getitem__(self, key) def get(self, key, default=None): + """Get the value for a given key. String keys will be hashed. + + key (unicode / int): The key to get. + default: The default value to return. + RETURNS: The value. + """ + key = ensure_hash(key) return OrderedDict.get(self, key, default) - def get_string(self, key, default=None): - hkey = hash_string(key) - return OrderedDict.get(self, hkey, default) - def __contains__(self, key): + """Check whether a key is in the table. String keys will be hashed. + + key (unicode / int): The key to check. + RETURNS (bool): Whether the key is in the table. + """ + key = ensure_hash(key) # This can give a false positive, so we need to check it after - if key not in self.bloom: + if key not in self.bloom: return False return OrderedDict.__contains__(self, key) - def contains_string(self, key): - hkey = hash_string(key) - return self.__contains__(hkey) - def to_bytes(self): - # TODO: serialize bloom too. For now just reconstruct it. - return srsly.msgpack_dumps({'name': self.name, 'dict': dict(self.items())}) + """Serialize table to a bytestring. - def from_bytes(self, data): - loaded = srsly.msgpack_loads(data) - self.name = loaded['name'] - for key, val in loaded['dict'].items(): - self[key] = val - self.bloom.add(key) + RETURNS (bytes): The serialized table. + """ + data = [ + ("name", self.name), + ("dict", dict(self.items())), + ("bloom", self.bloom.to_bytes()), + ] + return srsly.msgpack_dumps(OrderedDict(data)) + def from_bytes(self, bytes_data): + """Load a table from a bytestring. + + bytes_data (bytes): The data to load. + RETURNS (Table): The loaded table. 
+ """ + loaded = srsly.msgpack_loads(bytes_data) + data = loaded.get("dict", {}) + self.name = loaded["name"] + self.bloom = BloomFilter().from_bytes(loaded["bloom"]) + self.clear() + self.update(data) return self - diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 8cc27fb7d..190ca8d00 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -72,7 +72,7 @@ def _normalize_props(props): for key in FIELDS: if key in props: value = str(props[key]).lower() - # We don't have support for disjunctive int|rel features, so + # We don't have support for disjunctive int|rel features, so # just take the first one :( if "|" in value: value = value.split("|")[0] @@ -273,7 +273,7 @@ cdef class Morphology: """ if token.lemma == 0: orth_str = self.strings[token.lex.orth] - lemma = self.lemmatizer.lookup(token.lex.orth, orth_str) + lemma = self.lemmatizer.lookup(orth_str, orth=token.lex.orth) token.lemma = self.strings.add(lemma) cdef int assign_tag(self, TokenC* token, tag_str) except -1: diff --git a/spacy/tests/lang/lt/test_lemmatizer.py b/spacy/tests/lang/lt/test_lemmatizer.py index b98d63935..f7408fc16 100644 --- a/spacy/tests/lang/lt/test_lemmatizer.py +++ b/spacy/tests/lang/lt/test_lemmatizer.py @@ -17,6 +17,4 @@ TEST_CASES = [ @pytest.mark.parametrize("tokens,lemmas", TEST_CASES) def test_lt_lemmatizer(lt_lemmatizer, tokens, lemmas): - assert lemmas == [ - lt_lemmatizer.lookup_table.get_string(token, token) for token in tokens - ] + assert lemmas == [lt_lemmatizer.lookup_table.get(token, token) for token in tokens] diff --git a/spacy/tests/lang/nl/test_lemmatizer.py b/spacy/tests/lang/nl/test_lemmatizer.py index 93dd1e5e3..dae9091b7 100644 --- a/spacy/tests/lang/nl/test_lemmatizer.py +++ b/spacy/tests/lang/nl/test_lemmatizer.py @@ -133,11 +133,11 @@ def test_nl_lemmatizer_pronoun_lemmas(nl_lemmatizer, text, lemma): # Using the lemma lookup table only @pytest.mark.parametrize("text,lemma", noun_irreg_lemmatization_cases) def test_nl_lemmatizer_lookup_noun(nl_lemmatizer, text, lemma): - lemma_pred = nl_lemmatizer.lookup(None, text) + lemma_pred = nl_lemmatizer.lookup(text) assert lemma_pred in (lemma, text) @pytest.mark.parametrize("text,lemma", verb_irreg_lemmatization_cases) def test_nl_lemmatizer_lookup_verb(nl_lemmatizer, text, lemma): - lemma_pred = nl_lemmatizer.lookup(None, text) + lemma_pred = nl_lemmatizer.lookup(text) assert lemma_pred in (lemma, text) diff --git a/spacy/tests/vocab_vectors/test_lookups.py b/spacy/tests/vocab_vectors/test_lookups.py index 541e1b63e..02f25532a 100644 --- a/spacy/tests/vocab_vectors/test_lookups.py +++ b/spacy/tests/vocab_vectors/test_lookups.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals import pytest -from spacy.lookups import Lookups +from spacy.lookups import Lookups, Table, ensure_hash from spacy.vocab import Vocab from ..util import make_tempdir @@ -19,9 +19,9 @@ def test_lookups_api(): table = lookups.get_table(table_name) assert table.name == table_name assert len(table) == 2 - assert table.get_string("hello") == "world" - table.set_string("a", "b") - assert table.get_string("a") == "b" + assert table["hello"] == "world" + table["a"] = "b" + assert table["a"] == "b" table = lookups.get_table(table_name) assert len(table) == 3 with pytest.raises(KeyError): @@ -36,6 +36,43 @@ def test_lookups_api(): lookups.get_table(table_name) +def test_table_api(): + table = Table(name="table") + assert table.name == "table" + assert len(table) == 0 + assert "abc" not in table + data = {"foo": "bar", "hello": "world"} + table = 
Table(name="table", data=data) + assert len(table) == len(data) + assert "foo" in table + assert ensure_hash("foo") in table + assert table["foo"] == "bar" + assert table[ensure_hash("foo")] == "bar" + assert table.get("foo") == "bar" + assert table.get("abc") is None + table["abc"] = 123 + assert table["abc"] == 123 + assert table[ensure_hash("abc")] == 123 + table.set("def", 456) + assert table["def"] == 456 + assert table[ensure_hash("def")] == 456 + + +def test_table_api_to_from_bytes(): + data = {"foo": "bar", "hello": "world", "abc": 123} + table = Table(name="table", data=data) + table_bytes = table.to_bytes() + new_table = Table().from_bytes(table_bytes) + assert new_table.name == "table" + assert len(new_table) == 3 + assert new_table["foo"] == "bar" + assert new_table[ensure_hash("foo")] == "bar" + new_table2 = Table(data={"def": 456}) + new_table2.from_bytes(table_bytes) + assert len(new_table2) == 3 + assert "def" not in new_table2 + + @pytest.mark.skip(reason="This fails on Python 3.5") def test_lookups_to_from_bytes(): lookups = Lookups() @@ -49,10 +86,10 @@ def test_lookups_to_from_bytes(): assert "table2" in new_lookups table1 = new_lookups.get_table("table1") assert len(table1) == 2 - assert table1.get_string("foo") == "bar" + assert table1["foo"] == "bar" table2 = new_lookups.get_table("table2") assert len(table2) == 3 - assert table2.get_string("b") == 2 + assert table2["b"] == 2 assert new_lookups.to_bytes() == lookups_bytes @@ -70,10 +107,10 @@ def test_lookups_to_from_disk(): assert "table2" in new_lookups table1 = new_lookups.get_table("table1") assert len(table1) == 2 - assert table1.get_string("foo") == "bar" + assert table1["foo"] == "bar" table2 = new_lookups.get_table("table2") assert len(table2) == 3 - assert table2.get_string("b") == 2 + assert table2["b"] == 2 @pytest.mark.skip(reason="This fails on Python 3.5") @@ -90,7 +127,7 @@ def test_lookups_to_from_bytes_via_vocab(): assert table_name in new_vocab.lookups table = new_vocab.lookups.get_table(table_name) assert len(table) == 2 - assert table.get_string("hello") == "world" + assert table["hello"] == "world" assert new_vocab.to_bytes() == vocab_bytes @@ -109,4 +146,4 @@ def test_lookups_to_from_disk_via_vocab(): assert table_name in new_vocab.lookups table = new_vocab.lookups.get_table(table_name) assert len(table) == 2 - assert table.get_string("hello") == "world" + assert table["hello"] == "world" diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index dfe42d2bd..e27b767a7 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -335,7 +335,7 @@ cdef class Token: """ def __get__(self): if self.c.lemma == 0: - lemma_ = self.vocab.morphology.lemmatizer.lookup(self.orth, self.orth_) + lemma_ = self.vocab.morphology.lemmatizer.lookup(self.orth_, orth=self.orth) return self.vocab.strings[lemma_] else: return self.c.lemma @@ -862,7 +862,7 @@ cdef class Token: """ def __get__(self): if self.c.lemma == 0: - return self.vocab.morphology.lemmatizer.lookup(self.orth, self.orth_) + return self.vocab.morphology.lemmatizer.lookup(self.orth_, orth=self.orth) else: return self.vocab.strings[self.c.lemma] diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md index 8b6d9dcf6..805e96b0f 100644 --- a/website/docs/api/lemmatizer.md +++ b/website/docs/api/lemmatizer.md @@ -52,8 +52,8 @@ Lemmatize a string. Look up a lemma in the lookup table, if available. If no lemma is found, the original string is returned. 
Languages can provide a -[lookup table](/usage/adding-languages#lemmatizer) via the `lemma_lookup` -variable, set on the individual `Language` class. +[lookup table](/usage/adding-languages#lemmatizer) via the `resources`, set on +the individual `Language` class. > #### Example > @@ -63,10 +63,11 @@ variable, set on the individual `Language` class. > assert lemmatizer.lookup("going") == "go" > ``` -| Name | Type | Description | -| ----------- | ------- | ----------------------------------------------------------------- | -| `string` | unicode | The string to look up. | -| **RETURNS** | unicode | The lemma if the string was found, otherwise the original string. | +| Name | Type | Description | +| ----------- | ------- | ----------------------------------------------------------------------------------------------------------- | +| `string` | unicode | The string to look up. | +| `orth` | int | Optional hash of the string to look up. If not set, the string will be used and hashed. Defaults to `None`. | +| **RETURNS** | unicode | The lemma if the string was found, otherwise the original string. | ## Lemmatizer.is_base_form {#is_base_form tag="method"} diff --git a/website/docs/api/lookups.md b/website/docs/api/lookups.md index ab65c4a0c..9878546ea 100644 --- a/website/docs/api/lookups.md +++ b/website/docs/api/lookups.md @@ -7,10 +7,11 @@ new: 2.2 --- This class allows convenient accesss to large lookup tables and dictionaries, -e.g. lemmatization data or tokenizer exception lists. Lookups are available via -the [`Vocab`](/api/vocab) as `vocab.lookups`, so they can be accessed before the -pipeline components are applied (e.g. in the tokenizer and lemmatizer), as well -as within the pipeline components via `doc.vocab.lookups`. +e.g. lemmatization data or tokenizer exception lists using Bloom filters. +Lookups are available via the [`Vocab`](/api/vocab) as `vocab.lookups`, so they +can be accessed before the pipeline components are applied (e.g. in the +tokenizer and lemmatizer), as well as within the pipeline components via +`doc.vocab.lookups`. ## Lookups.\_\_init\_\_ {#init tag="method"} @@ -215,8 +216,11 @@ the file doesn't exist. ## Table {#table tag="class, ordererddict"} A table in the lookups. Subclass of `OrderedDict` that implements a slightly -more consistent and unified API. Supports all other methods and attributes of -`OrderedDict` / `dict`, and the customized methods listed here. +more consistent and unified API and includes a Bloom filter to speed up missed +lookups. Supports **all other methods and attributes** of `OrderedDict` / +`dict`, and the customized methods listed here. Methods that get or set keys +accept both integers and strings (which will be hashed before being added to the +table). ### Table.\_\_init\_\_ {#table.init tag="method"} @@ -226,7 +230,10 @@ Initialize a new table. > > ```python > from spacy.lookups import Table -> table = Table(name="some_table") +> data = {"foo": "bar", "baz": 100} +> table = Table(name="some_table", data=data) +> assert "foo" in table +> assert table["foo"] == "bar" > ``` | Name | Type | Description | @@ -252,9 +259,10 @@ Initialize a new table from a dict. | `name` | unicode | Optional table name for reference. | | **RETURNS** | `Table` | The newly constructed object. | -### Table.set {#table.set tag="key"} +### Table.set {#table.set tag="method"} -Set a new key / value pair. Same as `table[key] = value`. +Set a new key / value pair. String keys will be hashed. Same as +`table[key] = value`. 
> #### Example > @@ -265,7 +273,46 @@ Set a new key / value pair. Same as `table[key] = value`. > assert table["foo"] == "bar" > ``` -| Name | Type | Description | -| ------- | ------- | ----------- | -| `key` | unicode | The key. | -| `value` | - | The value. | +| Name | Type | Description | +| ------- | ------------- | ----------- | +| `key` | unicode / int | The key. | +| `value` | - | The value. | + +### Table.to_bytes {#table.to_bytes tag="method"} + +Serialize the table to a bytestring. + +> #### Example +> +> ```python +> table_bytes = table.to_bytes() +> ``` + +| Name | Type | Description | +| ----------- | ----- | --------------------- | +| **RETURNS** | bytes | The serialized table. | + +### Table.from_bytes {#table.from_bytes tag="method"} + +Load a table from a bytestring. + +> #### Example +> +> ```python +> table_bytes = table.to_bytes() +> table = Table() +> table.from_bytes(table_bytes) +> ``` + +| Name | Type | Description | +| ------------ | ------- | ----------------- | +| `bytes_data` | bytes | The data to load. | +| **RETURNS** | `Table` | The loaded table. | + +### Attributes {#table-attributes} + +| Name | Type | Description | +| -------------- | --------------------------- | ----------------------------------------------------- | +| `name` | unicode | Table name. | +| `default_size` | int | Default size of bloom filters if no data is provided. | +| `bloom` | `preshed.bloom.BloomFilter` | The bloom filters. | From b5d999e510ba8e0e1d8f257439354b9ecc6d5ede Mon Sep 17 00:00:00 2001 From: adrianeboyd <adrianeboyd@gmail.com> Date: Sun, 15 Sep 2019 22:31:31 +0200 Subject: [PATCH 165/207] Add textcat to train CLI (#4226) * Add doc.cats to spacy.gold at the paragraph level Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in the spacy JSON training format at the paragraph level. * `spacy.gold.docs_to_json()` writes `docs.cats` * `GoldCorpus` reads in cats in each `GoldParse` * Update instances of gold_tuples to handle cats Update iteration over gold_tuples / gold_parses to handle addition of cats at the paragraph level. * Add textcat to train CLI * Add textcat options to train CLI * Add textcat labels in `TextCategorizer.begin_training()` * Add textcat evaluation to `Scorer`: * For binary exclusive classes with provided label: F1 for label * For 2+ exclusive classes: F1 macro average * For multilabel (not exclusive): ROC AUC macro average (currently relying on sklearn) * Provide user info on textcat evaluation settings, potential incompatibilities * Provide pipeline to Scorer in `Language.evaluate` for textcat config * Customize train CLI output to include only metrics relevant to current pipeline * Add textcat evaluation to evaluate CLI * Fix handling of unset arguments and config params Fix handling of unset arguments and model confiug parameters in Scorer initialization. * Temporarily add sklearn requirement * Remove sklearn version number * Improve Scorer handling of models without textcats * Fixing Scorer handling of models without textcats * Update Scorer output for python 2.7 * Modify inf in Scorer for python 2.7 * Auto-format Also make small adjustments to make auto-formatting with black easier and produce nicer results * Move error message to Errors * Update documentation * Add cats to annotation JSON format [ci skip] * Fix tpl flag and docs [ci skip] * Switch to internal roc_auc_score Switch to internal `roc_auc_score()` adapted from scikit-learn. 
* Add AUCROCScore tests and improve errors/warnings * Add tests for AUCROCScore and roc_auc_score * Add missing error for only positive/negative values * Remove unnecessary warnings and errors * Make reduced roc_auc_score functions private Because most of the checks and warnings have been stripped for the internal functions and access is only intended through `ROCAUCScore`, make the functions for roc_auc_score adapted from scikit-learn private. * Check that data corresponds with multilabel flag Check that the training instances correspond with the multilabel flag, adding the multilabel flag if required. * Add textcat score to early stopping check * Add more checks to debug-data for textcat * Add example training data for textcat * Add more checks to textcat train CLI * Check configuration when extending base model * Fix typos * Update textcat example data * Provide licensing details and licenses for data * Remove two labels with no positive instances from jigsaw-toxic-comment data. Co-authored-by: Ines Montani <ines@ines.io> --- .../training/textcat_example_data/CC0.txt | 121 + .../textcat_example_data/CC_BY-SA-3.0.txt | 359 ++ .../textcat_example_data/CC_BY-SA-4.0.txt | 428 ++ .../training/textcat_example_data/README.md | 34 + .../textcat_example_data/cooking.json | 3487 +++++++++++++++++ .../textcat_example_data/cooking.jsonl | 10 + .../jigsaw-toxic-comment.json | 2987 ++++++++++++++ .../jigsaw-toxic-comment.jsonl | 10 + .../textcatjsonl_to_trainjson.py | 53 + spacy/cli/debug_data.py | 39 +- spacy/cli/evaluate.py | 1 + spacy/cli/train.py | 248 +- spacy/errors.py | 8 + spacy/gold.pyx | 23 +- spacy/language.py | 3 +- spacy/pipeline/pipes.pyx | 5 + spacy/scorer.py | 381 +- spacy/syntax/arc_eager.pyx | 1 + spacy/syntax/ner.pyx | 1 + spacy/syntax/nn_parser.pyx | 1 + spacy/tests/test_gold.py | 29 + spacy/tests/test_scorer.py | 78 +- website/docs/api/annotation.md | 4 + website/docs/api/cli.md | 12 +- website/docs/api/scorer.md | 24 +- 25 files changed, 8253 insertions(+), 94 deletions(-) create mode 100644 examples/training/textcat_example_data/CC0.txt create mode 100644 examples/training/textcat_example_data/CC_BY-SA-3.0.txt create mode 100644 examples/training/textcat_example_data/CC_BY-SA-4.0.txt create mode 100644 examples/training/textcat_example_data/README.md create mode 100644 examples/training/textcat_example_data/cooking.json create mode 100644 examples/training/textcat_example_data/cooking.jsonl create mode 100644 examples/training/textcat_example_data/jigsaw-toxic-comment.json create mode 100644 examples/training/textcat_example_data/jigsaw-toxic-comment.jsonl create mode 100644 examples/training/textcat_example_data/textcatjsonl_to_trainjson.py diff --git a/examples/training/textcat_example_data/CC0.txt b/examples/training/textcat_example_data/CC0.txt new file mode 100644 index 000000000..0e259d42c --- /dev/null +++ b/examples/training/textcat_example_data/CC0.txt @@ -0,0 +1,121 @@ +Creative Commons Legal Code + +CC0 1.0 Universal + + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE + LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN + ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS + INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES + REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS + PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM + THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED + HEREUNDER. 
+ +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer +exclusive Copyright and Related Rights (defined below) upon the creator +and subsequent owner(s) (each and all, an "owner") of an original work of +authorship and/or a database (each, a "Work"). + +Certain owners wish to permanently relinquish those rights to a Work for +the purpose of contributing to a commons of creative, cultural and +scientific works ("Commons") that the public can reliably and without fear +of later claims of infringement build upon, modify, incorporate in other +works, reuse and redistribute as freely as possible in any form whatsoever +and for any purposes, including without limitation commercial purposes. +These owners may contribute to the Commons to promote the ideal of a free +culture and the further production of creative, cultural and scientific +works, or to gain reputation or greater distribution for their Work in +part through the use and efforts of others. + +For these and/or other purposes and motivations, and without any +expectation of additional consideration or compensation, the person +associating CC0 with a Work (the "Affirmer"), to the extent that he or she +is an owner of Copyright and Related Rights in the Work, voluntarily +elects to apply CC0 to the Work and publicly distribute the Work under its +terms, with knowledge of his or her Copyright and Related Rights in the +Work and the meaning and intended legal effect of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may be +protected by copyright and related or neighboring rights ("Copyright and +Related Rights"). Copyright and Related Rights include, but are not +limited to, the following: + + i. the right to reproduce, adapt, distribute, perform, display, + communicate, and translate a Work; + ii. moral rights retained by the original author(s) and/or performer(s); +iii. publicity and privacy rights pertaining to a person's image or + likeness depicted in a Work; + iv. rights protecting against unfair competition in regards to a Work, + subject to the limitations in paragraph 4(a), below; + v. rights protecting the extraction, dissemination, use and reuse of data + in a Work; + vi. database rights (such as those arising under Directive 96/9/EC of the + European Parliament and of the Council of 11 March 1996 on the legal + protection of databases, and under any national implementation + thereof, including any amended or successor version of such + directive); and +vii. other similar, equivalent or corresponding rights throughout the + world based on applicable law or treaty, and any national + implementations thereof. + +2. Waiver. To the greatest extent permitted by, but not in contravention +of, applicable law, Affirmer hereby overtly, fully, permanently, +irrevocably and unconditionally waives, abandons, and surrenders all of +Affirmer's Copyright and Related Rights and associated claims and causes +of action, whether now known or unknown (including existing as well as +future claims and causes of action), in the Work (i) in all territories +worldwide, (ii) for the maximum duration provided by applicable law or +treaty (including future time extensions), (iii) in any current or future +medium and for any number of copies, and (iv) for any purpose whatsoever, +including without limitation commercial, advertising or promotional +purposes (the "Waiver"). 
Affirmer makes the Waiver for the benefit of each +member of the public at large and to the detriment of Affirmer's heirs and +successors, fully intending that such Waiver shall not be subject to +revocation, rescission, cancellation, termination, or any other legal or +equitable action to disrupt the quiet enjoyment of the Work by the public +as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. Should any part of the Waiver for any reason +be judged legally invalid or ineffective under applicable law, then the +Waiver shall be preserved to the maximum extent permitted taking into +account Affirmer's express Statement of Purpose. In addition, to the +extent the Waiver is so judged Affirmer hereby grants to each affected +person a royalty-free, non transferable, non sublicensable, non exclusive, +irrevocable and unconditional license to exercise Affirmer's Copyright and +Related Rights in the Work (i) in all territories worldwide, (ii) for the +maximum duration provided by applicable law or treaty (including future +time extensions), (iii) in any current or future medium and for any number +of copies, and (iv) for any purpose whatsoever, including without +limitation commercial, advertising or promotional purposes (the +"License"). The License shall be deemed effective as of the date CC0 was +applied by Affirmer to the Work. Should any part of the License for any +reason be judged legally invalid or ineffective under applicable law, such +partial invalidity or ineffectiveness shall not invalidate the remainder +of the License, and in such case Affirmer hereby affirms that he or she +will not (i) exercise any of his or her remaining Copyright and Related +Rights in the Work or (ii) assert any associated claims and causes of +action with respect to the Work, in either case contrary to Affirmer's +express Statement of Purpose. + +4. Limitations and Disclaimers. + + a. No trademark or patent rights held by Affirmer are waived, abandoned, + surrendered, licensed or otherwise affected by this document. + b. Affirmer offers the Work as-is and makes no representations or + warranties of any kind concerning the Work, express, implied, + statutory or otherwise, including without limitation warranties of + title, merchantability, fitness for a particular purpose, non + infringement, or the absence of latent or other defects, accuracy, or + the present or absence of errors, whether or not discoverable, all to + the greatest extent permissible under applicable law. + c. Affirmer disclaims responsibility for clearing rights of other persons + that may apply to the Work or any use thereof, including without + limitation any person's Copyright and Related Rights in the Work. + Further, Affirmer disclaims responsibility for obtaining any necessary + consents, permissions or other rights required for any use of the + Work. + d. Affirmer understands and acknowledges that Creative Commons is not a + party to this document and has no duty or obligation with respect to + this CC0 or use of the Work. diff --git a/examples/training/textcat_example_data/CC_BY-SA-3.0.txt b/examples/training/textcat_example_data/CC_BY-SA-3.0.txt new file mode 100644 index 000000000..604209a80 --- /dev/null +++ b/examples/training/textcat_example_data/CC_BY-SA-3.0.txt @@ -0,0 +1,359 @@ +Creative Commons Legal Code + +Attribution-ShareAlike 3.0 Unported + + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE + LEGAL SERVICES. 
DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN + ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS + INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES + REGARDING THE INFORMATION PROVIDED, AND DISCLAIMS LIABILITY FOR + DAMAGES RESULTING FROM ITS USE. + +License + +THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE +COMMONS PUBLIC LICENSE ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY +COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS +AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS PROHIBITED. + +BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE +TO BE BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY +BE CONSIDERED TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS +CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND +CONDITIONS. + +1. Definitions + + a. "Adaptation" means a work based upon the Work, or upon the Work and + other pre-existing works, such as a translation, adaptation, + derivative work, arrangement of music or other alterations of a + literary or artistic work, or phonogram or performance and includes + cinematographic adaptations or any other form in which the Work may be + recast, transformed, or adapted including in any form recognizably + derived from the original, except that a work that constitutes a + Collection will not be considered an Adaptation for the purpose of + this License. For the avoidance of doubt, where the Work is a musical + work, performance or phonogram, the synchronization of the Work in + timed-relation with a moving image ("synching") will be considered an + Adaptation for the purpose of this License. + b. "Collection" means a collection of literary or artistic works, such as + encyclopedias and anthologies, or performances, phonograms or + broadcasts, or other works or subject matter other than works listed + in Section 1(f) below, which, by reason of the selection and + arrangement of their contents, constitute intellectual creations, in + which the Work is included in its entirety in unmodified form along + with one or more other contributions, each constituting separate and + independent works in themselves, which together are assembled into a + collective whole. A work that constitutes a Collection will not be + considered an Adaptation (as defined below) for the purposes of this + License. + c. "Creative Commons Compatible License" means a license that is listed + at https://creativecommons.org/compatiblelicenses that has been + approved by Creative Commons as being essentially equivalent to this + License, including, at a minimum, because that license: (i) contains + terms that have the same purpose, meaning and effect as the License + Elements of this License; and, (ii) explicitly permits the relicensing + of adaptations of works made available under that license under this + License or a Creative Commons jurisdiction license with the same + License Elements as this License. + d. "Distribute" means to make available to the public the original and + copies of the Work or Adaptation, as appropriate, through sale or + other transfer of ownership. + e. "License Elements" means the following high-level license attributes + as selected by Licensor and indicated in the title of this License: + Attribution, ShareAlike. + f. "Licensor" means the individual, individuals, entity or entities that + offer(s) the Work under the terms of this License. + g. 
"Original Author" means, in the case of a literary or artistic work, + the individual, individuals, entity or entities who created the Work + or if no individual or entity can be identified, the publisher; and in + addition (i) in the case of a performance the actors, singers, + musicians, dancers, and other persons who act, sing, deliver, declaim, + play in, interpret or otherwise perform literary or artistic works or + expressions of folklore; (ii) in the case of a phonogram the producer + being the person or legal entity who first fixes the sounds of a + performance or other sounds; and, (iii) in the case of broadcasts, the + organization that transmits the broadcast. + h. "Work" means the literary and/or artistic work offered under the terms + of this License including without limitation any production in the + literary, scientific and artistic domain, whatever may be the mode or + form of its expression including digital form, such as a book, + pamphlet and other writing; a lecture, address, sermon or other work + of the same nature; a dramatic or dramatico-musical work; a + choreographic work or entertainment in dumb show; a musical + composition with or without words; a cinematographic work to which are + assimilated works expressed by a process analogous to cinematography; + a work of drawing, painting, architecture, sculpture, engraving or + lithography; a photographic work to which are assimilated works + expressed by a process analogous to photography; a work of applied + art; an illustration, map, plan, sketch or three-dimensional work + relative to geography, topography, architecture or science; a + performance; a broadcast; a phonogram; a compilation of data to the + extent it is protected as a copyrightable work; or a work performed by + a variety or circus performer to the extent it is not otherwise + considered a literary or artistic work. + i. "You" means an individual or entity exercising rights under this + License who has not previously violated the terms of this License with + respect to the Work, or who has received express permission from the + Licensor to exercise rights under this License despite a previous + violation. + j. "Publicly Perform" means to perform public recitations of the Work and + to communicate to the public those public recitations, by any means or + process, including by wire or wireless means or public digital + performances; to make available to the public Works in such a way that + members of the public may access these Works from a place and at a + place individually chosen by them; to perform the Work to the public + by any means or process and the communication to the public of the + performances of the Work, including by public digital performance; to + broadcast and rebroadcast the Work by any means including signs, + sounds or images. + k. "Reproduce" means to make copies of the Work by any means including + without limitation by sound or visual recordings and the right of + fixation and reproducing fixations of the Work, including storage of a + protected performance or phonogram in digital form or other electronic + medium. + +2. Fair Dealing Rights. Nothing in this License is intended to reduce, +limit, or restrict any uses free from copyright or rights arising from +limitations or exceptions that are provided for in connection with the +copyright protection under copyright law or other applicable laws. + +3. License Grant. 
Subject to the terms and conditions of this License, +Licensor hereby grants You a worldwide, royalty-free, non-exclusive, +perpetual (for the duration of the applicable copyright) license to +exercise the rights in the Work as stated below: + + a. to Reproduce the Work, to incorporate the Work into one or more + Collections, and to Reproduce the Work as incorporated in the + Collections; + b. to create and Reproduce Adaptations provided that any such Adaptation, + including any translation in any medium, takes reasonable steps to + clearly label, demarcate or otherwise identify that changes were made + to the original Work. For example, a translation could be marked "The + original work was translated from English to Spanish," or a + modification could indicate "The original work has been modified."; + c. to Distribute and Publicly Perform the Work including as incorporated + in Collections; and, + d. to Distribute and Publicly Perform Adaptations. + e. For the avoidance of doubt: + + i. Non-waivable Compulsory License Schemes. In those jurisdictions in + which the right to collect royalties through any statutory or + compulsory licensing scheme cannot be waived, the Licensor + reserves the exclusive right to collect such royalties for any + exercise by You of the rights granted under this License; + ii. Waivable Compulsory License Schemes. In those jurisdictions in + which the right to collect royalties through any statutory or + compulsory licensing scheme can be waived, the Licensor waives the + exclusive right to collect such royalties for any exercise by You + of the rights granted under this License; and, + iii. Voluntary License Schemes. The Licensor waives the right to + collect royalties, whether individually or, in the event that the + Licensor is a member of a collecting society that administers + voluntary licensing schemes, via that society, from any exercise + by You of the rights granted under this License. + +The above rights may be exercised in all media and formats whether now +known or hereafter devised. The above rights include the right to make +such modifications as are technically necessary to exercise the rights in +other media and formats. Subject to Section 8(f), all rights not expressly +granted by Licensor are hereby reserved. + +4. Restrictions. The license granted in Section 3 above is expressly made +subject to and limited by the following restrictions: + + a. You may Distribute or Publicly Perform the Work only under the terms + of this License. You must include a copy of, or the Uniform Resource + Identifier (URI) for, this License with every copy of the Work You + Distribute or Publicly Perform. You may not offer or impose any terms + on the Work that restrict the terms of this License or the ability of + the recipient of the Work to exercise the rights granted to that + recipient under the terms of the License. You may not sublicense the + Work. You must keep intact all notices that refer to this License and + to the disclaimer of warranties with every copy of the Work You + Distribute or Publicly Perform. When You Distribute or Publicly + Perform the Work, You may not impose any effective technological + measures on the Work that restrict the ability of a recipient of the + Work from You to exercise the rights granted to that recipient under + the terms of the License. 
This Section 4(a) applies to the Work as + incorporated in a Collection, but this does not require the Collection + apart from the Work itself to be made subject to the terms of this + License. If You create a Collection, upon notice from any Licensor You + must, to the extent practicable, remove from the Collection any credit + as required by Section 4(c), as requested. If You create an + Adaptation, upon notice from any Licensor You must, to the extent + practicable, remove from the Adaptation any credit as required by + Section 4(c), as requested. + b. You may Distribute or Publicly Perform an Adaptation only under the + terms of: (i) this License; (ii) a later version of this License with + the same License Elements as this License; (iii) a Creative Commons + jurisdiction license (either this or a later license version) that + contains the same License Elements as this License (e.g., + Attribution-ShareAlike 3.0 US)); (iv) a Creative Commons Compatible + License. If you license the Adaptation under one of the licenses + mentioned in (iv), you must comply with the terms of that license. If + you license the Adaptation under the terms of any of the licenses + mentioned in (i), (ii) or (iii) (the "Applicable License"), you must + comply with the terms of the Applicable License generally and the + following provisions: (I) You must include a copy of, or the URI for, + the Applicable License with every copy of each Adaptation You + Distribute or Publicly Perform; (II) You may not offer or impose any + terms on the Adaptation that restrict the terms of the Applicable + License or the ability of the recipient of the Adaptation to exercise + the rights granted to that recipient under the terms of the Applicable + License; (III) You must keep intact all notices that refer to the + Applicable License and to the disclaimer of warranties with every copy + of the Work as included in the Adaptation You Distribute or Publicly + Perform; (IV) when You Distribute or Publicly Perform the Adaptation, + You may not impose any effective technological measures on the + Adaptation that restrict the ability of a recipient of the Adaptation + from You to exercise the rights granted to that recipient under the + terms of the Applicable License. This Section 4(b) applies to the + Adaptation as incorporated in a Collection, but this does not require + the Collection apart from the Adaptation itself to be made subject to + the terms of the Applicable License. + c. 
If You Distribute, or Publicly Perform the Work or any Adaptations or + Collections, You must, unless a request has been made pursuant to + Section 4(a), keep intact all copyright notices for the Work and + provide, reasonable to the medium or means You are utilizing: (i) the + name of the Original Author (or pseudonym, if applicable) if supplied, + and/or if the Original Author and/or Licensor designate another party + or parties (e.g., a sponsor institute, publishing entity, journal) for + attribution ("Attribution Parties") in Licensor's copyright notice, + terms of service or by other reasonable means, the name of such party + or parties; (ii) the title of the Work if supplied; (iii) to the + extent reasonably practicable, the URI, if any, that Licensor + specifies to be associated with the Work, unless such URI does not + refer to the copyright notice or licensing information for the Work; + and (iv) , consistent with Ssection 3(b), in the case of an + Adaptation, a credit identifying the use of the Work in the Adaptation + (e.g., "French translation of the Work by Original Author," or + "Screenplay based on original Work by Original Author"). The credit + required by this Section 4(c) may be implemented in any reasonable + manner; provided, however, that in the case of a Adaptation or + Collection, at a minimum such credit will appear, if a credit for all + contributing authors of the Adaptation or Collection appears, then as + part of these credits and in a manner at least as prominent as the + credits for the other contributing authors. For the avoidance of + doubt, You may only use the credit required by this Section for the + purpose of attribution in the manner set out above and, by exercising + Your rights under this License, You may not implicitly or explicitly + assert or imply any connection with, sponsorship or endorsement by the + Original Author, Licensor and/or Attribution Parties, as appropriate, + of You or Your use of the Work, without the separate, express prior + written permission of the Original Author, Licensor and/or Attribution + Parties. + d. Except as otherwise agreed in writing by the Licensor or as may be + otherwise permitted by applicable law, if You Reproduce, Distribute or + Publicly Perform the Work either by itself or as part of any + Adaptations or Collections, You must not distort, mutilate, modify or + take other derogatory action in relation to the Work which would be + prejudicial to the Original Author's honor or reputation. Licensor + agrees that in those jurisdictions (e.g. Japan), in which any exercise + of the right granted in Section 3(b) of this License (the right to + make Adaptations) would be deemed to be a distortion, mutilation, + modification or other derogatory action prejudicial to the Original + Author's honor and reputation, the Licensor will waive or not assert, + as appropriate, this Section, to the fullest extent permitted by the + applicable national law, to enable You to reasonably exercise Your + right under Section 3(b) of this License (right to make Adaptations) + but not otherwise. + +5. 
Representations, Warranties and Disclaimer + +UNLESS OTHERWISE MUTUALLY AGREED TO BY THE PARTIES IN WRITING, LICENSOR +OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY +KIND CONCERNING THE WORK, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, +INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTIBILITY, +FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF +LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ABSENCE OF ERRORS, +WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION +OF IMPLIED WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU. + +6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE +LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR +ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES +ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS +BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + +7. Termination + + a. This License and the rights granted hereunder will terminate + automatically upon any breach by You of the terms of this License. + Individuals or entities who have received Adaptations or Collections + from You under this License, however, will not have their licenses + terminated provided such individuals or entities remain in full + compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will + survive any termination of this License. + b. Subject to the above terms and conditions, the license granted here is + perpetual (for the duration of the applicable copyright in the Work). + Notwithstanding the above, Licensor reserves the right to release the + Work under different license terms or to stop distributing the Work at + any time; provided, however that any such election will not serve to + withdraw this License (or any other license that has been, or is + required to be, granted under the terms of this License), and this + License will continue in full force and effect unless terminated as + stated above. + +8. Miscellaneous + + a. Each time You Distribute or Publicly Perform the Work or a Collection, + the Licensor offers to the recipient a license to the Work on the same + terms and conditions as the license granted to You under this License. + b. Each time You Distribute or Publicly Perform an Adaptation, Licensor + offers to the recipient a license to the original Work on the same + terms and conditions as the license granted to You under this License. + c. If any provision of this License is invalid or unenforceable under + applicable law, it shall not affect the validity or enforceability of + the remainder of the terms of this License, and without further action + by the parties to this agreement, such provision shall be reformed to + the minimum extent necessary to make such provision valid and + enforceable. + d. No term or provision of this License shall be deemed waived and no + breach consented to unless such waiver or consent shall be in writing + and signed by the party to be charged with such waiver or consent. + e. This License constitutes the entire agreement between the parties with + respect to the Work licensed here. There are no understandings, + agreements or representations with respect to the Work not specified + here. Licensor shall not be bound by any additional provisions that + may appear in any communication from You. This License may not be + modified without the mutual written agreement of the Licensor and You. + f. 
The rights granted under, and the subject matter referenced, in this + License were drafted utilizing the terminology of the Berne Convention + for the Protection of Literary and Artistic Works (as amended on + September 28, 1979), the Rome Convention of 1961, the WIPO Copyright + Treaty of 1996, the WIPO Performances and Phonograms Treaty of 1996 + and the Universal Copyright Convention (as revised on July 24, 1971). + These rights and subject matter take effect in the relevant + jurisdiction in which the License terms are sought to be enforced + according to the corresponding provisions of the implementation of + those treaty provisions in the applicable national law. If the + standard suite of rights granted under applicable copyright law + includes additional rights not granted under this License, such + additional rights are deemed to be included in the License; this + License is not intended to restrict the license of any rights under + applicable law. + + +Creative Commons Notice + + Creative Commons is not a party to this License, and makes no warranty + whatsoever in connection with the Work. Creative Commons will not be + liable to You or any party on any legal theory for any damages + whatsoever, including without limitation any general, special, + incidental or consequential damages arising in connection to this + license. Notwithstanding the foregoing two (2) sentences, if Creative + Commons has expressly identified itself as the Licensor hereunder, it + shall have all rights and obligations of Licensor. + + Except for the limited purpose of indicating to the public that the + Work is licensed under the CCPL, Creative Commons does not authorize + the use by either party of the trademark "Creative Commons" or any + related trademark or logo of Creative Commons without the prior + written consent of Creative Commons. Any permitted use will be in + compliance with Creative Commons' then-current trademark usage + guidelines, as may be published on its website or otherwise made + available upon request from time to time. For the avoidance of doubt, + this trademark restriction does not form part of the License. + + Creative Commons may be contacted at https://creativecommons.org/. diff --git a/examples/training/textcat_example_data/CC_BY-SA-4.0.txt b/examples/training/textcat_example_data/CC_BY-SA-4.0.txt new file mode 100644 index 000000000..a73481c4b --- /dev/null +++ b/examples/training/textcat_example_data/CC_BY-SA-4.0.txt @@ -0,0 +1,428 @@ +Attribution-ShareAlike 4.0 International + +======================================================================= + +Creative Commons Corporation ("Creative Commons") is not a law firm and +does not provide legal services or legal advice. Distribution of +Creative Commons public licenses does not create a lawyer-client or +other relationship. Creative Commons makes its licenses and related +information available on an "as-is" basis. Creative Commons gives no +warranties regarding its licenses, any material licensed under their +terms and conditions, or any related information. Creative Commons +disclaims all liability for damages resulting from their use to the +fullest extent possible. + +Using Creative Commons Public Licenses + +Creative Commons public licenses provide a standard set of terms and +conditions that creators and other rights holders may use to share +original works of authorship and other material subject to copyright +and certain other rights specified in the public license below. 
The +following considerations are for informational purposes only, are not +exhaustive, and do not form part of our licenses. + + Considerations for licensors: Our public licenses are + intended for use by those authorized to give the public + permission to use material in ways otherwise restricted by + copyright and certain other rights. Our licenses are + irrevocable. Licensors should read and understand the terms + and conditions of the license they choose before applying it. + Licensors should also secure all rights necessary before + applying our licenses so that the public can reuse the + material as expected. Licensors should clearly mark any + material not subject to the license. This includes other CC- + licensed material, or material used under an exception or + limitation to copyright. More considerations for licensors: + wiki.creativecommons.org/Considerations_for_licensors + + Considerations for the public: By using one of our public + licenses, a licensor grants the public permission to use the + licensed material under specified terms and conditions. If + the licensor's permission is not necessary for any reason--for + example, because of any applicable exception or limitation to + copyright--then that use is not regulated by the license. Our + licenses grant only permissions under copyright and certain + other rights that a licensor has authority to grant. Use of + the licensed material may still be restricted for other + reasons, including because others have copyright or other + rights in the material. A licensor may make special requests, + such as asking that all changes be marked or described. + Although not required by our licenses, you are encouraged to + respect those requests where reasonable. More considerations + for the public: + wiki.creativecommons.org/Considerations_for_licensees + +======================================================================= + +Creative Commons Attribution-ShareAlike 4.0 International Public +License + +By exercising the Licensed Rights (defined below), You accept and agree +to be bound by the terms and conditions of this Creative Commons +Attribution-ShareAlike 4.0 International Public License ("Public +License"). To the extent this Public License may be interpreted as a +contract, You are granted the Licensed Rights in consideration of Your +acceptance of these terms and conditions, and the Licensor grants You +such rights in consideration of benefits the Licensor receives from +making the Licensed Material available under these terms and +conditions. + + +Section 1 -- Definitions. + + a. Adapted Material means material subject to Copyright and Similar + Rights that is derived from or based upon the Licensed Material + and in which the Licensed Material is translated, altered, + arranged, transformed, or otherwise modified in a manner requiring + permission under the Copyright and Similar Rights held by the + Licensor. For purposes of this Public License, where the Licensed + Material is a musical work, performance, or sound recording, + Adapted Material is always produced where the Licensed Material is + synched in timed relation with a moving image. + + b. Adapter's License means the license You apply to Your Copyright + and Similar Rights in Your contributions to Adapted Material in + accordance with the terms and conditions of this Public License. + + c. 
BY-SA Compatible License means a license listed at + creativecommons.org/compatiblelicenses, approved by Creative + Commons as essentially the equivalent of this Public License. + + d. Copyright and Similar Rights means copyright and/or similar rights + closely related to copyright including, without limitation, + performance, broadcast, sound recording, and Sui Generis Database + Rights, without regard to how the rights are labeled or + categorized. For purposes of this Public License, the rights + specified in Section 2(b)(1)-(2) are not Copyright and Similar + Rights. + + e. Effective Technological Measures means those measures that, in the + absence of proper authority, may not be circumvented under laws + fulfilling obligations under Article 11 of the WIPO Copyright + Treaty adopted on December 20, 1996, and/or similar international + agreements. + + f. Exceptions and Limitations means fair use, fair dealing, and/or + any other exception or limitation to Copyright and Similar Rights + that applies to Your use of the Licensed Material. + + g. License Elements means the license attributes listed in the name + of a Creative Commons Public License. The License Elements of this + Public License are Attribution and ShareAlike. + + h. Licensed Material means the artistic or literary work, database, + or other material to which the Licensor applied this Public + License. + + i. Licensed Rights means the rights granted to You subject to the + terms and conditions of this Public License, which are limited to + all Copyright and Similar Rights that apply to Your use of the + Licensed Material and that the Licensor has authority to license. + + j. Licensor means the individual(s) or entity(ies) granting rights + under this Public License. + + k. Share means to provide material to the public by any means or + process that requires permission under the Licensed Rights, such + as reproduction, public display, public performance, distribution, + dissemination, communication, or importation, and to make material + available to the public including in ways that members of the + public may access the material from a place and at a time + individually chosen by them. + + l. Sui Generis Database Rights means rights other than copyright + resulting from Directive 96/9/EC of the European Parliament and of + the Council of 11 March 1996 on the legal protection of databases, + as amended and/or succeeded, as well as other essentially + equivalent rights anywhere in the world. + + m. You means the individual or entity exercising the Licensed Rights + under this Public License. Your has a corresponding meaning. + + +Section 2 -- Scope. + + a. License grant. + + 1. Subject to the terms and conditions of this Public License, + the Licensor hereby grants You a worldwide, royalty-free, + non-sublicensable, non-exclusive, irrevocable license to + exercise the Licensed Rights in the Licensed Material to: + + a. reproduce and Share the Licensed Material, in whole or + in part; and + + b. produce, reproduce, and Share Adapted Material. + + 2. Exceptions and Limitations. For the avoidance of doubt, where + Exceptions and Limitations apply to Your use, this Public + License does not apply, and You do not need to comply with + its terms and conditions. + + 3. Term. The term of this Public License is specified in Section + 6(a). + + 4. Media and formats; technical modifications allowed. 
The + Licensor authorizes You to exercise the Licensed Rights in + all media and formats whether now known or hereafter created, + and to make technical modifications necessary to do so. The + Licensor waives and/or agrees not to assert any right or + authority to forbid You from making technical modifications + necessary to exercise the Licensed Rights, including + technical modifications necessary to circumvent Effective + Technological Measures. For purposes of this Public License, + simply making modifications authorized by this Section 2(a) + (4) never produces Adapted Material. + + 5. Downstream recipients. + + a. Offer from the Licensor -- Licensed Material. Every + recipient of the Licensed Material automatically + receives an offer from the Licensor to exercise the + Licensed Rights under the terms and conditions of this + Public License. + + b. Additional offer from the Licensor -- Adapted Material. + Every recipient of Adapted Material from You + automatically receives an offer from the Licensor to + exercise the Licensed Rights in the Adapted Material + under the conditions of the Adapter's License You apply. + + c. No downstream restrictions. You may not offer or impose + any additional or different terms or conditions on, or + apply any Effective Technological Measures to, the + Licensed Material if doing so restricts exercise of the + Licensed Rights by any recipient of the Licensed + Material. + + 6. No endorsement. Nothing in this Public License constitutes or + may be construed as permission to assert or imply that You + are, or that Your use of the Licensed Material is, connected + with, or sponsored, endorsed, or granted official status by, + the Licensor or others designated to receive attribution as + provided in Section 3(a)(1)(A)(i). + + b. Other rights. + + 1. Moral rights, such as the right of integrity, are not + licensed under this Public License, nor are publicity, + privacy, and/or other similar personality rights; however, to + the extent possible, the Licensor waives and/or agrees not to + assert any such rights held by the Licensor to the limited + extent necessary to allow You to exercise the Licensed + Rights, but not otherwise. + + 2. Patent and trademark rights are not licensed under this + Public License. + + 3. To the extent possible, the Licensor waives any right to + collect royalties from You for the exercise of the Licensed + Rights, whether directly or through a collecting society + under any voluntary or waivable statutory or compulsory + licensing scheme. In all other cases the Licensor expressly + reserves any right to collect such royalties. + + +Section 3 -- License Conditions. + +Your exercise of the Licensed Rights is expressly made subject to the +following conditions. + + a. Attribution. + + 1. If You Share the Licensed Material (including in modified + form), You must: + + a. retain the following if it is supplied by the Licensor + with the Licensed Material: + + i. identification of the creator(s) of the Licensed + Material and any others designated to receive + attribution, in any reasonable manner requested by + the Licensor (including by pseudonym if + designated); + + ii. a copyright notice; + + iii. a notice that refers to this Public License; + + iv. a notice that refers to the disclaimer of + warranties; + + v. a URI or hyperlink to the Licensed Material to the + extent reasonably practicable; + + b. indicate if You modified the Licensed Material and + retain an indication of any previous modifications; and + + c. 
indicate the Licensed Material is licensed under this + Public License, and include the text of, or the URI or + hyperlink to, this Public License. + + 2. You may satisfy the conditions in Section 3(a)(1) in any + reasonable manner based on the medium, means, and context in + which You Share the Licensed Material. For example, it may be + reasonable to satisfy the conditions by providing a URI or + hyperlink to a resource that includes the required + information. + + 3. If requested by the Licensor, You must remove any of the + information required by Section 3(a)(1)(A) to the extent + reasonably practicable. + + b. ShareAlike. + + In addition to the conditions in Section 3(a), if You Share + Adapted Material You produce, the following conditions also apply. + + 1. The Adapter's License You apply must be a Creative Commons + license with the same License Elements, this version or + later, or a BY-SA Compatible License. + + 2. You must include the text of, or the URI or hyperlink to, the + Adapter's License You apply. You may satisfy this condition + in any reasonable manner based on the medium, means, and + context in which You Share Adapted Material. + + 3. You may not offer or impose any additional or different terms + or conditions on, or apply any Effective Technological + Measures to, Adapted Material that restrict exercise of the + rights granted under the Adapter's License You apply. + + +Section 4 -- Sui Generis Database Rights. + +Where the Licensed Rights include Sui Generis Database Rights that +apply to Your use of the Licensed Material: + + a. for the avoidance of doubt, Section 2(a)(1) grants You the right + to extract, reuse, reproduce, and Share all or a substantial + portion of the contents of the database; + + b. if You include all or a substantial portion of the database + contents in a database in which You have Sui Generis Database + Rights, then the database in which You have Sui Generis Database + Rights (but not its individual contents) is Adapted Material, + + including for purposes of Section 3(b); and + c. You must comply with the conditions in Section 3(a) if You Share + all or a substantial portion of the contents of the database. + +For the avoidance of doubt, this Section 4 supplements and does not +replace Your obligations under this Public License where the Licensed +Rights include other Copyright and Similar Rights. + + +Section 5 -- Disclaimer of Warranties and Limitation of Liability. + + a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE + EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS + AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF + ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, + IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, + WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR + PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, + ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT + KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT + ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. + + b. 
TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE + TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, + NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, + INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, + COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR + USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN + ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR + DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR + IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. + + c. The disclaimer of warranties and limitation of liability provided + above shall be interpreted in a manner that, to the extent + possible, most closely approximates an absolute disclaimer and + waiver of all liability. + + +Section 6 -- Term and Termination. + + a. This Public License applies for the term of the Copyright and + Similar Rights licensed here. However, if You fail to comply with + this Public License, then Your rights under this Public License + terminate automatically. + + b. Where Your right to use the Licensed Material has terminated under + Section 6(a), it reinstates: + + 1. automatically as of the date the violation is cured, provided + it is cured within 30 days of Your discovery of the + violation; or + + 2. upon express reinstatement by the Licensor. + + For the avoidance of doubt, this Section 6(b) does not affect any + right the Licensor may have to seek remedies for Your violations + of this Public License. + + c. For the avoidance of doubt, the Licensor may also offer the + Licensed Material under separate terms or conditions or stop + distributing the Licensed Material at any time; however, doing so + will not terminate this Public License. + + d. Sections 1, 5, 6, 7, and 8 survive termination of this Public + License. + + +Section 7 -- Other Terms and Conditions. + + a. The Licensor shall not be bound by any additional or different + terms or conditions communicated by You unless expressly agreed. + + b. Any arrangements, understandings, or agreements regarding the + Licensed Material not stated herein are separate from and + independent of the terms and conditions of this Public License. + + +Section 8 -- Interpretation. + + a. For the avoidance of doubt, this Public License does not, and + shall not be interpreted to, reduce, limit, restrict, or impose + conditions on any use of the Licensed Material that could lawfully + be made without permission under this Public License. + + b. To the extent possible, if any provision of this Public License is + deemed unenforceable, it shall be automatically reformed to the + minimum extent necessary to make it enforceable. If the provision + cannot be reformed, it shall be severed from this Public License + without affecting the enforceability of the remaining terms and + conditions. + + c. No term or condition of this Public License will be waived and no + failure to comply consented to unless expressly agreed to by the + Licensor. + + d. Nothing in this Public License constitutes or may be interpreted + as a limitation upon, or waiver of, any privileges and immunities + that apply to the Licensor or You, including from the legal + processes of any jurisdiction or authority. + + +======================================================================= + +Creative Commons is not a party to its public +licenses. 
Notwithstanding, Creative Commons may elect to apply one of +its public licenses to material it publishes and in those instances +will be considered the “Licensor.” The text of the Creative Commons +public licenses is dedicated to the public domain under the CC0 Public +Domain Dedication. Except for the limited purpose of indicating that +material is shared under a Creative Commons public license or as +otherwise permitted by the Creative Commons policies published at +creativecommons.org/policies, Creative Commons does not authorize the +use of the trademark "Creative Commons" or any other trademark or logo +of Creative Commons without its prior written consent including, +without limitation, in connection with any unauthorized modifications +to any of its public licenses or any other arrangements, +understandings, or agreements concerning use of licensed material. For +the avoidance of doubt, this paragraph does not form part of the +public licenses. + +Creative Commons may be contacted at creativecommons.org. + diff --git a/examples/training/textcat_example_data/README.md b/examples/training/textcat_example_data/README.md new file mode 100644 index 000000000..1165f0293 --- /dev/null +++ b/examples/training/textcat_example_data/README.md @@ -0,0 +1,34 @@ +## Examples of textcat training data + +spacy JSON training files were generated from JSONL with: + +``` +python textcatjsonl_to_trainjson.py -m en file.jsonl . +``` + +`cooking.json` is an example with mutually-exclusive classes with two labels: + +* `baking` +* `not_baking` + +`jigsaw-toxic-comment.json` is an example with multiple labels per instance: + +* `insult` +* `obscene` +* `severe_toxic` +* `toxic` + +### Data Sources + +* `cooking.jsonl`: https://cooking.stackexchange.com. The meta IDs link to the + original question as `https://cooking.stackexchange.com/questions/ID`, e.g., + `https://cooking.stackexchange.com/questions/2` for the first instance. +* `jigsaw-toxic-comment.jsonl`: [Jigsaw Toxic Comments Classification + Challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge) + +### Data Licenses + +* `cooking.jsonl`: CC BY-SA 4.0 ([`CC_BY-SA-4.0.txt`](CC_BY-SA-4.0.txt)) +* `jigsaw-toxic-comment.jsonl`: + * text: CC BY-SA 3.0 ([`CC_BY-SA-3.0.txt`](CC_BY-SA-3.0.txt)) + * annotation: CC0 ([`CC0.txt`](CC0.txt)) diff --git a/examples/training/textcat_example_data/cooking.json b/examples/training/textcat_example_data/cooking.json new file mode 100644 index 000000000..4bad4db79 --- /dev/null +++ b/examples/training/textcat_example_data/cooking.json @@ -0,0 +1,3487 @@ +[ + { + "id":0, + "paragraphs":[ + { + "raw":"How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven by laying the strips out on a cookie sheet. 
When using this method, how long should I cook the bacon for, and at what temperature?\n", + "sentences":[ + { + "tokens":[ + { + "id":0, + "orth":"How", + "ner":"O" + }, + { + "id":1, + "orth":"should", + "ner":"O" + }, + { + "id":2, + "orth":"I", + "ner":"O" + }, + { + "id":3, + "orth":"cook", + "ner":"O" + }, + { + "id":4, + "orth":"bacon", + "ner":"O" + }, + { + "id":5, + "orth":"in", + "ner":"O" + }, + { + "id":6, + "orth":"an", + "ner":"O" + }, + { + "id":7, + "orth":"oven", + "ner":"O" + }, + { + "id":8, + "orth":"?", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":9, + "orth":"\n", + "ner":"O" + }, + { + "id":10, + "orth":"I", + "ner":"O" + }, + { + "id":11, + "orth":"'ve", + "ner":"O" + }, + { + "id":12, + "orth":"heard", + "ner":"O" + }, + { + "id":13, + "orth":"of", + "ner":"O" + }, + { + "id":14, + "orth":"people", + "ner":"O" + }, + { + "id":15, + "orth":"cooking", + "ner":"O" + }, + { + "id":16, + "orth":"bacon", + "ner":"O" + }, + { + "id":17, + "orth":"in", + "ner":"O" + }, + { + "id":18, + "orth":"an", + "ner":"O" + }, + { + "id":19, + "orth":"oven", + "ner":"O" + }, + { + "id":20, + "orth":"by", + "ner":"O" + }, + { + "id":21, + "orth":"laying", + "ner":"O" + }, + { + "id":22, + "orth":"the", + "ner":"O" + }, + { + "id":23, + "orth":"strips", + "ner":"O" + }, + { + "id":24, + "orth":"out", + "ner":"O" + }, + { + "id":25, + "orth":"on", + "ner":"O" + }, + { + "id":26, + "orth":"a", + "ner":"O" + }, + { + "id":27, + "orth":"cookie", + "ner":"O" + }, + { + "id":28, + "orth":"sheet", + "ner":"O" + }, + { + "id":29, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":30, + "orth":"When", + "ner":"O" + }, + { + "id":31, + "orth":"using", + "ner":"O" + }, + { + "id":32, + "orth":"this", + "ner":"O" + }, + { + "id":33, + "orth":"method", + "ner":"O" + }, + { + "id":34, + "orth":",", + "ner":"O" + }, + { + "id":35, + "orth":"how", + "ner":"O" + }, + { + "id":36, + "orth":"long", + "ner":"O" + }, + { + "id":37, + "orth":"should", + "ner":"O" + }, + { + "id":38, + "orth":"I", + "ner":"O" + }, + { + "id":39, + "orth":"cook", + "ner":"O" + }, + { + "id":40, + "orth":"the", + "ner":"O" + }, + { + "id":41, + "orth":"bacon", + "ner":"O" + }, + { + "id":42, + "orth":"for", + "ner":"O" + }, + { + "id":43, + "orth":",", + "ner":"O" + }, + { + "id":44, + "orth":"and", + "ner":"O" + }, + { + "id":45, + "orth":"at", + "ner":"O" + }, + { + "id":46, + "orth":"what", + "ner":"O" + }, + { + "id":47, + "orth":"temperature", + "ner":"O" + }, + { + "id":48, + "orth":"?", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":49, + "orth":"\n", + "ner":"O" + } + ], + "brackets":[ + + ] + } + ], + "cats":[ + { + "label":"baking", + "value":0.0 + }, + { + "label":"not_baking", + "value":1.0 + } + ] + }, + { + "raw":"What is the difference between white and brown eggs?\nI always use brown extra large eggs, but I can't honestly say why I do this other than habit at this point. 
Are there any distinct advantages or disadvantages like flavor, shelf life, etc?\n", + "sentences":[ + { + "tokens":[ + { + "id":0, + "orth":"What", + "ner":"O" + }, + { + "id":1, + "orth":"is", + "ner":"O" + }, + { + "id":2, + "orth":"the", + "ner":"O" + }, + { + "id":3, + "orth":"difference", + "ner":"O" + }, + { + "id":4, + "orth":"between", + "ner":"O" + }, + { + "id":5, + "orth":"white", + "ner":"O" + }, + { + "id":6, + "orth":"and", + "ner":"O" + }, + { + "id":7, + "orth":"brown", + "ner":"O" + }, + { + "id":8, + "orth":"eggs", + "ner":"O" + }, + { + "id":9, + "orth":"?", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":10, + "orth":"\n", + "ner":"O" + }, + { + "id":11, + "orth":"I", + "ner":"O" + }, + { + "id":12, + "orth":"always", + "ner":"O" + }, + { + "id":13, + "orth":"use", + "ner":"O" + }, + { + "id":14, + "orth":"brown", + "ner":"O" + }, + { + "id":15, + "orth":"extra", + "ner":"O" + }, + { + "id":16, + "orth":"large", + "ner":"O" + }, + { + "id":17, + "orth":"eggs", + "ner":"O" + }, + { + "id":18, + "orth":",", + "ner":"O" + }, + { + "id":19, + "orth":"but", + "ner":"O" + }, + { + "id":20, + "orth":"I", + "ner":"O" + }, + { + "id":21, + "orth":"ca", + "ner":"O" + }, + { + "id":22, + "orth":"n't", + "ner":"O" + }, + { + "id":23, + "orth":"honestly", + "ner":"O" + }, + { + "id":24, + "orth":"say", + "ner":"O" + }, + { + "id":25, + "orth":"why", + "ner":"O" + }, + { + "id":26, + "orth":"I", + "ner":"O" + }, + { + "id":27, + "orth":"do", + "ner":"O" + }, + { + "id":28, + "orth":"this", + "ner":"O" + }, + { + "id":29, + "orth":"other", + "ner":"O" + }, + { + "id":30, + "orth":"than", + "ner":"O" + }, + { + "id":31, + "orth":"habit", + "ner":"O" + }, + { + "id":32, + "orth":"at", + "ner":"O" + }, + { + "id":33, + "orth":"this", + "ner":"O" + }, + { + "id":34, + "orth":"point", + "ner":"O" + }, + { + "id":35, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":36, + "orth":"Are", + "ner":"O" + }, + { + "id":37, + "orth":"there", + "ner":"O" + }, + { + "id":38, + "orth":"any", + "ner":"O" + }, + { + "id":39, + "orth":"distinct", + "ner":"O" + }, + { + "id":40, + "orth":"advantages", + "ner":"O" + }, + { + "id":41, + "orth":"or", + "ner":"O" + }, + { + "id":42, + "orth":"disadvantages", + "ner":"O" + }, + { + "id":43, + "orth":"like", + "ner":"O" + }, + { + "id":44, + "orth":"flavor", + "ner":"O" + }, + { + "id":45, + "orth":",", + "ner":"O" + }, + { + "id":46, + "orth":"shelf", + "ner":"O" + }, + { + "id":47, + "orth":"life", + "ner":"O" + }, + { + "id":48, + "orth":",", + "ner":"O" + }, + { + "id":49, + "orth":"etc", + "ner":"O" + }, + { + "id":50, + "orth":"?", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":51, + "orth":"\n", + "ner":"O" + } + ], + "brackets":[ + + ] + } + ], + "cats":[ + { + "label":"baking", + "value":0.0 + }, + { + "label":"not_baking", + "value":1.0 + } + ] + }, + { + "raw":"What is the difference between baking soda and baking powder?\nAnd can I use one in place of the other in certain recipes?\n", + "sentences":[ + { + "tokens":[ + { + "id":0, + "orth":"What", + "ner":"O" + }, + { + "id":1, + "orth":"is", + "ner":"O" + }, + { + "id":2, + "orth":"the", + "ner":"O" + }, + { + "id":3, + "orth":"difference", + "ner":"O" + }, + { + "id":4, + "orth":"between", + "ner":"O" + }, + { + "id":5, + "orth":"baking", + "ner":"O" + }, + { + "id":6, + "orth":"soda", + "ner":"O" + }, + { + "id":7, + "orth":"and", + "ner":"O" + }, + { + "id":8, + "orth":"baking", + "ner":"O" + }, + { + 
"id":9, + "orth":"powder", + "ner":"O" + }, + { + "id":10, + "orth":"?", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":11, + "orth":"\n", + "ner":"O" + }, + { + "id":12, + "orth":"And", + "ner":"O" + }, + { + "id":13, + "orth":"can", + "ner":"O" + }, + { + "id":14, + "orth":"I", + "ner":"O" + }, + { + "id":15, + "orth":"use", + "ner":"O" + }, + { + "id":16, + "orth":"one", + "ner":"O" + }, + { + "id":17, + "orth":"in", + "ner":"O" + }, + { + "id":18, + "orth":"place", + "ner":"O" + }, + { + "id":19, + "orth":"of", + "ner":"O" + }, + { + "id":20, + "orth":"the", + "ner":"O" + }, + { + "id":21, + "orth":"other", + "ner":"O" + }, + { + "id":22, + "orth":"in", + "ner":"O" + }, + { + "id":23, + "orth":"certain", + "ner":"O" + }, + { + "id":24, + "orth":"recipes", + "ner":"O" + }, + { + "id":25, + "orth":"?", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":26, + "orth":"\n", + "ner":"O" + } + ], + "brackets":[ + + ] + } + ], + "cats":[ + { + "label":"baking", + "value":0.0 + }, + { + "label":"not_baking", + "value":1.0 + } + ] + }, + { + "raw":"In a tomato sauce recipe, how can I cut the acidity?\nIt seems that every time I make a tomato sauce for pasta, the sauce is a little bit too acid for my taste. I've tried using sugar or sodium bicarbonate, but I'm not satisfied with the results.\n", + "sentences":[ + { + "tokens":[ + { + "id":0, + "orth":"In", + "ner":"O" + }, + { + "id":1, + "orth":"a", + "ner":"O" + }, + { + "id":2, + "orth":"tomato", + "ner":"O" + }, + { + "id":3, + "orth":"sauce", + "ner":"O" + }, + { + "id":4, + "orth":"recipe", + "ner":"O" + }, + { + "id":5, + "orth":",", + "ner":"O" + }, + { + "id":6, + "orth":"how", + "ner":"O" + }, + { + "id":7, + "orth":"can", + "ner":"O" + }, + { + "id":8, + "orth":"I", + "ner":"O" + }, + { + "id":9, + "orth":"cut", + "ner":"O" + }, + { + "id":10, + "orth":"the", + "ner":"O" + }, + { + "id":11, + "orth":"acidity", + "ner":"O" + }, + { + "id":12, + "orth":"?", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":13, + "orth":"\n", + "ner":"O" + }, + { + "id":14, + "orth":"It", + "ner":"O" + }, + { + "id":15, + "orth":"seems", + "ner":"O" + }, + { + "id":16, + "orth":"that", + "ner":"O" + }, + { + "id":17, + "orth":"every", + "ner":"O" + }, + { + "id":18, + "orth":"time", + "ner":"O" + }, + { + "id":19, + "orth":"I", + "ner":"O" + }, + { + "id":20, + "orth":"make", + "ner":"O" + }, + { + "id":21, + "orth":"a", + "ner":"O" + }, + { + "id":22, + "orth":"tomato", + "ner":"O" + }, + { + "id":23, + "orth":"sauce", + "ner":"O" + }, + { + "id":24, + "orth":"for", + "ner":"O" + }, + { + "id":25, + "orth":"pasta", + "ner":"O" + }, + { + "id":26, + "orth":",", + "ner":"O" + }, + { + "id":27, + "orth":"the", + "ner":"O" + }, + { + "id":28, + "orth":"sauce", + "ner":"O" + }, + { + "id":29, + "orth":"is", + "ner":"O" + }, + { + "id":30, + "orth":"a", + "ner":"O" + }, + { + "id":31, + "orth":"little", + "ner":"O" + }, + { + "id":32, + "orth":"bit", + "ner":"O" + }, + { + "id":33, + "orth":"too", + "ner":"O" + }, + { + "id":34, + "orth":"acid", + "ner":"O" + }, + { + "id":35, + "orth":"for", + "ner":"O" + }, + { + "id":36, + "orth":"my", + "ner":"O" + }, + { + "id":37, + "orth":"taste", + "ner":"O" + }, + { + "id":38, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":39, + "orth":"I", + "ner":"O" + }, + { + "id":40, + "orth":"'ve", + "ner":"O" + }, + { + "id":41, + "orth":"tried", + "ner":"O" + }, + { + "id":42, + "orth":"using", + "ner":"O" + 
}, + { + "id":43, + "orth":"sugar", + "ner":"O" + }, + { + "id":44, + "orth":"or", + "ner":"O" + }, + { + "id":45, + "orth":"sodium", + "ner":"O" + }, + { + "id":46, + "orth":"bicarbonate", + "ner":"O" + }, + { + "id":47, + "orth":",", + "ner":"O" + }, + { + "id":48, + "orth":"but", + "ner":"O" + }, + { + "id":49, + "orth":"I", + "ner":"O" + }, + { + "id":50, + "orth":"'m", + "ner":"O" + }, + { + "id":51, + "orth":"not", + "ner":"O" + }, + { + "id":52, + "orth":"satisfied", + "ner":"O" + }, + { + "id":53, + "orth":"with", + "ner":"O" + }, + { + "id":54, + "orth":"the", + "ner":"O" + }, + { + "id":55, + "orth":"results", + "ner":"O" + }, + { + "id":56, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":57, + "orth":"\n", + "ner":"O" + } + ], + "brackets":[ + + ] + } + ], + "cats":[ + { + "label":"baking", + "value":0.0 + }, + { + "label":"not_baking", + "value":1.0 + } + ] + }, + { + "raw":"What ingredients (available in specific regions) can I substitute for parsley?\nI have a recipe that calls for fresh parsley. I have substituted other fresh herbs for their dried equivalents but I don't have fresh or dried parsley. Is there something else (ex another dried herb) that I can use instead of parsley?\nI know it is used mainly for looks rather than taste but I have a pasta recipe that calls for 2 tablespoons of parsley in the sauce and then another 2 tablespoons on top when it is done. I know the parsley on top is more for looks but there must be something about the taste otherwise it would call for parsley within the sauce as well.\nI would especially like to hear about substitutes available in Southeast Asia and other parts of the world where the obvious answers (such as cilantro) are not widely available.\n", + "sentences":[ + { + "tokens":[ + { + "id":0, + "orth":"What", + "ner":"O" + }, + { + "id":1, + "orth":"ingredients", + "ner":"O" + }, + { + "id":2, + "orth":"(", + "ner":"O" + }, + { + "id":3, + "orth":"available", + "ner":"O" + }, + { + "id":4, + "orth":"in", + "ner":"O" + }, + { + "id":5, + "orth":"specific", + "ner":"O" + }, + { + "id":6, + "orth":"regions", + "ner":"O" + }, + { + "id":7, + "orth":")", + "ner":"O" + }, + { + "id":8, + "orth":"can", + "ner":"O" + }, + { + "id":9, + "orth":"I", + "ner":"O" + }, + { + "id":10, + "orth":"substitute", + "ner":"O" + }, + { + "id":11, + "orth":"for", + "ner":"O" + }, + { + "id":12, + "orth":"parsley", + "ner":"O" + }, + { + "id":13, + "orth":"?", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":14, + "orth":"\n", + "ner":"O" + }, + { + "id":15, + "orth":"I", + "ner":"O" + }, + { + "id":16, + "orth":"have", + "ner":"O" + }, + { + "id":17, + "orth":"a", + "ner":"O" + }, + { + "id":18, + "orth":"recipe", + "ner":"O" + }, + { + "id":19, + "orth":"that", + "ner":"O" + }, + { + "id":20, + "orth":"calls", + "ner":"O" + }, + { + "id":21, + "orth":"for", + "ner":"O" + }, + { + "id":22, + "orth":"fresh", + "ner":"O" + }, + { + "id":23, + "orth":"parsley", + "ner":"O" + }, + { + "id":24, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":25, + "orth":"I", + "ner":"O" + }, + { + "id":26, + "orth":"have", + "ner":"O" + }, + { + "id":27, + "orth":"substituted", + "ner":"O" + }, + { + "id":28, + "orth":"other", + "ner":"O" + }, + { + "id":29, + "orth":"fresh", + "ner":"O" + }, + { + "id":30, + "orth":"herbs", + "ner":"O" + }, + { + "id":31, + "orth":"for", + "ner":"O" + }, + { + "id":32, + "orth":"their", + "ner":"O" + }, + { + "id":33, + 
"orth":"dried", + "ner":"O" + }, + { + "id":34, + "orth":"equivalents", + "ner":"O" + }, + { + "id":35, + "orth":"but", + "ner":"O" + }, + { + "id":36, + "orth":"I", + "ner":"O" + }, + { + "id":37, + "orth":"do", + "ner":"O" + }, + { + "id":38, + "orth":"n't", + "ner":"O" + }, + { + "id":39, + "orth":"have", + "ner":"O" + }, + { + "id":40, + "orth":"fresh", + "ner":"O" + }, + { + "id":41, + "orth":"or", + "ner":"O" + }, + { + "id":42, + "orth":"dried", + "ner":"O" + }, + { + "id":43, + "orth":"parsley", + "ner":"O" + }, + { + "id":44, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":45, + "orth":"Is", + "ner":"O" + }, + { + "id":46, + "orth":"there", + "ner":"O" + }, + { + "id":47, + "orth":"something", + "ner":"O" + }, + { + "id":48, + "orth":"else", + "ner":"O" + }, + { + "id":49, + "orth":"(", + "ner":"O" + }, + { + "id":50, + "orth":"ex", + "ner":"O" + }, + { + "id":51, + "orth":"another", + "ner":"O" + }, + { + "id":52, + "orth":"dried", + "ner":"O" + }, + { + "id":53, + "orth":"herb", + "ner":"O" + }, + { + "id":54, + "orth":")", + "ner":"O" + }, + { + "id":55, + "orth":"that", + "ner":"O" + }, + { + "id":56, + "orth":"I", + "ner":"O" + }, + { + "id":57, + "orth":"can", + "ner":"O" + }, + { + "id":58, + "orth":"use", + "ner":"O" + }, + { + "id":59, + "orth":"instead", + "ner":"O" + }, + { + "id":60, + "orth":"of", + "ner":"O" + }, + { + "id":61, + "orth":"parsley", + "ner":"O" + }, + { + "id":62, + "orth":"?", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":63, + "orth":"\n", + "ner":"O" + }, + { + "id":64, + "orth":"I", + "ner":"O" + }, + { + "id":65, + "orth":"know", + "ner":"O" + }, + { + "id":66, + "orth":"it", + "ner":"O" + }, + { + "id":67, + "orth":"is", + "ner":"O" + }, + { + "id":68, + "orth":"used", + "ner":"O" + }, + { + "id":69, + "orth":"mainly", + "ner":"O" + }, + { + "id":70, + "orth":"for", + "ner":"O" + }, + { + "id":71, + "orth":"looks", + "ner":"O" + }, + { + "id":72, + "orth":"rather", + "ner":"O" + }, + { + "id":73, + "orth":"than", + "ner":"O" + }, + { + "id":74, + "orth":"taste", + "ner":"O" + }, + { + "id":75, + "orth":"but", + "ner":"O" + }, + { + "id":76, + "orth":"I", + "ner":"O" + }, + { + "id":77, + "orth":"have", + "ner":"O" + }, + { + "id":78, + "orth":"a", + "ner":"O" + }, + { + "id":79, + "orth":"pasta", + "ner":"O" + }, + { + "id":80, + "orth":"recipe", + "ner":"O" + }, + { + "id":81, + "orth":"that", + "ner":"O" + }, + { + "id":82, + "orth":"calls", + "ner":"O" + }, + { + "id":83, + "orth":"for", + "ner":"O" + }, + { + "id":84, + "orth":"2", + "ner":"O" + }, + { + "id":85, + "orth":"tablespoons", + "ner":"O" + }, + { + "id":86, + "orth":"of", + "ner":"O" + }, + { + "id":87, + "orth":"parsley", + "ner":"O" + }, + { + "id":88, + "orth":"in", + "ner":"O" + }, + { + "id":89, + "orth":"the", + "ner":"O" + }, + { + "id":90, + "orth":"sauce", + "ner":"O" + }, + { + "id":91, + "orth":"and", + "ner":"O" + }, + { + "id":92, + "orth":"then", + "ner":"O" + }, + { + "id":93, + "orth":"another", + "ner":"O" + }, + { + "id":94, + "orth":"2", + "ner":"O" + }, + { + "id":95, + "orth":"tablespoons", + "ner":"O" + }, + { + "id":96, + "orth":"on", + "ner":"O" + }, + { + "id":97, + "orth":"top", + "ner":"O" + }, + { + "id":98, + "orth":"when", + "ner":"O" + }, + { + "id":99, + "orth":"it", + "ner":"O" + }, + { + "id":100, + "orth":"is", + "ner":"O" + }, + { + "id":101, + "orth":"done", + "ner":"O" + }, + { + "id":102, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + 
"id":103, + "orth":"I", + "ner":"O" + }, + { + "id":104, + "orth":"know", + "ner":"O" + }, + { + "id":105, + "orth":"the", + "ner":"O" + }, + { + "id":106, + "orth":"parsley", + "ner":"O" + }, + { + "id":107, + "orth":"on", + "ner":"O" + }, + { + "id":108, + "orth":"top", + "ner":"O" + }, + { + "id":109, + "orth":"is", + "ner":"O" + }, + { + "id":110, + "orth":"more", + "ner":"O" + }, + { + "id":111, + "orth":"for", + "ner":"O" + }, + { + "id":112, + "orth":"looks", + "ner":"O" + }, + { + "id":113, + "orth":"but", + "ner":"O" + }, + { + "id":114, + "orth":"there", + "ner":"O" + }, + { + "id":115, + "orth":"must", + "ner":"O" + }, + { + "id":116, + "orth":"be", + "ner":"O" + }, + { + "id":117, + "orth":"something", + "ner":"O" + }, + { + "id":118, + "orth":"about", + "ner":"O" + }, + { + "id":119, + "orth":"the", + "ner":"O" + }, + { + "id":120, + "orth":"taste", + "ner":"O" + }, + { + "id":121, + "orth":"otherwise", + "ner":"O" + }, + { + "id":122, + "orth":"it", + "ner":"O" + }, + { + "id":123, + "orth":"would", + "ner":"O" + }, + { + "id":124, + "orth":"call", + "ner":"O" + }, + { + "id":125, + "orth":"for", + "ner":"O" + }, + { + "id":126, + "orth":"parsley", + "ner":"O" + }, + { + "id":127, + "orth":"within", + "ner":"O" + }, + { + "id":128, + "orth":"the", + "ner":"O" + }, + { + "id":129, + "orth":"sauce", + "ner":"O" + }, + { + "id":130, + "orth":"as", + "ner":"O" + }, + { + "id":131, + "orth":"well", + "ner":"O" + }, + { + "id":132, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":133, + "orth":"\n", + "ner":"O" + }, + { + "id":134, + "orth":"I", + "ner":"O" + }, + { + "id":135, + "orth":"would", + "ner":"O" + }, + { + "id":136, + "orth":"especially", + "ner":"O" + }, + { + "id":137, + "orth":"like", + "ner":"O" + }, + { + "id":138, + "orth":"to", + "ner":"O" + }, + { + "id":139, + "orth":"hear", + "ner":"O" + }, + { + "id":140, + "orth":"about", + "ner":"O" + }, + { + "id":141, + "orth":"substitutes", + "ner":"O" + }, + { + "id":142, + "orth":"available", + "ner":"O" + }, + { + "id":143, + "orth":"in", + "ner":"O" + }, + { + "id":144, + "orth":"Southeast", + "ner":"O" + }, + { + "id":145, + "orth":"Asia", + "ner":"O" + }, + { + "id":146, + "orth":"and", + "ner":"O" + }, + { + "id":147, + "orth":"other", + "ner":"O" + }, + { + "id":148, + "orth":"parts", + "ner":"O" + }, + { + "id":149, + "orth":"of", + "ner":"O" + }, + { + "id":150, + "orth":"the", + "ner":"O" + }, + { + "id":151, + "orth":"world", + "ner":"O" + }, + { + "id":152, + "orth":"where", + "ner":"O" + }, + { + "id":153, + "orth":"the", + "ner":"O" + }, + { + "id":154, + "orth":"obvious", + "ner":"O" + }, + { + "id":155, + "orth":"answers", + "ner":"O" + }, + { + "id":156, + "orth":"(", + "ner":"O" + }, + { + "id":157, + "orth":"such", + "ner":"O" + }, + { + "id":158, + "orth":"as", + "ner":"O" + }, + { + "id":159, + "orth":"cilantro", + "ner":"O" + }, + { + "id":160, + "orth":")", + "ner":"O" + }, + { + "id":161, + "orth":"are", + "ner":"O" + }, + { + "id":162, + "orth":"not", + "ner":"O" + }, + { + "id":163, + "orth":"widely", + "ner":"O" + }, + { + "id":164, + "orth":"available", + "ner":"O" + }, + { + "id":165, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":166, + "orth":"\n", + "ner":"O" + } + ], + "brackets":[ + + ] + } + ], + "cats":[ + { + "label":"baking", + "value":0.0 + }, + { + "label":"not_baking", + "value":1.0 + } + ] + }, + { + "raw":"What is the internal temperature a steak should be cooked to for Rare/Medium 
Rare/Medium/Well?\nI'd like to know when to take my steaks off the grill and please everybody.\n", + "sentences":[ + { + "tokens":[ + { + "id":0, + "orth":"What", + "ner":"O" + }, + { + "id":1, + "orth":"is", + "ner":"O" + }, + { + "id":2, + "orth":"the", + "ner":"O" + }, + { + "id":3, + "orth":"internal", + "ner":"O" + }, + { + "id":4, + "orth":"temperature", + "ner":"O" + }, + { + "id":5, + "orth":"a", + "ner":"O" + }, + { + "id":6, + "orth":"steak", + "ner":"O" + }, + { + "id":7, + "orth":"should", + "ner":"O" + }, + { + "id":8, + "orth":"be", + "ner":"O" + }, + { + "id":9, + "orth":"cooked", + "ner":"O" + }, + { + "id":10, + "orth":"to", + "ner":"O" + }, + { + "id":11, + "orth":"for", + "ner":"O" + }, + { + "id":12, + "orth":"Rare", + "ner":"O" + }, + { + "id":13, + "orth":"/", + "ner":"O" + }, + { + "id":14, + "orth":"Medium", + "ner":"O" + }, + { + "id":15, + "orth":"Rare", + "ner":"O" + }, + { + "id":16, + "orth":"/", + "ner":"O" + }, + { + "id":17, + "orth":"Medium", + "ner":"O" + }, + { + "id":18, + "orth":"/", + "ner":"O" + }, + { + "id":19, + "orth":"Well", + "ner":"O" + }, + { + "id":20, + "orth":"?", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":21, + "orth":"\n", + "ner":"O" + }, + { + "id":22, + "orth":"I", + "ner":"O" + }, + { + "id":23, + "orth":"'d", + "ner":"O" + }, + { + "id":24, + "orth":"like", + "ner":"O" + }, + { + "id":25, + "orth":"to", + "ner":"O" + }, + { + "id":26, + "orth":"know", + "ner":"O" + }, + { + "id":27, + "orth":"when", + "ner":"O" + }, + { + "id":28, + "orth":"to", + "ner":"O" + }, + { + "id":29, + "orth":"take", + "ner":"O" + }, + { + "id":30, + "orth":"my", + "ner":"O" + }, + { + "id":31, + "orth":"steaks", + "ner":"O" + }, + { + "id":32, + "orth":"off", + "ner":"O" + }, + { + "id":33, + "orth":"the", + "ner":"O" + }, + { + "id":34, + "orth":"grill", + "ner":"O" + }, + { + "id":35, + "orth":"and", + "ner":"O" + }, + { + "id":36, + "orth":"please", + "ner":"O" + }, + { + "id":37, + "orth":"everybody", + "ner":"O" + }, + { + "id":38, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":39, + "orth":"\n", + "ner":"O" + } + ], + "brackets":[ + + ] + } + ], + "cats":[ + { + "label":"baking", + "value":0.0 + }, + { + "label":"not_baking", + "value":1.0 + } + ] + }, + { + "raw":"How should I poach an egg?\nWhat's the best method to poach an egg without it turning into an eggy soupy mess?\n", + "sentences":[ + { + "tokens":[ + { + "id":0, + "orth":"How", + "ner":"O" + }, + { + "id":1, + "orth":"should", + "ner":"O" + }, + { + "id":2, + "orth":"I", + "ner":"O" + }, + { + "id":3, + "orth":"poach", + "ner":"O" + }, + { + "id":4, + "orth":"an", + "ner":"O" + }, + { + "id":5, + "orth":"egg", + "ner":"O" + }, + { + "id":6, + "orth":"?", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":7, + "orth":"\n", + "ner":"O" + }, + { + "id":8, + "orth":"What", + "ner":"O" + }, + { + "id":9, + "orth":"'s", + "ner":"O" + }, + { + "id":10, + "orth":"the", + "ner":"O" + }, + { + "id":11, + "orth":"best", + "ner":"O" + }, + { + "id":12, + "orth":"method", + "ner":"O" + }, + { + "id":13, + "orth":"to", + "ner":"O" + }, + { + "id":14, + "orth":"poach", + "ner":"O" + }, + { + "id":15, + "orth":"an", + "ner":"O" + }, + { + "id":16, + "orth":"egg", + "ner":"O" + }, + { + "id":17, + "orth":"without", + "ner":"O" + }, + { + "id":18, + "orth":"it", + "ner":"O" + }, + { + "id":19, + "orth":"turning", + "ner":"O" + }, + { + "id":20, + "orth":"into", + "ner":"O" + }, + { + "id":21, + "orth":"an", + 
"ner":"O" + }, + { + "id":22, + "orth":"eggy", + "ner":"O" + }, + { + "id":23, + "orth":"soupy", + "ner":"O" + }, + { + "id":24, + "orth":"mess", + "ner":"O" + }, + { + "id":25, + "orth":"?", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":26, + "orth":"\n", + "ner":"O" + } + ], + "brackets":[ + + ] + } + ], + "cats":[ + { + "label":"baking", + "value":0.0 + }, + { + "label":"not_baking", + "value":1.0 + } + ] + }, + { + "raw":"How can I make my Ice Cream \"creamier\"\nMy ice cream doesn't feel creamy enough. I got the recipe from Good Eats, and I can't tell if it's just the recipe or maybe that I'm just not getting my \"batter\" cold enough before I try to make it (I let it chill overnight in the refrigerator, but it doesn't always come out of the machine looking like \"soft serve\" as he said on the show - it's usually a little thinner).\nRecipe: http://www.foodnetwork.com/recipes/alton-brown/serious-vanilla-ice-cream-recipe/index.html\nThanks!\n", + "sentences":[ + { + "tokens":[ + { + "id":0, + "orth":"How", + "ner":"O" + }, + { + "id":1, + "orth":"can", + "ner":"O" + }, + { + "id":2, + "orth":"I", + "ner":"O" + }, + { + "id":3, + "orth":"make", + "ner":"O" + }, + { + "id":4, + "orth":"my", + "ner":"O" + }, + { + "id":5, + "orth":"Ice", + "ner":"O" + }, + { + "id":6, + "orth":"Cream", + "ner":"O" + }, + { + "id":7, + "orth":"\"", + "ner":"O" + }, + { + "id":8, + "orth":"creamier", + "ner":"O" + }, + { + "id":9, + "orth":"\"", + "ner":"O" + }, + { + "id":10, + "orth":"\n", + "ner":"O" + }, + { + "id":11, + "orth":"My", + "ner":"O" + }, + { + "id":12, + "orth":"ice", + "ner":"O" + }, + { + "id":13, + "orth":"cream", + "ner":"O" + }, + { + "id":14, + "orth":"does", + "ner":"O" + }, + { + "id":15, + "orth":"n't", + "ner":"O" + }, + { + "id":16, + "orth":"feel", + "ner":"O" + }, + { + "id":17, + "orth":"creamy", + "ner":"O" + }, + { + "id":18, + "orth":"enough", + "ner":"O" + }, + { + "id":19, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":20, + "orth":" ", + "ner":"O" + }, + { + "id":21, + "orth":"I", + "ner":"O" + }, + { + "id":22, + "orth":"got", + "ner":"O" + }, + { + "id":23, + "orth":"the", + "ner":"O" + }, + { + "id":24, + "orth":"recipe", + "ner":"O" + }, + { + "id":25, + "orth":"from", + "ner":"O" + }, + { + "id":26, + "orth":"Good", + "ner":"O" + }, + { + "id":27, + "orth":"Eats", + "ner":"O" + }, + { + "id":28, + "orth":",", + "ner":"O" + }, + { + "id":29, + "orth":"and", + "ner":"O" + }, + { + "id":30, + "orth":"I", + "ner":"O" + }, + { + "id":31, + "orth":"ca", + "ner":"O" + }, + { + "id":32, + "orth":"n't", + "ner":"O" + }, + { + "id":33, + "orth":"tell", + "ner":"O" + }, + { + "id":34, + "orth":"if", + "ner":"O" + }, + { + "id":35, + "orth":"it", + "ner":"O" + }, + { + "id":36, + "orth":"'s", + "ner":"O" + }, + { + "id":37, + "orth":"just", + "ner":"O" + }, + { + "id":38, + "orth":"the", + "ner":"O" + }, + { + "id":39, + "orth":"recipe", + "ner":"O" + }, + { + "id":40, + "orth":"or", + "ner":"O" + }, + { + "id":41, + "orth":"maybe", + "ner":"O" + }, + { + "id":42, + "orth":"that", + "ner":"O" + }, + { + "id":43, + "orth":"I", + "ner":"O" + }, + { + "id":44, + "orth":"'m", + "ner":"O" + }, + { + "id":45, + "orth":"just", + "ner":"O" + }, + { + "id":46, + "orth":"not", + "ner":"O" + }, + { + "id":47, + "orth":"getting", + "ner":"O" + }, + { + "id":48, + "orth":"my", + "ner":"O" + }, + { + "id":49, + "orth":"\"", + "ner":"O" + }, + { + "id":50, + "orth":"batter", + "ner":"O" + }, + { + "id":51, + "orth":"\"", + 
"ner":"O" + }, + { + "id":52, + "orth":"cold", + "ner":"O" + }, + { + "id":53, + "orth":"enough", + "ner":"O" + }, + { + "id":54, + "orth":"before", + "ner":"O" + }, + { + "id":55, + "orth":"I", + "ner":"O" + }, + { + "id":56, + "orth":"try", + "ner":"O" + }, + { + "id":57, + "orth":"to", + "ner":"O" + }, + { + "id":58, + "orth":"make", + "ner":"O" + }, + { + "id":59, + "orth":"it", + "ner":"O" + }, + { + "id":60, + "orth":"(", + "ner":"O" + }, + { + "id":61, + "orth":"I", + "ner":"O" + }, + { + "id":62, + "orth":"let", + "ner":"O" + }, + { + "id":63, + "orth":"it", + "ner":"O" + }, + { + "id":64, + "orth":"chill", + "ner":"O" + }, + { + "id":65, + "orth":"overnight", + "ner":"O" + }, + { + "id":66, + "orth":"in", + "ner":"O" + }, + { + "id":67, + "orth":"the", + "ner":"O" + }, + { + "id":68, + "orth":"refrigerator", + "ner":"O" + }, + { + "id":69, + "orth":",", + "ner":"O" + }, + { + "id":70, + "orth":"but", + "ner":"O" + }, + { + "id":71, + "orth":"it", + "ner":"O" + }, + { + "id":72, + "orth":"does", + "ner":"O" + }, + { + "id":73, + "orth":"n't", + "ner":"O" + }, + { + "id":74, + "orth":"always", + "ner":"O" + }, + { + "id":75, + "orth":"come", + "ner":"O" + }, + { + "id":76, + "orth":"out", + "ner":"O" + }, + { + "id":77, + "orth":"of", + "ner":"O" + }, + { + "id":78, + "orth":"the", + "ner":"O" + }, + { + "id":79, + "orth":"machine", + "ner":"O" + }, + { + "id":80, + "orth":"looking", + "ner":"O" + }, + { + "id":81, + "orth":"like", + "ner":"O" + }, + { + "id":82, + "orth":"\"", + "ner":"O" + }, + { + "id":83, + "orth":"soft", + "ner":"O" + }, + { + "id":84, + "orth":"serve", + "ner":"O" + }, + { + "id":85, + "orth":"\"", + "ner":"O" + }, + { + "id":86, + "orth":"as", + "ner":"O" + }, + { + "id":87, + "orth":"he", + "ner":"O" + }, + { + "id":88, + "orth":"said", + "ner":"O" + }, + { + "id":89, + "orth":"on", + "ner":"O" + }, + { + "id":90, + "orth":"the", + "ner":"O" + }, + { + "id":91, + "orth":"show", + "ner":"O" + }, + { + "id":92, + "orth":"-", + "ner":"O" + }, + { + "id":93, + "orth":"it", + "ner":"O" + }, + { + "id":94, + "orth":"'s", + "ner":"O" + }, + { + "id":95, + "orth":"usually", + "ner":"O" + }, + { + "id":96, + "orth":"a", + "ner":"O" + }, + { + "id":97, + "orth":"little", + "ner":"O" + }, + { + "id":98, + "orth":"thinner", + "ner":"O" + }, + { + "id":99, + "orth":")", + "ner":"O" + }, + { + "id":100, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":101, + "orth":"\n", + "ner":"O" + }, + { + "id":102, + "orth":"Recipe", + "ner":"O" + }, + { + "id":103, + "orth":":", + "ner":"O" + }, + { + "id":104, + "orth":"http://www.foodnetwork.com/recipes/alton-brown/serious-vanilla-ice-cream-recipe/index.html", + "ner":"O" + }, + { + "id":105, + "orth":"\n", + "ner":"O" + }, + { + "id":106, + "orth":"Thanks", + "ner":"O" + }, + { + "id":107, + "orth":"!", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":108, + "orth":"\n", + "ner":"O" + } + ], + "brackets":[ + + ] + } + ], + "cats":[ + { + "label":"baking", + "value":0.0 + }, + { + "label":"not_baking", + "value":1.0 + } + ] + }, + { + "raw":"How long and at what temperature do the various parts of a chicken need to be cooked?\nI'm interested in baking thighs, legs, breasts and wings. 
How long do each of these items need to bake and at what temperature?\n", + "sentences":[ + { + "tokens":[ + { + "id":0, + "orth":"How", + "ner":"O" + }, + { + "id":1, + "orth":"long", + "ner":"O" + }, + { + "id":2, + "orth":"and", + "ner":"O" + }, + { + "id":3, + "orth":"at", + "ner":"O" + }, + { + "id":4, + "orth":"what", + "ner":"O" + }, + { + "id":5, + "orth":"temperature", + "ner":"O" + }, + { + "id":6, + "orth":"do", + "ner":"O" + }, + { + "id":7, + "orth":"the", + "ner":"O" + }, + { + "id":8, + "orth":"various", + "ner":"O" + }, + { + "id":9, + "orth":"parts", + "ner":"O" + }, + { + "id":10, + "orth":"of", + "ner":"O" + }, + { + "id":11, + "orth":"a", + "ner":"O" + }, + { + "id":12, + "orth":"chicken", + "ner":"O" + }, + { + "id":13, + "orth":"need", + "ner":"O" + }, + { + "id":14, + "orth":"to", + "ner":"O" + }, + { + "id":15, + "orth":"be", + "ner":"O" + }, + { + "id":16, + "orth":"cooked", + "ner":"O" + }, + { + "id":17, + "orth":"?", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":18, + "orth":"\n", + "ner":"O" + }, + { + "id":19, + "orth":"I", + "ner":"O" + }, + { + "id":20, + "orth":"'m", + "ner":"O" + }, + { + "id":21, + "orth":"interested", + "ner":"O" + }, + { + "id":22, + "orth":"in", + "ner":"O" + }, + { + "id":23, + "orth":"baking", + "ner":"O" + }, + { + "id":24, + "orth":"thighs", + "ner":"O" + }, + { + "id":25, + "orth":",", + "ner":"O" + }, + { + "id":26, + "orth":"legs", + "ner":"O" + }, + { + "id":27, + "orth":",", + "ner":"O" + }, + { + "id":28, + "orth":"breasts", + "ner":"O" + }, + { + "id":29, + "orth":"and", + "ner":"O" + }, + { + "id":30, + "orth":"wings", + "ner":"O" + }, + { + "id":31, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":32, + "orth":" ", + "ner":"O" + }, + { + "id":33, + "orth":"How", + "ner":"O" + }, + { + "id":34, + "orth":"long", + "ner":"O" + }, + { + "id":35, + "orth":"do", + "ner":"O" + }, + { + "id":36, + "orth":"each", + "ner":"O" + }, + { + "id":37, + "orth":"of", + "ner":"O" + }, + { + "id":38, + "orth":"these", + "ner":"O" + }, + { + "id":39, + "orth":"items", + "ner":"O" + }, + { + "id":40, + "orth":"need", + "ner":"O" + }, + { + "id":41, + "orth":"to", + "ner":"O" + }, + { + "id":42, + "orth":"bake", + "ner":"O" + }, + { + "id":43, + "orth":"and", + "ner":"O" + }, + { + "id":44, + "orth":"at", + "ner":"O" + }, + { + "id":45, + "orth":"what", + "ner":"O" + }, + { + "id":46, + "orth":"temperature", + "ner":"O" + }, + { + "id":47, + "orth":"?", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":48, + "orth":"\n", + "ner":"O" + } + ], + "brackets":[ + + ] + } + ], + "cats":[ + { + "label":"baking", + "value":1.0 + }, + { + "label":"not_baking", + "value":0.0 + } + ] + }, + { + "raw":"Do I need to sift flour that is labeled sifted?\nIs there really an advantage to sifting flour that I bought that was labeled 'sifted'?\n", + "sentences":[ + { + "tokens":[ + { + "id":0, + "orth":"Do", + "ner":"O" + }, + { + "id":1, + "orth":"I", + "ner":"O" + }, + { + "id":2, + "orth":"need", + "ner":"O" + }, + { + "id":3, + "orth":"to", + "ner":"O" + }, + { + "id":4, + "orth":"sift", + "ner":"O" + }, + { + "id":5, + "orth":"flour", + "ner":"O" + }, + { + "id":6, + "orth":"that", + "ner":"O" + }, + { + "id":7, + "orth":"is", + "ner":"O" + }, + { + "id":8, + "orth":"labeled", + "ner":"O" + }, + { + "id":9, + "orth":"sifted", + "ner":"O" + }, + { + "id":10, + "orth":"?", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":11, + "orth":"\n", + 
"ner":"O" + }, + { + "id":12, + "orth":"Is", + "ner":"O" + }, + { + "id":13, + "orth":"there", + "ner":"O" + }, + { + "id":14, + "orth":"really", + "ner":"O" + }, + { + "id":15, + "orth":"an", + "ner":"O" + }, + { + "id":16, + "orth":"advantage", + "ner":"O" + }, + { + "id":17, + "orth":"to", + "ner":"O" + }, + { + "id":18, + "orth":"sifting", + "ner":"O" + }, + { + "id":19, + "orth":"flour", + "ner":"O" + }, + { + "id":20, + "orth":"that", + "ner":"O" + }, + { + "id":21, + "orth":"I", + "ner":"O" + }, + { + "id":22, + "orth":"bought", + "ner":"O" + }, + { + "id":23, + "orth":"that", + "ner":"O" + }, + { + "id":24, + "orth":"was", + "ner":"O" + }, + { + "id":25, + "orth":"labeled", + "ner":"O" + }, + { + "id":26, + "orth":"'", + "ner":"O" + }, + { + "id":27, + "orth":"sifted", + "ner":"O" + }, + { + "id":28, + "orth":"'", + "ner":"O" + }, + { + "id":29, + "orth":"?", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":30, + "orth":"\n", + "ner":"O" + } + ], + "brackets":[ + + ] + } + ], + "cats":[ + { + "label":"baking", + "value":1.0 + }, + { + "label":"not_baking", + "value":0.0 + } + ] + } + ] + } +] \ No newline at end of file diff --git a/examples/training/textcat_example_data/cooking.jsonl b/examples/training/textcat_example_data/cooking.jsonl new file mode 100644 index 000000000..cfdc9be87 --- /dev/null +++ b/examples/training/textcat_example_data/cooking.jsonl @@ -0,0 +1,10 @@ +{"cats": {"baking": 0.0, "not_baking": 1.0}, "meta": {"id": "2"}, "text": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven by laying the strips out on a cookie sheet. When using this method, how long should I cook the bacon for, and at what temperature?\n"} +{"cats": {"baking": 0.0, "not_baking": 1.0}, "meta": {"id": "3"}, "text": "What is the difference between white and brown eggs?\nI always use brown extra large eggs, but I can't honestly say why I do this other than habit at this point. Are there any distinct advantages or disadvantages like flavor, shelf life, etc?\n"} +{"cats": {"baking": 0.0, "not_baking": 1.0}, "meta": {"id": "4"}, "text": "What is the difference between baking soda and baking powder?\nAnd can I use one in place of the other in certain recipes?\n"} +{"cats": {"baking": 0.0, "not_baking": 1.0}, "meta": {"id": "5"}, "text": "In a tomato sauce recipe, how can I cut the acidity?\nIt seems that every time I make a tomato sauce for pasta, the sauce is a little bit too acid for my taste. I've tried using sugar or sodium bicarbonate, but I'm not satisfied with the results.\n"} +{"cats": {"baking": 0.0, "not_baking": 1.0}, "meta": {"id": "6"}, "text": "What ingredients (available in specific regions) can I substitute for parsley?\nI have a recipe that calls for fresh parsley. I have substituted other fresh herbs for their dried equivalents but I don't have fresh or dried parsley. Is there something else (ex another dried herb) that I can use instead of parsley?\nI know it is used mainly for looks rather than taste but I have a pasta recipe that calls for 2 tablespoons of parsley in the sauce and then another 2 tablespoons on top when it is done. 
I know the parsley on top is more for looks but there must be something about the taste otherwise it would call for parsley within the sauce as well.\nI would especially like to hear about substitutes available in Southeast Asia and other parts of the world where the obvious answers (such as cilantro) are not widely available.\n"} +{"cats": {"baking": 0.0, "not_baking": 1.0}, "meta": {"id": "9"}, "text": "What is the internal temperature a steak should be cooked to for Rare/Medium Rare/Medium/Well?\nI'd like to know when to take my steaks off the grill and please everybody.\n"} +{"cats": {"baking": 0.0, "not_baking": 1.0}, "meta": {"id": "11"}, "text": "How should I poach an egg?\nWhat's the best method to poach an egg without it turning into an eggy soupy mess?\n"} +{"cats": {"baking": 0.0, "not_baking": 1.0}, "meta": {"id": "12"}, "text": "How can I make my Ice Cream \"creamier\"\nMy ice cream doesn't feel creamy enough. I got the recipe from Good Eats, and I can't tell if it's just the recipe or maybe that I'm just not getting my \"batter\" cold enough before I try to make it (I let it chill overnight in the refrigerator, but it doesn't always come out of the machine looking like \"soft serve\" as he said on the show - it's usually a little thinner).\nRecipe: http://www.foodnetwork.com/recipes/alton-brown/serious-vanilla-ice-cream-recipe/index.html\nThanks!\n"} +{"cats": {"baking": 1.0, "not_baking": 0.0}, "meta": {"id": "17"}, "text": "How long and at what temperature do the various parts of a chicken need to be cooked?\nI'm interested in baking thighs, legs, breasts and wings. How long do each of these items need to bake and at what temperature?\n"} +{"cats": {"baking": 1.0, "not_baking": 0.0}, "meta": {"id": "27"}, "text": "Do I need to sift flour that is labeled sifted?\nIs there really an advantage to sifting flour that I bought that was labeled 'sifted'?\n"} diff --git a/examples/training/textcat_example_data/jigsaw-toxic-comment.json b/examples/training/textcat_example_data/jigsaw-toxic-comment.json new file mode 100644 index 000000000..0c8d8f8e0 --- /dev/null +++ b/examples/training/textcat_example_data/jigsaw-toxic-comment.json @@ -0,0 +1,2987 @@ +[ + { + "id":0, + "paragraphs":[ + { + "raw":"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. 
And please don't remove the template from the talk page since I'm retired now.89.205.38.27", + "sentences":[ + { + "tokens":[ + { + "id":0, + "orth":"Explanation", + "ner":"O" + }, + { + "id":1, + "orth":"\n", + "ner":"O" + }, + { + "id":2, + "orth":"Why", + "ner":"O" + }, + { + "id":3, + "orth":"the", + "ner":"O" + }, + { + "id":4, + "orth":"edits", + "ner":"O" + }, + { + "id":5, + "orth":"made", + "ner":"O" + }, + { + "id":6, + "orth":"under", + "ner":"O" + }, + { + "id":7, + "orth":"my", + "ner":"O" + }, + { + "id":8, + "orth":"username", + "ner":"O" + }, + { + "id":9, + "orth":"Hardcore", + "ner":"O" + }, + { + "id":10, + "orth":"Metallica", + "ner":"O" + }, + { + "id":11, + "orth":"Fan", + "ner":"O" + }, + { + "id":12, + "orth":"were", + "ner":"O" + }, + { + "id":13, + "orth":"reverted", + "ner":"O" + }, + { + "id":14, + "orth":"?", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":15, + "orth":"They", + "ner":"O" + }, + { + "id":16, + "orth":"were", + "ner":"O" + }, + { + "id":17, + "orth":"n't", + "ner":"O" + }, + { + "id":18, + "orth":"vandalisms", + "ner":"O" + }, + { + "id":19, + "orth":",", + "ner":"O" + }, + { + "id":20, + "orth":"just", + "ner":"O" + }, + { + "id":21, + "orth":"closure", + "ner":"O" + }, + { + "id":22, + "orth":"on", + "ner":"O" + }, + { + "id":23, + "orth":"some", + "ner":"O" + }, + { + "id":24, + "orth":"GAs", + "ner":"O" + }, + { + "id":25, + "orth":"after", + "ner":"O" + }, + { + "id":26, + "orth":"I", + "ner":"O" + }, + { + "id":27, + "orth":"voted", + "ner":"O" + }, + { + "id":28, + "orth":"at", + "ner":"O" + }, + { + "id":29, + "orth":"New", + "ner":"O" + }, + { + "id":30, + "orth":"York", + "ner":"O" + }, + { + "id":31, + "orth":"Dolls", + "ner":"O" + }, + { + "id":32, + "orth":"FAC", + "ner":"O" + }, + { + "id":33, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":34, + "orth":"And", + "ner":"O" + }, + { + "id":35, + "orth":"please", + "ner":"O" + }, + { + "id":36, + "orth":"do", + "ner":"O" + }, + { + "id":37, + "orth":"n't", + "ner":"O" + }, + { + "id":38, + "orth":"remove", + "ner":"O" + }, + { + "id":39, + "orth":"the", + "ner":"O" + }, + { + "id":40, + "orth":"template", + "ner":"O" + }, + { + "id":41, + "orth":"from", + "ner":"O" + }, + { + "id":42, + "orth":"the", + "ner":"O" + }, + { + "id":43, + "orth":"talk", + "ner":"O" + }, + { + "id":44, + "orth":"page", + "ner":"O" + }, + { + "id":45, + "orth":"since", + "ner":"O" + }, + { + "id":46, + "orth":"I", + "ner":"O" + }, + { + "id":47, + "orth":"'m", + "ner":"O" + }, + { + "id":48, + "orth":"retired", + "ner":"O" + }, + { + "id":49, + "orth":"now.89.205.38.27", + "ner":"O" + } + ], + "brackets":[ + + ] + } + ], + "cats":[ + { + "label":"insult", + "value":0 + }, + { + "label":"obscene", + "value":0 + }, + { + "label":"severe_toxic", + "value":0 + }, + { + "label":"toxic", + "value":0 + } + ] + }, + { + "raw":"I'm Sorry \n\nI'm sorry I screwed around with someones talk page. It was very bad to do. I know how having the templates on their talk page helps you assert your dominance over them. I know I should bow down to the almighty administrators. But then again, I'm going to go play outside....with your mom. 
76.122.79.82", + "sentences":[ + { + "tokens":[ + { + "id":0, + "orth":"I", + "ner":"O" + }, + { + "id":1, + "orth":"'m", + "ner":"O" + }, + { + "id":2, + "orth":"Sorry", + "ner":"O" + }, + { + "id":3, + "orth":"\n\n", + "ner":"O" + }, + { + "id":4, + "orth":"I", + "ner":"O" + }, + { + "id":5, + "orth":"'m", + "ner":"O" + }, + { + "id":6, + "orth":"sorry", + "ner":"O" + }, + { + "id":7, + "orth":"I", + "ner":"O" + }, + { + "id":8, + "orth":"screwed", + "ner":"O" + }, + { + "id":9, + "orth":"around", + "ner":"O" + }, + { + "id":10, + "orth":"with", + "ner":"O" + }, + { + "id":11, + "orth":"someones", + "ner":"O" + }, + { + "id":12, + "orth":"talk", + "ner":"O" + }, + { + "id":13, + "orth":"page", + "ner":"O" + }, + { + "id":14, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":15, + "orth":" ", + "ner":"O" + }, + { + "id":16, + "orth":"It", + "ner":"O" + }, + { + "id":17, + "orth":"was", + "ner":"O" + }, + { + "id":18, + "orth":"very", + "ner":"O" + }, + { + "id":19, + "orth":"bad", + "ner":"O" + }, + { + "id":20, + "orth":"to", + "ner":"O" + }, + { + "id":21, + "orth":"do", + "ner":"O" + }, + { + "id":22, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":23, + "orth":" ", + "ner":"O" + }, + { + "id":24, + "orth":"I", + "ner":"O" + }, + { + "id":25, + "orth":"know", + "ner":"O" + }, + { + "id":26, + "orth":"how", + "ner":"O" + }, + { + "id":27, + "orth":"having", + "ner":"O" + }, + { + "id":28, + "orth":"the", + "ner":"O" + }, + { + "id":29, + "orth":"templates", + "ner":"O" + }, + { + "id":30, + "orth":"on", + "ner":"O" + }, + { + "id":31, + "orth":"their", + "ner":"O" + }, + { + "id":32, + "orth":"talk", + "ner":"O" + }, + { + "id":33, + "orth":"page", + "ner":"O" + }, + { + "id":34, + "orth":"helps", + "ner":"O" + }, + { + "id":35, + "orth":"you", + "ner":"O" + }, + { + "id":36, + "orth":"assert", + "ner":"O" + }, + { + "id":37, + "orth":"your", + "ner":"O" + }, + { + "id":38, + "orth":"dominance", + "ner":"O" + }, + { + "id":39, + "orth":"over", + "ner":"O" + }, + { + "id":40, + "orth":"them", + "ner":"O" + }, + { + "id":41, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":42, + "orth":" ", + "ner":"O" + }, + { + "id":43, + "orth":"I", + "ner":"O" + }, + { + "id":44, + "orth":"know", + "ner":"O" + }, + { + "id":45, + "orth":"I", + "ner":"O" + }, + { + "id":46, + "orth":"should", + "ner":"O" + }, + { + "id":47, + "orth":"bow", + "ner":"O" + }, + { + "id":48, + "orth":"down", + "ner":"O" + }, + { + "id":49, + "orth":"to", + "ner":"O" + }, + { + "id":50, + "orth":"the", + "ner":"O" + }, + { + "id":51, + "orth":"almighty", + "ner":"O" + }, + { + "id":52, + "orth":"administrators", + "ner":"O" + }, + { + "id":53, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":54, + "orth":" ", + "ner":"O" + }, + { + "id":55, + "orth":"But", + "ner":"O" + }, + { + "id":56, + "orth":"then", + "ner":"O" + }, + { + "id":57, + "orth":"again", + "ner":"O" + }, + { + "id":58, + "orth":",", + "ner":"O" + }, + { + "id":59, + "orth":"I", + "ner":"O" + }, + { + "id":60, + "orth":"'m", + "ner":"O" + }, + { + "id":61, + "orth":"going", + "ner":"O" + }, + { + "id":62, + "orth":"to", + "ner":"O" + }, + { + "id":63, + "orth":"go", + "ner":"O" + }, + { + "id":64, + "orth":"play", + "ner":"O" + }, + { + "id":65, + "orth":"outside", + "ner":"O" + }, + { + "id":66, + "orth":"....", + "ner":"O" + }, + { + "id":67, + "orth":"with", + "ner":"O" + }, + { + "id":68, + 
"orth":"your", + "ner":"O" + }, + { + "id":69, + "orth":"mom", + "ner":"O" + }, + { + "id":70, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":71, + "orth":" ", + "ner":"O" + }, + { + "id":72, + "orth":"76.122.79.82", + "ner":"O" + } + ], + "brackets":[ + + ] + } + ], + "cats":[ + { + "label":"insult", + "value":0 + }, + { + "label":"obscene", + "value":0 + }, + { + "label":"severe_toxic", + "value":0 + }, + { + "label":"toxic", + "value":1 + } + ] + }, + { + "raw":"Stupid peace of shit stop deleting my stuff asshole go die and fall in a hole go to hell!", + "sentences":[ + { + "tokens":[ + { + "id":0, + "orth":"Stupid", + "ner":"O" + }, + { + "id":1, + "orth":"peace", + "ner":"O" + }, + { + "id":2, + "orth":"of", + "ner":"O" + }, + { + "id":3, + "orth":"shit", + "ner":"O" + }, + { + "id":4, + "orth":"stop", + "ner":"O" + }, + { + "id":5, + "orth":"deleting", + "ner":"O" + }, + { + "id":6, + "orth":"my", + "ner":"O" + }, + { + "id":7, + "orth":"stuff", + "ner":"O" + }, + { + "id":8, + "orth":"asshole", + "ner":"O" + }, + { + "id":9, + "orth":"go", + "ner":"O" + }, + { + "id":10, + "orth":"die", + "ner":"O" + }, + { + "id":11, + "orth":"and", + "ner":"O" + }, + { + "id":12, + "orth":"fall", + "ner":"O" + }, + { + "id":13, + "orth":"in", + "ner":"O" + }, + { + "id":14, + "orth":"a", + "ner":"O" + }, + { + "id":15, + "orth":"hole", + "ner":"O" + }, + { + "id":16, + "orth":"go", + "ner":"O" + }, + { + "id":17, + "orth":"to", + "ner":"O" + }, + { + "id":18, + "orth":"hell", + "ner":"O" + }, + { + "id":19, + "orth":"!", + "ner":"O" + } + ], + "brackets":[ + + ] + } + ], + "cats":[ + { + "label":"insult", + "value":1 + }, + { + "label":"obscene", + "value":1 + }, + { + "label":"severe_toxic", + "value":1 + }, + { + "label":"toxic", + "value":1 + } + ] + }, + { + "raw":"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. 
(talk) 21:51, January 11, 2016 (UTC)", + "sentences":[ + { + "tokens":[ + { + "id":0, + "orth":"D'aww", + "ner":"O" + }, + { + "id":1, + "orth":"!", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":2, + "orth":"He", + "ner":"O" + }, + { + "id":3, + "orth":"matches", + "ner":"O" + }, + { + "id":4, + "orth":"this", + "ner":"O" + }, + { + "id":5, + "orth":"background", + "ner":"O" + }, + { + "id":6, + "orth":"colour", + "ner":"O" + }, + { + "id":7, + "orth":"I", + "ner":"O" + }, + { + "id":8, + "orth":"'m", + "ner":"O" + }, + { + "id":9, + "orth":"seemingly", + "ner":"O" + }, + { + "id":10, + "orth":"stuck", + "ner":"O" + }, + { + "id":11, + "orth":"with", + "ner":"O" + }, + { + "id":12, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":13, + "orth":"Thanks", + "ner":"O" + }, + { + "id":14, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":15, + "orth":" ", + "ner":"O" + }, + { + "id":16, + "orth":"(", + "ner":"O" + }, + { + "id":17, + "orth":"talk", + "ner":"O" + }, + { + "id":18, + "orth":")", + "ner":"O" + }, + { + "id":19, + "orth":"21:51", + "ner":"O" + }, + { + "id":20, + "orth":",", + "ner":"O" + }, + { + "id":21, + "orth":"January", + "ner":"O" + }, + { + "id":22, + "orth":"11", + "ner":"O" + }, + { + "id":23, + "orth":",", + "ner":"O" + }, + { + "id":24, + "orth":"2016", + "ner":"O" + }, + { + "id":25, + "orth":"(", + "ner":"O" + }, + { + "id":26, + "orth":"UTC", + "ner":"O" + }, + { + "id":27, + "orth":")", + "ner":"O" + } + ], + "brackets":[ + + ] + } + ], + "cats":[ + { + "label":"insult", + "value":0 + }, + { + "label":"obscene", + "value":0 + }, + { + "label":"severe_toxic", + "value":0 + }, + { + "label":"toxic", + "value":0 + } + ] + }, + { + "raw":"GET FUCKED UP. GET FUCKEEED UP. GOT A DRINK THAT YOU CANT PUT DOWN???/ GET FUCK UP GET FUCKED UP. 
I'M FUCKED UP RIGHT NOW!", + "sentences":[ + { + "tokens":[ + { + "id":0, + "orth":"GET", + "ner":"O" + }, + { + "id":1, + "orth":"FUCKED", + "ner":"O" + }, + { + "id":2, + "orth":"UP", + "ner":"O" + }, + { + "id":3, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":4, + "orth":"GET", + "ner":"O" + }, + { + "id":5, + "orth":"FUCKEEED", + "ner":"O" + }, + { + "id":6, + "orth":"UP", + "ner":"O" + }, + { + "id":7, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":8, + "orth":" ", + "ner":"O" + }, + { + "id":9, + "orth":"GOT", + "ner":"O" + }, + { + "id":10, + "orth":"A", + "ner":"O" + }, + { + "id":11, + "orth":"DRINK", + "ner":"O" + }, + { + "id":12, + "orth":"THAT", + "ner":"O" + }, + { + "id":13, + "orth":"YOU", + "ner":"O" + }, + { + "id":14, + "orth":"CANT", + "ner":"O" + }, + { + "id":15, + "orth":"PUT", + "ner":"O" + }, + { + "id":16, + "orth":"DOWN???/", + "ner":"O" + }, + { + "id":17, + "orth":"GET", + "ner":"O" + }, + { + "id":18, + "orth":"FUCK", + "ner":"O" + }, + { + "id":19, + "orth":"UP", + "ner":"O" + }, + { + "id":20, + "orth":"GET", + "ner":"O" + }, + { + "id":21, + "orth":"FUCKED", + "ner":"O" + }, + { + "id":22, + "orth":"UP", + "ner":"O" + }, + { + "id":23, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":24, + "orth":" ", + "ner":"O" + }, + { + "id":25, + "orth":"I'M", + "ner":"O" + }, + { + "id":26, + "orth":"FUCKED", + "ner":"O" + }, + { + "id":27, + "orth":"UP", + "ner":"O" + }, + { + "id":28, + "orth":"RIGHT", + "ner":"O" + }, + { + "id":29, + "orth":"NOW", + "ner":"O" + }, + { + "id":30, + "orth":"!", + "ner":"O" + } + ], + "brackets":[ + + ] + } + ], + "cats":[ + { + "label":"insult", + "value":0 + }, + { + "label":"obscene", + "value":1 + }, + { + "label":"severe_toxic", + "value":0 + }, + { + "label":"toxic", + "value":1 + } + ] + }, + { + "raw":"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. 
He seems to care more about the formatting than the actual info.", + "sentences":[ + { + "tokens":[ + { + "id":0, + "orth":"Hey", + "ner":"O" + }, + { + "id":1, + "orth":"man", + "ner":"O" + }, + { + "id":2, + "orth":",", + "ner":"O" + }, + { + "id":3, + "orth":"I", + "ner":"O" + }, + { + "id":4, + "orth":"'m", + "ner":"O" + }, + { + "id":5, + "orth":"really", + "ner":"O" + }, + { + "id":6, + "orth":"not", + "ner":"O" + }, + { + "id":7, + "orth":"trying", + "ner":"O" + }, + { + "id":8, + "orth":"to", + "ner":"O" + }, + { + "id":9, + "orth":"edit", + "ner":"O" + }, + { + "id":10, + "orth":"war", + "ner":"O" + }, + { + "id":11, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":12, + "orth":"It", + "ner":"O" + }, + { + "id":13, + "orth":"'s", + "ner":"O" + }, + { + "id":14, + "orth":"just", + "ner":"O" + }, + { + "id":15, + "orth":"that", + "ner":"O" + }, + { + "id":16, + "orth":"this", + "ner":"O" + }, + { + "id":17, + "orth":"guy", + "ner":"O" + }, + { + "id":18, + "orth":"is", + "ner":"O" + }, + { + "id":19, + "orth":"constantly", + "ner":"O" + }, + { + "id":20, + "orth":"removing", + "ner":"O" + }, + { + "id":21, + "orth":"relevant", + "ner":"O" + }, + { + "id":22, + "orth":"information", + "ner":"O" + }, + { + "id":23, + "orth":"and", + "ner":"O" + }, + { + "id":24, + "orth":"talking", + "ner":"O" + }, + { + "id":25, + "orth":"to", + "ner":"O" + }, + { + "id":26, + "orth":"me", + "ner":"O" + }, + { + "id":27, + "orth":"through", + "ner":"O" + }, + { + "id":28, + "orth":"edits", + "ner":"O" + }, + { + "id":29, + "orth":"instead", + "ner":"O" + }, + { + "id":30, + "orth":"of", + "ner":"O" + }, + { + "id":31, + "orth":"my", + "ner":"O" + }, + { + "id":32, + "orth":"talk", + "ner":"O" + }, + { + "id":33, + "orth":"page", + "ner":"O" + }, + { + "id":34, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":35, + "orth":"He", + "ner":"O" + }, + { + "id":36, + "orth":"seems", + "ner":"O" + }, + { + "id":37, + "orth":"to", + "ner":"O" + }, + { + "id":38, + "orth":"care", + "ner":"O" + }, + { + "id":39, + "orth":"more", + "ner":"O" + }, + { + "id":40, + "orth":"about", + "ner":"O" + }, + { + "id":41, + "orth":"the", + "ner":"O" + }, + { + "id":42, + "orth":"formatting", + "ner":"O" + }, + { + "id":43, + "orth":"than", + "ner":"O" + }, + { + "id":44, + "orth":"the", + "ner":"O" + }, + { + "id":45, + "orth":"actual", + "ner":"O" + }, + { + "id":46, + "orth":"info", + "ner":"O" + }, + { + "id":47, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + } + ], + "cats":[ + { + "label":"insult", + "value":0 + }, + { + "label":"obscene", + "value":0 + }, + { + "label":"severe_toxic", + "value":0 + }, + { + "label":"toxic", + "value":0 + } + ] + }, + { + "raw":"\"\nMore\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of \"\"types of accidents\"\" -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\n\nThere appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. 
It's listed in the relevant form eg Wikipedia:Good_article_nominations#Transport \"", + "sentences":[ + { + "tokens":[ + { + "id":0, + "orth":"\"", + "ner":"O" + }, + { + "id":1, + "orth":"\n", + "ner":"O" + }, + { + "id":2, + "orth":"More", + "ner":"O" + }, + { + "id":3, + "orth":"\n", + "ner":"O" + }, + { + "id":4, + "orth":"I", + "ner":"O" + }, + { + "id":5, + "orth":"ca", + "ner":"O" + }, + { + "id":6, + "orth":"n't", + "ner":"O" + }, + { + "id":7, + "orth":"make", + "ner":"O" + }, + { + "id":8, + "orth":"any", + "ner":"O" + }, + { + "id":9, + "orth":"real", + "ner":"O" + }, + { + "id":10, + "orth":"suggestions", + "ner":"O" + }, + { + "id":11, + "orth":"on", + "ner":"O" + }, + { + "id":12, + "orth":"improvement", + "ner":"O" + }, + { + "id":13, + "orth":"-", + "ner":"O" + }, + { + "id":14, + "orth":"I", + "ner":"O" + }, + { + "id":15, + "orth":"wondered", + "ner":"O" + }, + { + "id":16, + "orth":"if", + "ner":"O" + }, + { + "id":17, + "orth":"the", + "ner":"O" + }, + { + "id":18, + "orth":"section", + "ner":"O" + }, + { + "id":19, + "orth":"statistics", + "ner":"O" + }, + { + "id":20, + "orth":"should", + "ner":"O" + }, + { + "id":21, + "orth":"be", + "ner":"O" + }, + { + "id":22, + "orth":"later", + "ner":"O" + }, + { + "id":23, + "orth":"on", + "ner":"O" + }, + { + "id":24, + "orth":",", + "ner":"O" + }, + { + "id":25, + "orth":"or", + "ner":"O" + }, + { + "id":26, + "orth":"a", + "ner":"O" + }, + { + "id":27, + "orth":"subsection", + "ner":"O" + }, + { + "id":28, + "orth":"of", + "ner":"O" + }, + { + "id":29, + "orth":"\"", + "ner":"O" + }, + { + "id":30, + "orth":"\"", + "ner":"O" + }, + { + "id":31, + "orth":"types", + "ner":"O" + }, + { + "id":32, + "orth":"of", + "ner":"O" + }, + { + "id":33, + "orth":"accidents", + "ner":"O" + }, + { + "id":34, + "orth":"\"", + "ner":"O" + }, + { + "id":35, + "orth":"\"", + "ner":"O" + }, + { + "id":36, + "orth":" ", + "ner":"O" + }, + { + "id":37, + "orth":"-I", + "ner":"O" + }, + { + "id":38, + "orth":"think", + "ner":"O" + }, + { + "id":39, + "orth":"the", + "ner":"O" + }, + { + "id":40, + "orth":"references", + "ner":"O" + }, + { + "id":41, + "orth":"may", + "ner":"O" + }, + { + "id":42, + "orth":"need", + "ner":"O" + }, + { + "id":43, + "orth":"tidying", + "ner":"O" + }, + { + "id":44, + "orth":"so", + "ner":"O" + }, + { + "id":45, + "orth":"that", + "ner":"O" + }, + { + "id":46, + "orth":"they", + "ner":"O" + }, + { + "id":47, + "orth":"are", + "ner":"O" + }, + { + "id":48, + "orth":"all", + "ner":"O" + }, + { + "id":49, + "orth":"in", + "ner":"O" + }, + { + "id":50, + "orth":"the", + "ner":"O" + }, + { + "id":51, + "orth":"exact", + "ner":"O" + }, + { + "id":52, + "orth":"same", + "ner":"O" + }, + { + "id":53, + "orth":"format", + "ner":"O" + }, + { + "id":54, + "orth":"ie", + "ner":"O" + }, + { + "id":55, + "orth":"date", + "ner":"O" + }, + { + "id":56, + "orth":"format", + "ner":"O" + }, + { + "id":57, + "orth":"etc", + "ner":"O" + }, + { + "id":58, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":59, + "orth":"I", + "ner":"O" + }, + { + "id":60, + "orth":"can", + "ner":"O" + }, + { + "id":61, + "orth":"do", + "ner":"O" + }, + { + "id":62, + "orth":"that", + "ner":"O" + }, + { + "id":63, + "orth":"later", + "ner":"O" + }, + { + "id":64, + "orth":"on", + "ner":"O" + }, + { + "id":65, + "orth":",", + "ner":"O" + }, + { + "id":66, + "orth":"if", + "ner":"O" + }, + { + "id":67, + "orth":"no", + "ner":"O" + }, + { + "id":68, + "orth":"-", + "ner":"O" + }, + { + "id":69, + "orth":"one", + "ner":"O" + 
}, + { + "id":70, + "orth":"else", + "ner":"O" + }, + { + "id":71, + "orth":"does", + "ner":"O" + }, + { + "id":72, + "orth":"first", + "ner":"O" + }, + { + "id":73, + "orth":"-", + "ner":"O" + }, + { + "id":74, + "orth":"if", + "ner":"O" + }, + { + "id":75, + "orth":"you", + "ner":"O" + }, + { + "id":76, + "orth":"have", + "ner":"O" + }, + { + "id":77, + "orth":"any", + "ner":"O" + }, + { + "id":78, + "orth":"preferences", + "ner":"O" + }, + { + "id":79, + "orth":"for", + "ner":"O" + }, + { + "id":80, + "orth":"formatting", + "ner":"O" + }, + { + "id":81, + "orth":"style", + "ner":"O" + }, + { + "id":82, + "orth":"on", + "ner":"O" + }, + { + "id":83, + "orth":"references", + "ner":"O" + }, + { + "id":84, + "orth":"or", + "ner":"O" + }, + { + "id":85, + "orth":"want", + "ner":"O" + }, + { + "id":86, + "orth":"to", + "ner":"O" + }, + { + "id":87, + "orth":"do", + "ner":"O" + }, + { + "id":88, + "orth":"it", + "ner":"O" + }, + { + "id":89, + "orth":"yourself", + "ner":"O" + }, + { + "id":90, + "orth":"please", + "ner":"O" + }, + { + "id":91, + "orth":"let", + "ner":"O" + }, + { + "id":92, + "orth":"me", + "ner":"O" + }, + { + "id":93, + "orth":"know", + "ner":"O" + }, + { + "id":94, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":95, + "orth":"\n\n", + "ner":"O" + }, + { + "id":96, + "orth":"There", + "ner":"O" + }, + { + "id":97, + "orth":"appears", + "ner":"O" + }, + { + "id":98, + "orth":"to", + "ner":"O" + }, + { + "id":99, + "orth":"be", + "ner":"O" + }, + { + "id":100, + "orth":"a", + "ner":"O" + }, + { + "id":101, + "orth":"backlog", + "ner":"O" + }, + { + "id":102, + "orth":"on", + "ner":"O" + }, + { + "id":103, + "orth":"articles", + "ner":"O" + }, + { + "id":104, + "orth":"for", + "ner":"O" + }, + { + "id":105, + "orth":"review", + "ner":"O" + }, + { + "id":106, + "orth":"so", + "ner":"O" + }, + { + "id":107, + "orth":"I", + "ner":"O" + }, + { + "id":108, + "orth":"guess", + "ner":"O" + }, + { + "id":109, + "orth":"there", + "ner":"O" + }, + { + "id":110, + "orth":"may", + "ner":"O" + }, + { + "id":111, + "orth":"be", + "ner":"O" + }, + { + "id":112, + "orth":"a", + "ner":"O" + }, + { + "id":113, + "orth":"delay", + "ner":"O" + }, + { + "id":114, + "orth":"until", + "ner":"O" + }, + { + "id":115, + "orth":"a", + "ner":"O" + }, + { + "id":116, + "orth":"reviewer", + "ner":"O" + }, + { + "id":117, + "orth":"turns", + "ner":"O" + }, + { + "id":118, + "orth":"up", + "ner":"O" + }, + { + "id":119, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":120, + "orth":"It", + "ner":"O" + }, + { + "id":121, + "orth":"'s", + "ner":"O" + }, + { + "id":122, + "orth":"listed", + "ner":"O" + }, + { + "id":123, + "orth":"in", + "ner":"O" + }, + { + "id":124, + "orth":"the", + "ner":"O" + }, + { + "id":125, + "orth":"relevant", + "ner":"O" + }, + { + "id":126, + "orth":"form", + "ner":"O" + }, + { + "id":127, + "orth":"eg", + "ner":"O" + }, + { + "id":128, + "orth":"Wikipedia", + "ner":"O" + }, + { + "id":129, + "orth":":", + "ner":"O" + }, + { + "id":130, + "orth":"Good_article_nominations#Transport", + "ner":"O" + }, + { + "id":131, + "orth":" ", + "ner":"O" + }, + { + "id":132, + "orth":"\"", + "ner":"O" + } + ], + "brackets":[ + + ] + } + ], + "cats":[ + { + "label":"insult", + "value":0 + }, + { + "label":"obscene", + "value":0 + }, + { + "label":"severe_toxic", + "value":0 + }, + { + "label":"toxic", + "value":0 + } + ] + }, + { + "raw":"You, sir, are my hero. 
Any chance you remember what page that's on?", + "sentences":[ + { + "tokens":[ + { + "id":0, + "orth":"You", + "ner":"O" + }, + { + "id":1, + "orth":",", + "ner":"O" + }, + { + "id":2, + "orth":"sir", + "ner":"O" + }, + { + "id":3, + "orth":",", + "ner":"O" + }, + { + "id":4, + "orth":"are", + "ner":"O" + }, + { + "id":5, + "orth":"my", + "ner":"O" + }, + { + "id":6, + "orth":"hero", + "ner":"O" + }, + { + "id":7, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":8, + "orth":"Any", + "ner":"O" + }, + { + "id":9, + "orth":"chance", + "ner":"O" + }, + { + "id":10, + "orth":"you", + "ner":"O" + }, + { + "id":11, + "orth":"remember", + "ner":"O" + }, + { + "id":12, + "orth":"what", + "ner":"O" + }, + { + "id":13, + "orth":"page", + "ner":"O" + }, + { + "id":14, + "orth":"that", + "ner":"O" + }, + { + "id":15, + "orth":"'s", + "ner":"O" + }, + { + "id":16, + "orth":"on", + "ner":"O" + }, + { + "id":17, + "orth":"?", + "ner":"O" + } + ], + "brackets":[ + + ] + } + ], + "cats":[ + { + "label":"insult", + "value":0 + }, + { + "label":"obscene", + "value":0 + }, + { + "label":"severe_toxic", + "value":0 + }, + { + "label":"toxic", + "value":0 + } + ] + }, + { + "raw":"\"\n\nCongratulations from me as well, use the tools well. \u00a0\u00b7 talk \"", + "sentences":[ + { + "tokens":[ + { + "id":0, + "orth":"\"", + "ner":"O" + }, + { + "id":1, + "orth":"\n\n", + "ner":"O" + }, + { + "id":2, + "orth":"Congratulations", + "ner":"O" + }, + { + "id":3, + "orth":"from", + "ner":"O" + }, + { + "id":4, + "orth":"me", + "ner":"O" + }, + { + "id":5, + "orth":"as", + "ner":"O" + }, + { + "id":6, + "orth":"well", + "ner":"O" + }, + { + "id":7, + "orth":",", + "ner":"O" + }, + { + "id":8, + "orth":"use", + "ner":"O" + }, + { + "id":9, + "orth":"the", + "ner":"O" + }, + { + "id":10, + "orth":"tools", + "ner":"O" + }, + { + "id":11, + "orth":"well", + "ner":"O" + }, + { + "id":12, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":13, + "orth":"\u00a0", + "ner":"O" + }, + { + "id":14, + "orth":"\u00b7", + "ner":"O" + }, + { + "id":15, + "orth":"talk", + "ner":"O" + }, + { + "id":16, + "orth":"\"", + "ner":"O" + } + ], + "brackets":[ + + ] + } + ], + "cats":[ + { + "label":"insult", + "value":0 + }, + { + "label":"obscene", + "value":0 + }, + { + "label":"severe_toxic", + "value":0 + }, + { + "label":"toxic", + "value":0 + } + ] + }, + { + "raw":"Why can't you believe how fat Artie is? Did you see him on his recent appearence on the Tonight Show with Jay Leno? He looks absolutely AWFUL! If I had to put money on it, I'd say that Artie Lange is a can't miss candidate for the 2007 Dead pool! \n\n \nKindly keep your malicious fingers off of my above comment, . 
Everytime you remove it, I will repost it!!!", + "sentences":[ + { + "tokens":[ + { + "id":0, + "orth":"Why", + "ner":"O" + }, + { + "id":1, + "orth":"ca", + "ner":"O" + }, + { + "id":2, + "orth":"n't", + "ner":"O" + }, + { + "id":3, + "orth":"you", + "ner":"O" + }, + { + "id":4, + "orth":"believe", + "ner":"O" + }, + { + "id":5, + "orth":"how", + "ner":"O" + }, + { + "id":6, + "orth":"fat", + "ner":"O" + }, + { + "id":7, + "orth":"Artie", + "ner":"O" + }, + { + "id":8, + "orth":"is", + "ner":"O" + }, + { + "id":9, + "orth":"?", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":10, + "orth":"Did", + "ner":"O" + }, + { + "id":11, + "orth":"you", + "ner":"O" + }, + { + "id":12, + "orth":"see", + "ner":"O" + }, + { + "id":13, + "orth":"him", + "ner":"O" + }, + { + "id":14, + "orth":"on", + "ner":"O" + }, + { + "id":15, + "orth":"his", + "ner":"O" + }, + { + "id":16, + "orth":"recent", + "ner":"O" + }, + { + "id":17, + "orth":"appearence", + "ner":"O" + }, + { + "id":18, + "orth":"on", + "ner":"O" + }, + { + "id":19, + "orth":"the", + "ner":"O" + }, + { + "id":20, + "orth":"Tonight", + "ner":"O" + }, + { + "id":21, + "orth":"Show", + "ner":"O" + }, + { + "id":22, + "orth":"with", + "ner":"O" + }, + { + "id":23, + "orth":"Jay", + "ner":"O" + }, + { + "id":24, + "orth":"Leno", + "ner":"O" + }, + { + "id":25, + "orth":"?", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":26, + "orth":"He", + "ner":"O" + }, + { + "id":27, + "orth":"looks", + "ner":"O" + }, + { + "id":28, + "orth":"absolutely", + "ner":"O" + }, + { + "id":29, + "orth":"AWFUL", + "ner":"O" + }, + { + "id":30, + "orth":"!", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":31, + "orth":"If", + "ner":"O" + }, + { + "id":32, + "orth":"I", + "ner":"O" + }, + { + "id":33, + "orth":"had", + "ner":"O" + }, + { + "id":34, + "orth":"to", + "ner":"O" + }, + { + "id":35, + "orth":"put", + "ner":"O" + }, + { + "id":36, + "orth":"money", + "ner":"O" + }, + { + "id":37, + "orth":"on", + "ner":"O" + }, + { + "id":38, + "orth":"it", + "ner":"O" + }, + { + "id":39, + "orth":",", + "ner":"O" + }, + { + "id":40, + "orth":"I", + "ner":"O" + }, + { + "id":41, + "orth":"'d", + "ner":"O" + }, + { + "id":42, + "orth":"say", + "ner":"O" + }, + { + "id":43, + "orth":"that", + "ner":"O" + }, + { + "id":44, + "orth":"Artie", + "ner":"O" + }, + { + "id":45, + "orth":"Lange", + "ner":"O" + }, + { + "id":46, + "orth":"is", + "ner":"O" + }, + { + "id":47, + "orth":"a", + "ner":"O" + }, + { + "id":48, + "orth":"ca", + "ner":"O" + }, + { + "id":49, + "orth":"n't", + "ner":"O" + }, + { + "id":50, + "orth":"miss", + "ner":"O" + }, + { + "id":51, + "orth":"candidate", + "ner":"O" + }, + { + "id":52, + "orth":"for", + "ner":"O" + }, + { + "id":53, + "orth":"the", + "ner":"O" + }, + { + "id":54, + "orth":"2007", + "ner":"O" + }, + { + "id":55, + "orth":"Dead", + "ner":"O" + }, + { + "id":56, + "orth":"pool", + "ner":"O" + }, + { + "id":57, + "orth":"!", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":58, + "orth":" \n\n \n", + "ner":"O" + }, + { + "id":59, + "orth":"Kindly", + "ner":"O" + }, + { + "id":60, + "orth":"keep", + "ner":"O" + }, + { + "id":61, + "orth":"your", + "ner":"O" + }, + { + "id":62, + "orth":"malicious", + "ner":"O" + }, + { + "id":63, + "orth":"fingers", + "ner":"O" + }, + { + "id":64, + "orth":"off", + "ner":"O" + }, + { + "id":65, + "orth":"of", + "ner":"O" + }, + { + "id":66, + "orth":"my", + "ner":"O" + }, + { + "id":67, + "orth":"above", + 
"ner":"O" + }, + { + "id":68, + "orth":"comment", + "ner":"O" + }, + { + "id":69, + "orth":",", + "ner":"O" + }, + { + "id":70, + "orth":".", + "ner":"O" + } + ], + "brackets":[ + + ] + }, + { + "tokens":[ + { + "id":71, + "orth":"Everytime", + "ner":"O" + }, + { + "id":72, + "orth":"you", + "ner":"O" + }, + { + "id":73, + "orth":"remove", + "ner":"O" + }, + { + "id":74, + "orth":"it", + "ner":"O" + }, + { + "id":75, + "orth":",", + "ner":"O" + }, + { + "id":76, + "orth":"I", + "ner":"O" + }, + { + "id":77, + "orth":"will", + "ner":"O" + }, + { + "id":78, + "orth":"repost", + "ner":"O" + }, + { + "id":79, + "orth":"it", + "ner":"O" + }, + { + "id":80, + "orth":"!", + "ner":"O" + }, + { + "id":81, + "orth":"!", + "ner":"O" + }, + { + "id":82, + "orth":"!", + "ner":"O" + } + ], + "brackets":[ + + ] + } + ], + "cats":[ + { + "label":"insult", + "value":0 + }, + { + "label":"obscene", + "value":0 + }, + { + "label":"severe_toxic", + "value":0 + }, + { + "label":"toxic", + "value":1 + } + ] + } + ] + } +] \ No newline at end of file diff --git a/examples/training/textcat_example_data/jigsaw-toxic-comment.jsonl b/examples/training/textcat_example_data/jigsaw-toxic-comment.jsonl new file mode 100644 index 000000000..ac31b6255 --- /dev/null +++ b/examples/training/textcat_example_data/jigsaw-toxic-comment.jsonl @@ -0,0 +1,10 @@ +{"meta": {"id": "0000997932d777bf"}, "text": "Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27", "cats": {"insult": 0, "obscene": 0, "severe_toxic": 0, "toxic": 0}} +{"meta": {"id": "001956c382006abd"}, "text": "I'm Sorry \n\nI'm sorry I screwed around with someones talk page. It was very bad to do. I know how having the templates on their talk page helps you assert your dominance over them. I know I should bow down to the almighty administrators. But then again, I'm going to go play outside....with your mom. 76.122.79.82", "cats": {"insult": 0, "obscene": 0, "severe_toxic": 0, "toxic": 1}} +{"meta": {"id": "0020e7119b96eeeb"}, "text": "Stupid peace of shit stop deleting my stuff asshole go die and fall in a hole go to hell!", "cats": {"insult": 1, "obscene": 1, "severe_toxic": 1, "toxic": 1}} +{"meta": {"id": "000103f0d9cfb60f"}, "text": "D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)", "cats": {"insult": 0, "obscene": 0, "severe_toxic": 0, "toxic": 0}} +{"meta": {"id": "001dc38a83d420cf"}, "text": "GET FUCKED UP. GET FUCKEEED UP. GOT A DRINK THAT YOU CANT PUT DOWN???/ GET FUCK UP GET FUCKED UP. I'M FUCKED UP RIGHT NOW!", "cats": {"insult": 0, "obscene": 1, "severe_toxic": 0, "toxic": 1}} +{"meta": {"id": "000113f07ec002fd"}, "text": "Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.", "cats": {"insult": 0, "obscene": 0, "severe_toxic": 0, "toxic": 0}} +{"meta": {"id": "0001b41b1c6bb37e"}, "text": "\"\nMore\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of \"\"types of accidents\"\" -I think the references may need tidying so that they are all in the exact same format ie date format etc. 
I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\n\nThere appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It's listed in the relevant form eg Wikipedia:Good_article_nominations#Transport \"", "cats": {"insult": 0, "obscene": 0, "severe_toxic": 0, "toxic": 0}} +{"meta": {"id": "0001d958c54c6e35"}, "text": "You, sir, are my hero. Any chance you remember what page that's on?", "cats": {"insult": 0, "obscene": 0, "severe_toxic": 0, "toxic": 0}} +{"meta": {"id": "00025465d4725e87"}, "text": "\"\n\nCongratulations from me as well, use the tools well. · talk \"", "cats": {"insult": 0, "obscene": 0, "severe_toxic": 0, "toxic": 0}} +{"meta": {"id": "002264ea4d5f2887"}, "text": "Why can't you believe how fat Artie is? Did you see him on his recent appearence on the Tonight Show with Jay Leno? He looks absolutely AWFUL! If I had to put money on it, I'd say that Artie Lange is a can't miss candidate for the 2007 Dead pool! \n\n \nKindly keep your malicious fingers off of my above comment, . Everytime you remove it, I will repost it!!!", "cats": {"insult": 0, "obscene": 0, "severe_toxic": 0, "toxic": 1}} diff --git a/examples/training/textcat_example_data/textcatjsonl_to_trainjson.py b/examples/training/textcat_example_data/textcatjsonl_to_trainjson.py new file mode 100644 index 000000000..339ce39be --- /dev/null +++ b/examples/training/textcat_example_data/textcatjsonl_to_trainjson.py @@ -0,0 +1,53 @@ +from pathlib import Path +import plac +import spacy +from spacy.gold import docs_to_json +import srsly +import sys + +@plac.annotations( + model=("Model name. Defaults to 'en'.", "option", "m", str), + input_file=("Input file (jsonl)", "positional", None, Path), + output_dir=("Output directory", "positional", None, Path), + n_texts=("Number of texts to convert", "option", "t", int), +) +def convert(model='en', input_file=None, output_dir=None, n_texts=0): + # Load model with tokenizer + sentencizer only + nlp = spacy.load(model) + nlp.disable_pipes(*nlp.pipe_names) + sentencizer = nlp.create_pipe("sentencizer") + nlp.add_pipe(sentencizer, first=True) + + texts = [] + cats = [] + count = 0 + + if not input_file.exists(): + print("Input file not found:", input_file) + sys.exit(1) + else: + with open(input_file) as fileh: + for line in fileh: + data = srsly.json_loads(line) + texts.append(data["text"]) + cats.append(data["cats"]) + + if output_dir is not None: + output_dir = Path(output_dir) + if not output_dir.exists(): + output_dir.mkdir() + else: + output_dir = Path(".") + + docs = [] + for i, doc in enumerate(nlp.pipe(texts)): + doc.cats = cats[i] + docs.append(doc) + if n_texts > 0 and count == n_texts: + break + count += 1 + + srsly.write_json(output_dir / input_file.with_suffix(".json"), [docs_to_json(docs)]) + +if __name__ == "__main__": + plac.call(convert) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index aac4d5b97..979010533 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -270,7 +270,7 @@ def debug_data( if "textcat" in pipeline: msg.divider("Text Classification") - labels = [label for label in gold_train_data["textcat"]] + labels = [label for label in gold_train_data["cats"]] model_labels = _get_labels_from_model(nlp, "textcat") new_labels = [l for l in labels if l not in model_labels] existing_labels = [l for l in labels if l in model_labels] @@ -281,13 +281,44 @@ def debug_data( ) if 
new_labels: labels_with_counts = _format_labels( - gold_train_data["textcat"].most_common(), counts=True + gold_train_data["cats"].most_common(), counts=True ) msg.text("New: {}".format(labels_with_counts), show=verbose) if existing_labels: msg.text( "Existing: {}".format(_format_labels(existing_labels)), show=verbose ) + if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]): + msg.fail( + "The train and dev labels are not the same. " + "Train labels: {}. " + "Dev labels: {}.".format( + _format_labels(gold_train_data["cats"]), + _format_labels(gold_dev_data["cats"]), + ) + ) + if gold_train_data["n_cats_multilabel"] > 0: + msg.info("The train data contains instances without " + "mutually-exclusive classes. Use '--textcat-multilabel' " + "when training." + ) + if gold_dev_data["n_cats_multilabel"] == 0: + msg.warn( + "Potential train/dev mismatch: the train data contains " + "instances without mutually-exclusive classes while the " + "dev data does not." + ) + else: + msg.info( + "The train data contains only instances with " + "mutually-exclusive classes." + ) + if gold_dev_data["n_cats_multilabel"] > 0: + msg.fail( + "Train/dev mismatch: the dev data contains instances " + "without mutually-exclusive classes while the train data " + "contains only instances with mutually-exclusive classes." + ) if "tagger" in pipeline: msg.divider("Part-of-speech Tagging") @@ -450,6 +481,7 @@ def debug_data( ) ) + msg.divider("Summary") good_counts = msg.counts[MESSAGES.GOOD] warn_counts = msg.counts[MESSAGES.WARN] @@ -504,6 +536,7 @@ def _compile_gold(train_docs, pipeline): "n_sents": 0, "n_nonproj": 0, "n_cycles": 0, + "n_cats_multilabel": 0, "texts": set(), } for doc, gold in train_docs: @@ -526,6 +559,8 @@ def _compile_gold(train_docs, pipeline): data["ner"]["-"] += 1 if "textcat" in pipeline: data["cats"].update(gold.cats) + if list(gold.cats.values()).count(1.0) != 1: + data["n_cats_multilabel"] += 1 if "tagger" in pipeline: data["tags"].update([x for x in gold.tags if x is not None]) if "parser" in pipeline: diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 0a57ef2da..1114ada08 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -61,6 +61,7 @@ def evaluate( "NER P": "%.2f" % scorer.ents_p, "NER R": "%.2f" % scorer.ents_r, "NER F": "%.2f" % scorer.ents_f, + "Textcat": "%.2f" % scorer.textcat_score, } msg.table(results, title="Results") diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 8d162362c..784a12320 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -21,48 +21,24 @@ from .. 
import about @plac.annotations( + # fmt: off lang=("Model language", "positional", None, str), output_path=("Output directory to store model in", "positional", None, Path), train_path=("Location of JSON-formatted training data", "positional", None, Path), dev_path=("Location of JSON-formatted development data", "positional", None, Path), - raw_text=( - "Path to jsonl file with unlabelled text documents.", - "option", - "rt", - Path, - ), + raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path), base_model=("Name of model to update (optional)", "option", "b", str), pipeline=("Comma-separated names of pipeline components", "option", "p", str), vectors=("Model to load vectors from", "option", "v", str), n_iter=("Number of iterations", "option", "n", int), - n_early_stopping=( - "Maximum number of training epochs without dev accuracy improvement", - "option", - "ne", - int, - ), + n_early_stopping=("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int), n_examples=("Number of examples", "option", "ns", int), use_gpu=("Use GPU", "option", "g", int), version=("Model version", "option", "V", str), meta_path=("Optional path to meta.json to use as base.", "option", "m", Path), - init_tok2vec=( - "Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", - "option", - "t2v", - Path, - ), - parser_multitasks=( - "Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'", - "option", - "pt", - str, - ), - entity_multitasks=( - "Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'", - "option", - "et", - str, - ), + init_tok2vec=("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path), + parser_multitasks=("Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'", "option", "pt", str), + entity_multitasks=("Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'", "option", "et", str), noise_level=("Amount of corruption for data augmentation", "option", "nl", float), orth_variant_level=( "Amount of orthography variation for data augmentation", @@ -73,8 +49,12 @@ from .. import about eval_beam_widths=("Beam widths to evaluate, e.g. 
4,8", "option", "bw", str), gold_preproc=("Use gold preprocessing", "flag", "G", bool), learn_tokens=("Make parser learn gold-standard tokenization", "flag", "T", bool), + textcat_multilabel=("Textcat classes aren't mutually exclusive (multilabel)", "flag", "TML", bool), + textcat_arch=("Textcat model architecture", "option", "ta", str), + textcat_positive_label=("Textcat positive label for binary classes with two labels", "option", "tpl", str), verbose=("Display more information for debug", "flag", "VV", bool), debug=("Run data diagnostics before training", "flag", "D", bool), + # fmt: on ) def train( lang, @@ -99,6 +79,9 @@ def train( eval_beam_widths="", gold_preproc=False, learn_tokens=False, + textcat_multilabel=False, + textcat_arch="bow", + textcat_positive_label=None, verbose=False, debug=False, ): @@ -184,9 +167,36 @@ def train( if pipe not in nlp.pipe_names: if pipe == "parser": pipe_cfg = {"learn_tokens": learn_tokens} + elif pipe == "textcat": + pipe_cfg = { + "exclusive_classes": not textcat_multilabel, + "architecture": textcat_arch, + "positive_label": textcat_positive_label, + } else: pipe_cfg = {} nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg)) + else: + if pipe == "textcat": + textcat_cfg = nlp.get_pipe("textcat").cfg + base_cfg = { + "exclusive_classes": textcat_cfg["exclusive_classes"], + "architecture": textcat_cfg["architecture"], + "positive_label": textcat_cfg["positive_label"] + } + pipe_cfg = { + "exclusive_classes": not textcat_multilabel, + "architecture": textcat_arch, + "positive_label": textcat_positive_label, + } + if base_cfg != pipe_cfg: + msg.fail("The base textcat model configuration does" + "not match the provided training options. " + "Existing cfg: {}, provided cfg: {}".format( + base_cfg, pipe_cfg + ), + exits=1 + ) else: msg.text("Starting with blank model '{}'".format(lang)) lang_cls = util.get_lang_class(lang) @@ -194,6 +204,12 @@ def train( for pipe in pipeline: if pipe == "parser": pipe_cfg = {"learn_tokens": learn_tokens} + elif pipe == "textcat": + pipe_cfg = { + "exclusive_classes": not textcat_multilabel, + "architecture": textcat_arch, + "positive_label": textcat_positive_label, + } else: pipe_cfg = {} nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg)) @@ -234,12 +250,88 @@ def train( components = _load_pretrained_tok2vec(nlp, init_tok2vec) msg.text("Loaded pretrained tok2vec for: {}".format(components)) + # Verify textcat config + if "textcat" in pipeline: + textcat_labels = nlp.get_pipe("textcat").cfg["labels"] + if textcat_positive_label and textcat_positive_label not in textcat_labels: + msg.fail( + "The textcat_positive_label (tpl) '{}' does not match any " + "label in the training data.".format(textcat_positive_label), + exits=1, + ) + if textcat_positive_label and len(textcat_labels) != 2: + msg.fail( + "A textcat_positive_label (tpl) '{}' was provided for training " + "data that does not appear to be a binary classification " + "problem with two labels.".format(textcat_positive_label), + exits=1, + ) + train_docs = corpus.train_docs( + nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0 + ) + train_labels = set() + if textcat_multilabel: + multilabel_found = False + for text, gold in train_docs: + train_labels.update(gold.cats.keys()) + if list(gold.cats.values()).count(1.0) != 1: + multilabel_found = True + if not multilabel_found and not base_model: + msg.warn( + "The textcat training instances look like they have " + "mutually-exclusive classes. 
Remove the flag " + "'--textcat-multilabel' to train a classifier with " + "mutually-exclusive classes." + ) + if not textcat_multilabel: + for text, gold in train_docs: + train_labels.update(gold.cats.keys()) + if list(gold.cats.values()).count(1.0) != 1 and not base_model: + msg.warn( + "Some textcat training instances do not have exactly " + "one positive label. Modifying training options to " + "include the flag '--textcat-multilabel' for classes " + "that are not mutually exclusive." + ) + nlp.get_pipe("textcat").cfg["exclusive_classes"] = False + textcat_multilabel = True + break + if base_model and set(textcat_labels) != train_labels: + msg.fail( + "Cannot extend textcat model using data with different " + "labels. Base model labels: {}, training data labels: " + "{}.".format(textcat_labels, list(train_labels)), exits=1 + ) + if textcat_multilabel: + msg.text( + "Textcat evaluation score: ROC AUC score macro-averaged across " + "the labels '{}'".format(", ".join(textcat_labels)) + ) + elif textcat_positive_label and len(textcat_labels) == 2: + msg.text( + "Textcat evaluation score: F1-score for the " + "label '{}'".format(textcat_positive_label) + ) + elif len(textcat_labels) > 1: + if len(textcat_labels) == 2: + msg.warn( + "If the textcat component is a binary classifier with " + "exclusive classes, provide '--textcat_positive_label' for " + "an evaluation on the positive class." + ) + msg.text( + "Textcat evaluation score: F1-score macro-averaged across " + "the labels '{}'".format(", ".join(textcat_labels)) + ) + else: + msg.fail( + "Unsupported textcat configuration. Use `spacy debug-data` " + "for more information." + ) + # fmt: off - row_head = ["Itn", "Dep Loss", "NER Loss", "UAS", "NER P", "NER R", "NER F", "Tag %", "Token %", "CPU WPS", "GPU WPS"] - row_widths = [3, 10, 10, 7, 7, 7, 7, 7, 7, 7, 7] - if has_beam_widths: - row_head.insert(1, "Beam W.") - row_widths.insert(1, 7) + row_head, output_stats = _configure_training_output(pipeline, use_gpu, has_beam_widths) + row_widths = [len(w) for w in row_head] row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2} # fmt: on print("") @@ -297,7 +389,7 @@ def train( ) nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs) start_time = timer() - scorer = nlp_loaded.evaluate(dev_docs, debug) + scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose) end_time = timer() if use_gpu < 0: gpu_wps = None @@ -313,7 +405,7 @@ def train( corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc) ) start_time = timer() - scorer = nlp_loaded.evaluate(dev_docs) + scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose) end_time = timer() cpu_wps = nwords / (end_time - start_time) acc_loc = output_path / ("model%d" % i) / "accuracy.json" @@ -355,10 +447,19 @@ def train( i, losses, scorer.scores, + output_stats, beam_width=beam_width if has_beam_widths else None, cpu_wps=cpu_wps, gpu_wps=gpu_wps, ) + if i == 0 and "textcat" in pipeline: + textcats_per_cat = scorer.scores.get("textcats_per_cat", {}) + for cat, cat_score in textcats_per_cat.items(): + if cat_score.get("roc_auc_score", 0) < 0: + msg.warn( + "Textcat ROC AUC score is undefined due to " + "only one value in label '{}'.".format(cat) + ) msg.row(progress, **row_settings) # Early stopping if n_early_stopping is not None: @@ -399,6 +500,8 @@ def _score_for_model(meta): mean_acc.append((acc["uas"] + acc["las"]) / 2) if "ner" in pipes: mean_acc.append((acc["ents_p"] + acc["ents_r"] + acc["ents_f"]) / 3) + if "textcat" in pipes: + 
mean_acc.append(acc["textcat_score"]) return sum(mean_acc) / len(mean_acc) @@ -482,40 +585,55 @@ def _get_metrics(component): return ("token_acc",) -def _get_progress(itn, losses, dev_scores, beam_width=None, cpu_wps=0.0, gpu_wps=0.0): +def _configure_training_output(pipeline, use_gpu, has_beam_widths): + row_head = ["Itn"] + output_stats = [] + for pipe in pipeline: + if pipe == "tagger": + row_head.extend(["Tag Loss ", " Tag % "]) + output_stats.extend(["tag_loss", "tags_acc"]) + elif pipe == "parser": + row_head.extend(["Dep Loss ", " UAS ", " LAS "]) + output_stats.extend(["dep_loss", "uas", "las"]) + elif pipe == "ner": + row_head.extend(["NER Loss ", "NER P ", "NER R ", "NER F "]) + output_stats.extend(["ner_loss", "ents_p", "ents_r", "ents_f"]) + elif pipe == "textcat": + row_head.extend(["Textcat Loss", "Textcat"]) + output_stats.extend(["textcat_loss", "textcat_score"]) + row_head.extend(["Token %", "CPU WPS"]) + output_stats.extend(["token_acc", "cpu_wps"]) + + if use_gpu >= 0: + row_head.extend(["GPU WPS"]) + output_stats.extend(["gpu_wps"]) + + if has_beam_widths: + row_head.insert(1, "Beam W.") + return row_head, output_stats + + +def _get_progress( + itn, losses, dev_scores, output_stats, beam_width=None, cpu_wps=0.0, gpu_wps=0.0 +): scores = {} - for col in [ - "dep_loss", - "tag_loss", - "uas", - "tags_acc", - "token_acc", - "ents_p", - "ents_r", - "ents_f", - "cpu_wps", - "gpu_wps", - ]: - scores[col] = 0.0 + for stat in output_stats: + scores[stat] = 0.0 scores["dep_loss"] = losses.get("parser", 0.0) scores["ner_loss"] = losses.get("ner", 0.0) scores["tag_loss"] = losses.get("tagger", 0.0) - scores.update(dev_scores) + scores["textcat_loss"] = losses.get("textcat", 0.0) scores["cpu_wps"] = cpu_wps scores["gpu_wps"] = gpu_wps or 0.0 - result = [ - itn, - "{:.3f}".format(scores["dep_loss"]), - "{:.3f}".format(scores["ner_loss"]), - "{:.3f}".format(scores["uas"]), - "{:.3f}".format(scores["ents_p"]), - "{:.3f}".format(scores["ents_r"]), - "{:.3f}".format(scores["ents_f"]), - "{:.3f}".format(scores["tags_acc"]), - "{:.3f}".format(scores["token_acc"]), - "{:.0f}".format(scores["cpu_wps"]), - "{:.0f}".format(scores["gpu_wps"]), - ] + scores.update(dev_scores) + formatted_scores = [] + for stat in output_stats: + format_spec = "{:.3f}" + if stat.endswith("_wps"): + format_spec = "{:.0f}" + formatted_scores.append(format_spec.format(scores[stat])) + result = [itn + 1] + result.extend(formatted_scores) if beam_width is not None: result.insert(1, beam_width) return result diff --git a/spacy/errors.py b/spacy/errors.py index 587a6e700..80c4c6f85 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -457,6 +457,14 @@ class Errors(object): E160 = ("Can't find language data file: {path}") E161 = ("Found an internal inconsistency when predicting entity links. " "This is likely a bug in spaCy, so feel free to open an issue.") + E162 = ("Cannot evaluate textcat model on data with different labels.\n" + "Labels in model: {model_labels}\nLabels in evaluation " + "data: {eval_labels}") + E163 = ("cumsum was found to be unstable: its last element does not " + "correspond to sum") + E164 = ("x is neither increasing nor decreasing: {}.") + E165 = ("Only one class present in y_true. 
ROC AUC score is not defined in " + "that case.") @add_codes diff --git a/spacy/gold.pyx b/spacy/gold.pyx index af0588349..4eaea80ed 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -57,6 +57,7 @@ def tags_to_entities(tags): def merge_sents(sents): m_deps = [[], [], [], [], [], []] m_brackets = [] + m_cats = sents.pop() i = 0 for (ids, words, tags, heads, labels, ner), brackets in sents: m_deps[0].extend(id_ + i for id_ in ids) @@ -68,6 +69,7 @@ def merge_sents(sents): m_brackets.extend((b["first"] + i, b["last"] + i, b["label"]) for b in brackets) i += len(ids) + m_deps.append(m_cats) return [(m_deps, m_brackets)] @@ -199,6 +201,7 @@ class GoldCorpus(object): n = 0 i = 0 for raw_text, paragraph_tuples in self.train_tuples: + cats = paragraph_tuples.pop() for sent_tuples, brackets in paragraph_tuples: n += len(sent_tuples[1]) if self.limit and i >= self.limit: @@ -260,11 +263,7 @@ class GoldCorpus(object): if len(docs) != len(paragraph_tuples): n_annots = len(paragraph_tuples) raise ValueError(Errors.E070.format(n_docs=len(docs), n_annots=n_annots)) - if len(docs) == 1: - return [GoldParse.from_annot_tuples(docs[0], paragraph_tuples[0][0], - make_projective=make_projective)] - else: - return [GoldParse.from_annot_tuples(doc, sent_tuples, + return [GoldParse.from_annot_tuples(doc, sent_tuples, make_projective=make_projective) for doc, (sent_tuples, brackets) in zip(docs, paragraph_tuples)] @@ -415,6 +414,10 @@ def json_to_tuple(doc): sents.append([ [ids, words, tags, heads, labels, ner], sent.get("brackets", [])]) + cats = {} + for cat in paragraph.get("cats", {}): + cats[cat["label"]] = cat["value"] + sents.append(cats) if sents: yield [paragraph.get("raw", None), sents] @@ -528,9 +531,10 @@ cdef class GoldParse: """ @classmethod def from_annot_tuples(cls, doc, annot_tuples, make_projective=False): - _, words, tags, heads, deps, entities = annot_tuples + _, words, tags, heads, deps, entities, cats = annot_tuples return cls(doc, words=words, tags=tags, heads=heads, deps=deps, - entities=entities, make_projective=make_projective) + entities=entities, cats=cats, + make_projective=make_projective) def __init__(self, doc, annot_tuples=None, words=None, tags=None, morphology=None, heads=None, deps=None, entities=None, make_projective=False, @@ -739,7 +743,10 @@ def docs_to_json(docs, id=0): docs = [docs] json_doc = {"id": id, "paragraphs": []} for i, doc in enumerate(docs): - json_para = {'raw': doc.text, "sentences": []} + json_para = {'raw': doc.text, "sentences": [], "cats": []} + for cat, val in doc.cats.items(): + json_cat = {"label": cat, "value": val} + json_para["cats"].append(json_cat) ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents] biluo_tags = biluo_tags_from_offsets(doc, ent_offsets) for j, sent in enumerate(doc.sents): diff --git a/spacy/language.py b/spacy/language.py index f966a6630..4f94c39f6 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -591,6 +591,7 @@ class Language(object): # Populate vocab else: for _, annots_brackets in get_gold_tuples(): + _ = annots_brackets.pop() for annots, _ in annots_brackets: for word in annots[1]: _ = self.vocab[word] # noqa: F841 @@ -659,7 +660,7 @@ class Language(object): DOCS: https://spacy.io/api/language#evaluate """ if scorer is None: - scorer = Scorer() + scorer = Scorer(pipeline=self.pipeline) if component_cfg is None: component_cfg = {} docs, golds = zip(*docs_golds) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 3e4b3582c..9d929f6b7 100644 --- a/spacy/pipeline/pipes.pyx +++ 
b/spacy/pipeline/pipes.pyx @@ -504,6 +504,7 @@ class Tagger(Pipe): orig_tag_map = dict(self.vocab.morphology.tag_map) new_tag_map = OrderedDict() for raw_text, annots_brackets in get_gold_tuples(): + _ = annots_brackets.pop() for annots, brackets in annots_brackets: ids, words, tags, heads, deps, ents = annots for tag in tags: @@ -1021,6 +1022,10 @@ class TextCategorizer(Pipe): return 1 def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs): + for raw_text, annots_brackets in get_gold_tuples(): + cats = annots_brackets.pop() + for cat in cats: + self.add_label(cat) if self.model is True: self.cfg["pretrained_vectors"] = kwargs.get("pretrained_vectors") self.require_labels() diff --git a/spacy/scorer.py b/spacy/scorer.py index 4032cc4dd..30ed389cd 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -1,7 +1,10 @@ # coding: utf8 from __future__ import division, print_function, unicode_literals +import numpy as np + from .gold import tags_to_entities, GoldParse +from .errors import Errors class PRFScore(object): @@ -34,10 +37,39 @@ class PRFScore(object): return 2 * ((p * r) / (p + r + 1e-100)) +class ROCAUCScore(object): + """ + An AUC ROC score. + """ + + def __init__(self): + self.golds = [] + self.cands = [] + self.saved_score = 0.0 + self.saved_score_at_len = 0 + + def score_set(self, cand, gold): + self.cands.append(cand) + self.golds.append(gold) + + @property + def score(self): + if len(self.golds) == self.saved_score_at_len: + return self.saved_score + try: + self.saved_score = _roc_auc_score(self.golds, self.cands) + # catch ValueError: Only one class present in y_true. + # ROC AUC score is not defined in that case. + except: + self.saved_score = -float("inf") + self.saved_score_at_len = len(self.golds) + return self.saved_score + + class Scorer(object): """Compute evaluation scores.""" - def __init__(self, eval_punct=False): + def __init__(self, eval_punct=False, pipeline=None): """Initialize the Scorer. 
eval_punct (bool): Evaluate the dependency attachments to and from @@ -54,6 +86,24 @@ class Scorer(object): self.ner = PRFScore() self.ner_per_ents = dict() self.eval_punct = eval_punct + self.textcat = None + self.textcat_per_cat = dict() + self.textcat_positive_label = None + self.textcat_multilabel = False + + if pipeline: + for name, model in pipeline: + if name == "textcat": + self.textcat_positive_label = model.cfg.get("positive_label", None) + if self.textcat_positive_label: + self.textcat = PRFScore() + if not model.cfg.get("exclusive_classes", False): + self.textcat_multilabel = True + for label in model.cfg.get("labels", []): + self.textcat_per_cat[label] = ROCAUCScore() + else: + for label in model.cfg.get("labels", []): + self.textcat_per_cat[label] = PRFScore() @property def tags_acc(self): @@ -101,10 +151,47 @@ class Scorer(object): for k, v in self.ner_per_ents.items() } + @property + def textcat_score(self): + """RETURNS (float): f-score on positive label for binary exclusive, + macro-averaged f-score for 3+ exclusive, + macro-averaged AUC ROC score for multilabel (-1 if undefined) + """ + if not self.textcat_multilabel: + # binary multiclass + if self.textcat_positive_label: + return self.textcat.fscore * 100 + # other multiclass + return ( + sum([score.fscore for label, score in self.textcat_per_cat.items()]) + / (len(self.textcat_per_cat) + 1e-100) + * 100 + ) + # multilabel + return max( + sum([score.score for label, score in self.textcat_per_cat.items()]) + / (len(self.textcat_per_cat) + 1e-100), + -1, + ) + + @property + def textcats_per_cat(self): + """RETURNS (dict): Scores per textcat label. + """ + if not self.textcat_multilabel: + return { + k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100} + for k, v in self.textcat_per_cat.items() + } + return { + k: {"roc_auc_score": max(v.score, -1)} + for k, v in self.textcat_per_cat.items() + } + @property def scores(self): """RETURNS (dict): All scores with keys `uas`, `las`, `ents_p`, - `ents_r`, `ents_f`, `tags_acc` and `token_acc`. + `ents_r`, `ents_f`, `tags_acc`, `token_acc`, and `textcat_score`. 
""" return { "uas": self.uas, @@ -115,6 +202,8 @@ class Scorer(object): "ents_per_type": self.ents_per_type, "tags_acc": self.tags_acc, "token_acc": self.token_acc, + "textcat_score": self.textcat_score, + "textcats_per_cat": self.textcats_per_cat, } def score(self, doc, gold, verbose=False, punct_labels=("p", "punct")): @@ -192,9 +281,297 @@ class Scorer(object): self.unlabelled.score_set( set(item[:2] for item in cand_deps), set(item[:2] for item in gold_deps) ) + if ( + len(gold.cats) > 0 + and set(self.textcat_per_cat) == set(gold.cats) + and set(gold.cats) == set(doc.cats) + ): + goldcat = max(gold.cats, key=gold.cats.get) + candcat = max(doc.cats, key=doc.cats.get) + if self.textcat_positive_label: + self.textcat.score_set( + set([self.textcat_positive_label]) & set([candcat]), + set([self.textcat_positive_label]) & set([goldcat]), + ) + for label in self.textcat_per_cat: + if self.textcat_multilabel: + self.textcat_per_cat[label].score_set( + doc.cats[label], gold.cats[label] + ) + else: + self.textcat_per_cat[label].score_set( + set([label]) & set([candcat]), set([label]) & set([goldcat]) + ) + elif len(self.textcat_per_cat) > 0: + model_labels = set(self.textcat_per_cat) + eval_labels = set(gold.cats) + raise ValueError( + Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels) + ) if verbose: gold_words = [item[1] for item in gold.orig_annot] for w_id, h_id, dep in cand_deps - gold_deps: print("F", gold_words[w_id], dep, gold_words[h_id]) for w_id, h_id, dep in gold_deps - cand_deps: print("M", gold_words[w_id], dep, gold_words[h_id]) + + +############################################################################# +# +# The following implementation of roc_auc_score() is adapted from +# scikit-learn, which is distributed under the following license: +# +# New BSD License +# +# Copyright (c) 2007–2019 The scikit-learn developers. +# All rights reserved. +# +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# a. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# b. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# c. Neither the name of the Scikit-learn Developers nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written +# permission. +# +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH +# DAMAGE. 
+ +def _roc_auc_score(y_true, y_score): + """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) + from prediction scores. + + Note: this implementation is restricted to the binary classification task + + Parameters + ---------- + y_true : array, shape = [n_samples] or [n_samples, n_classes] + True binary labels or binary label indicators. + The multiclass case expects shape = [n_samples] and labels + with values in ``range(n_classes)``. + + y_score : array, shape = [n_samples] or [n_samples, n_classes] + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by "decision_function" on some classifiers). For binary + y_true, y_score is supposed to be the score of the class with greater + label. The multiclass case expects shape = [n_samples, n_classes] + where the scores correspond to probability estimates. + + Returns + ------- + auc : float + + References + ---------- + .. [1] `Wikipedia entry for the Receiver operating characteristic + <https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_ + + .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition + Letters, 2006, 27(8):861-874. + + .. [3] `Analyzing a portion of the ROC curve. McClish, 1989 + <https://www.ncbi.nlm.nih.gov/pubmed/2668680>`_ + """ + if len(np.unique(y_true)) != 2: + raise ValueError(Errors.E165) + fpr, tpr, _ = _roc_curve(y_true, y_score) + return _auc(fpr, tpr) + + +def _roc_curve(y_true, y_score): + """Compute Receiver operating characteristic (ROC) + + Note: this implementation is restricted to the binary classification task. + + Parameters + ---------- + + y_true : array, shape = [n_samples] + True binary labels. If labels are not either {-1, 1} or {0, 1}, then + pos_label should be explicitly given. + + y_score : array, shape = [n_samples] + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by "decision_function" on some classifiers). + + Returns + ------- + fpr : array, shape = [>2] + Increasing false positive rates such that element i is the false + positive rate of predictions with score >= thresholds[i]. + + tpr : array, shape = [>2] + Increasing true positive rates such that element i is the true + positive rate of predictions with score >= thresholds[i]. + + thresholds : array, shape = [n_thresholds] + Decreasing thresholds on the decision function used to compute + fpr and tpr. `thresholds[0]` represents no instances being predicted + and is arbitrarily set to `max(y_score) + 1`. + + Notes + ----- + Since the thresholds are sorted from low to high values, they + are reversed upon returning them to ensure they correspond to both ``fpr`` + and ``tpr``, which are sorted in reversed order during their calculation. + + References + ---------- + .. [1] `Wikipedia entry for the Receiver operating characteristic + <https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_ + + .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition + Letters, 2006, 27(8):861-874. 
+ """ + fps, tps, thresholds = _binary_clf_curve(y_true, y_score) + + # Add an extra threshold position + # to make sure that the curve starts at (0, 0) + tps = np.r_[0, tps] + fps = np.r_[0, fps] + thresholds = np.r_[thresholds[0] + 1, thresholds] + + if fps[-1] <= 0: + fpr = np.repeat(np.nan, fps.shape) + else: + fpr = fps / fps[-1] + + if tps[-1] <= 0: + tpr = np.repeat(np.nan, tps.shape) + else: + tpr = tps / tps[-1] + + return fpr, tpr, thresholds + + +def _binary_clf_curve(y_true, y_score): + """Calculate true and false positives per binary classification threshold. + + Parameters + ---------- + y_true : array, shape = [n_samples] + True targets of binary classification + + y_score : array, shape = [n_samples] + Estimated probabilities or decision function + + Returns + ------- + fps : array, shape = [n_thresholds] + A count of false positives, at index i being the number of negative + samples assigned a score >= thresholds[i]. The total number of + negative samples is equal to fps[-1] (thus true negatives are given by + fps[-1] - fps). + + tps : array, shape = [n_thresholds <= len(np.unique(y_score))] + An increasing count of true positives, at index i being the number + of positive samples assigned a score >= thresholds[i]. The total + number of positive samples is equal to tps[-1] (thus false negatives + are given by tps[-1] - tps). + + thresholds : array, shape = [n_thresholds] + Decreasing score values. + """ + pos_label = 1. + + y_true = np.ravel(y_true) + y_score = np.ravel(y_score) + + # make y_true a boolean vector + y_true = (y_true == pos_label) + + # sort scores and corresponding truth values + desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1] + y_score = y_score[desc_score_indices] + y_true = y_true[desc_score_indices] + weight = 1. + + # y_score typically has many tied values. Here we extract + # the indices associated with the distinct values. We also + # concatenate a value for the end of the curve. + distinct_value_indices = np.where(np.diff(y_score))[0] + threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1] + + # accumulate the true positives with decreasing threshold + tps = _stable_cumsum(y_true * weight)[threshold_idxs] + fps = 1 + threshold_idxs - tps + return fps, tps, y_score[threshold_idxs] + + +def _stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08): + """Use high precision for cumsum and check that final value matches sum + + Parameters + ---------- + arr : array-like + To be cumulatively summed as flat + axis : int, optional + Axis along which the cumulative sum is computed. + The default (None) is to compute the cumsum over the flattened array. + rtol : float + Relative tolerance, see ``np.allclose`` + atol : float + Absolute tolerance, see ``np.allclose`` + """ + out = np.cumsum(arr, axis=axis, dtype=np.float64) + expected = np.sum(arr, axis=axis, dtype=np.float64) + if not np.all(np.isclose(out.take(-1, axis=axis), expected, rtol=rtol, + atol=atol, equal_nan=True)): + raise ValueError(Errors.E163) + return out + + +def _auc(x, y): + """Compute Area Under the Curve (AUC) using the trapezoidal rule + + This is a general function, given points on a curve. For computing the + area under the ROC-curve, see :func:`roc_auc_score`. + + Parameters + ---------- + x : array, shape = [n] + x coordinates. These must be either monotonic increasing or monotonic + decreasing. + y : array, shape = [n] + y coordinates. 
+ + Returns + ------- + auc : float + """ + x = np.ravel(x) + y = np.ravel(y) + + direction = 1 + dx = np.diff(x) + if np.any(dx < 0): + if np.all(dx <= 0): + direction = -1 + else: + raise ValueError(Errors.E164.format(x)) + + area = direction * np.trapz(y, x) + if isinstance(area, np.memmap): + # Reductions such as .sum used internally in np.trapz do not return a + # scalar by default for numpy.memmap instances contrary to + # regular numpy.ndarray instances. + area = area.dtype.type(area) + return area diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index eb39124ce..5a7355061 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -342,6 +342,7 @@ cdef class ArcEager(TransitionSystem): actions[RIGHT][label] = 1 actions[REDUCE][label] = 1 for raw_text, sents in kwargs.get('gold_parses', []): + _ = sents.pop() for (ids, words, tags, heads, labels, iob), ctnts in sents: heads, labels = nonproj.projectivize(heads, labels) for child, head, label in zip(ids, heads, labels): diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 767e4c2e0..a55f1ce3a 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -72,6 +72,7 @@ cdef class BiluoPushDown(TransitionSystem): actions[action][entity_type] = 1 moves = ('M', 'B', 'I', 'L', 'U') for raw_text, sents in kwargs.get('gold_parses', []): + _ = sents.pop() for (ids, words, tags, heads, labels, biluo), _ in sents: for i, ner_tag in enumerate(biluo): if ner_tag != 'O' and ner_tag != '-': diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index c4edef137..68d19c691 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -589,6 +589,7 @@ cdef class Parser: doc_sample = [] gold_sample = [] for raw_text, annots_brackets in islice(get_gold_tuples(), 1000): + _ = annots_brackets.pop() for annots, brackets in annots_brackets: ids, words, tags, heads, deps, ents = annots doc_sample.append(Doc(self.vocab, words=words)) diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 860540be2..4f79c4463 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -3,8 +3,12 @@ from __future__ import unicode_literals from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags from spacy.gold import spans_from_biluo_tags, GoldParse +from spacy.gold import GoldCorpus, docs_to_json +from spacy.lang.en import English from spacy.tokens import Doc +from .util import make_tempdir import pytest +import srsly def test_gold_biluo_U(en_vocab): @@ -81,3 +85,28 @@ def test_gold_ner_missing_tags(en_tokenizer): doc = en_tokenizer("I flew to Silicon Valley via London.") biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] gold = GoldParse(doc, entities=biluo_tags) # noqa: F841 + + +def test_roundtrip_docs_to_json(): + text = "I flew to Silicon Valley via London." 
+ cats = {"TRAVEL": 1.0, "BAKING": 0.0} + nlp = English() + doc = nlp(text) + doc.cats = cats + doc[0].is_sent_start = True + for i in range(1, len(doc)): + doc[i].is_sent_start = False + + with make_tempdir() as tmpdir: + json_file = tmpdir / "roundtrip.json" + srsly.write_json(json_file, [docs_to_json(doc)]) + goldcorpus = GoldCorpus(str(json_file), str(json_file)) + + reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp)) + + assert len(doc) == goldcorpus.count_train() + assert text == reloaded_doc.text + assert "TRAVEL" in goldparse.cats + assert "BAKING" in goldparse.cats + assert cats["TRAVEL"] == goldparse.cats["TRAVEL"] + assert cats["BAKING"] == goldparse.cats["BAKING"] diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index a747d3adb..31d9cf875 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -1,9 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals +import numpy as np +from numpy.testing import assert_almost_equal, assert_array_almost_equal +import pytest from pytest import approx +from spacy.errors import Errors from spacy.gold import GoldParse -from spacy.scorer import Scorer +from spacy.scorer import Scorer, ROCAUCScore +from spacy.scorer import _roc_auc_score, _roc_curve from .util import get_doc test_ner_cardinal = [ @@ -66,3 +71,74 @@ def test_ner_per_type(en_vocab): assert results["ents_per_type"]["ORG"]["p"] == 50 assert results["ents_per_type"]["ORG"]["r"] == 100 assert results["ents_per_type"]["ORG"]["f"] == approx(66.66666) + + +def test_roc_auc_score(): + # Binary classification, toy tests from scikit-learn test suite + y_true = [0, 1] + y_score = [0, 1] + tpr, fpr, _ = _roc_curve(y_true, y_score) + roc_auc = _roc_auc_score(y_true, y_score) + assert_array_almost_equal(tpr, [0, 0, 1]) + assert_array_almost_equal(fpr, [0, 1, 1]) + assert_almost_equal(roc_auc, 1.) + + y_true = [0, 1] + y_score = [1, 0] + tpr, fpr, _ = _roc_curve(y_true, y_score) + roc_auc = _roc_auc_score(y_true, y_score) + assert_array_almost_equal(tpr, [0, 1, 1]) + assert_array_almost_equal(fpr, [0, 0, 1]) + assert_almost_equal(roc_auc, 0.) + + y_true = [1, 0] + y_score = [1, 1] + tpr, fpr, _ = _roc_curve(y_true, y_score) + roc_auc = _roc_auc_score(y_true, y_score) + assert_array_almost_equal(tpr, [0, 1]) + assert_array_almost_equal(fpr, [0, 1]) + assert_almost_equal(roc_auc, 0.5) + + y_true = [1, 0] + y_score = [1, 0] + tpr, fpr, _ = _roc_curve(y_true, y_score) + roc_auc = _roc_auc_score(y_true, y_score) + assert_array_almost_equal(tpr, [0, 0, 1]) + assert_array_almost_equal(fpr, [0, 1, 1]) + assert_almost_equal(roc_auc, 1.) 
+ + y_true = [1, 0] + y_score = [0.5, 0.5] + tpr, fpr, _ = _roc_curve(y_true, y_score) + roc_auc = _roc_auc_score(y_true, y_score) + assert_array_almost_equal(tpr, [0, 1]) + assert_array_almost_equal(fpr, [0, 1]) + assert_almost_equal(roc_auc, .5) + + # same result as above with ROCAUCScore wrapper + score = ROCAUCScore() + score.score_set(0.5, 1) + score.score_set(0.5, 0) + assert_almost_equal(score.score, .5) + + + # check that errors are raised in undefined cases and score is -inf + y_true = [0, 0] + y_score = [0.25, 0.75] + with pytest.raises(ValueError): + _roc_auc_score(y_true, y_score) + + score = ROCAUCScore() + score.score_set(0.25, 0) + score.score_set(0.75, 0) + assert score.score == -float("inf") + + y_true = [1, 1] + y_score = [0.25, 0.75] + with pytest.raises(ValueError): + _roc_auc_score(y_true, y_score) + + score = ROCAUCScore() + score.score_set(0.25, 1) + score.score_set(0.75, 1) + assert score.score == -float("inf") diff --git a/website/docs/api/annotation.md b/website/docs/api/annotation.md index f44019752..a83879de6 100644 --- a/website/docs/api/annotation.md +++ b/website/docs/api/annotation.md @@ -552,6 +552,10 @@ spaCy's JSON format, you can use the "last": int, # index of last token "label": string # phrase label }] + }], + "cats": [{ # new in v2.2: categories for text classifier + "label": string, # text category label + "value": float / bool # label applies (1.0/true) or not (0.0/false) }] }] }] diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index d13490a27..0ce4475b7 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -361,9 +361,10 @@ will only train the tagger and parser. ```bash $ python -m spacy train [lang] [output_path] [train_path] [dev_path] -[--base-model] [--pipeline] [--vectors] [--n-iter] [--n-early-stopping] [--n-examples] [--use-gpu] -[--version] [--meta-path] [--init-tok2vec] [--parser-multitasks] -[--entity-multitasks] [--gold-preproc] [--noise-level] [--learn-tokens] +[--base-model] [--pipeline] [--vectors] [--n-iter] [--n-early-stopping] +[--n-examples] [--use-gpu] [--version] [--meta-path] [--init-tok2vec] +[--parser-multitasks] [--entity-multitasks] [--gold-preproc] [--noise-level] +[--learn-tokens] [--textcat-arch] [--textcat-multilabel] [--textcat-positive-label] [--verbose] ``` @@ -387,7 +388,10 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path] | `--entity-multitasks`, `-et` | option | Side objectives for NER CNN, e.g. `'dep'` or `'dep,tag'` | | `--noise-level`, `-nl` | option | Float indicating the amount of corruption for data augmentation. | | `--gold-preproc`, `-G` | flag | Use gold preprocessing. | -| `--learn-tokens`, `-T` | flag | Make parser learn gold-standard tokenization by merging subtokens. Typically used for languages like Chinese. | +| `--learn-tokens`, `-T` | flag | Make parser learn gold-standard tokenization by merging ] subtokens. Typically used for languages like Chinese. | +| `--textcat-multilabel`, `-TML` <Tag variant="new">2.2</Tag> | flag | Text classification classes aren't mutually exclusive (multilabel). | +| `--textcat-arch`, `-ta` <Tag variant="new">2.2</Tag> | option | Text classification model architecture. Defaults to `"bow"`. | +| `--textcat-positive-label`, `-tpl` <Tag variant="new">2.2</Tag> | option |Text classification positive label for binary classes with two labels. | | `--verbose`, `-VV` <Tag variant="new">2.0.13</Tag> | flag | Show more detailed messages during training. | | `--help`, `-h` | flag | Show help message and available arguments. 
| | **CREATES** | model, pickle | A spaCy model on each epoch. | diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md index 2af4ec0ce..35348217b 100644 --- a/website/docs/api/scorer.md +++ b/website/docs/api/scorer.md @@ -46,14 +46,16 @@ Update the evaluation scores from a single [`Doc`](/api/doc) / ## Properties -| Name | Type | Description | -| ---------------------------------------------- | ----- | ------------------------------------------------------------------------------------------------------------- | -| `token_acc` | float | Tokenization accuracy. | -| `tags_acc` | float | Part-of-speech tag accuracy (fine grained tags, i.e. `Token.tag`). | -| `uas` | float | Unlabelled dependency score. | -| `las` | float | Labelled dependency score. | -| `ents_p` | float | Named entity accuracy (precision). | -| `ents_r` | float | Named entity accuracy (recall). | -| `ents_f` | float | Named entity accuracy (F-score). | -| `ents_per_type` <Tag variant="new">2.1.5</Tag> | dict | Scores per entity label. Keyed by label, mapped to a dict of `p`, `r` and `f` scores. | -| `scores` | dict | All scores with keys `uas`, `las`, `ents_p`, `ents_r`, `ents_f`, `ents_per_type`, `tags_acc` and `token_acc`. | +| Name | Type | Description | +| ----------------------------------------------- | ----- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `token_acc` | float | Tokenization accuracy. | +| `tags_acc` | float | Part-of-speech tag accuracy (fine grained tags, i.e. `Token.tag`). | +| `uas` | float | Unlabelled dependency score. | +| `las` | float | Labelled dependency score. | +| `ents_p` | float | Named entity accuracy (precision). | +| `ents_r` | float | Named entity accuracy (recall). | +| `ents_f` | float | Named entity accuracy (F-score). | +| `ents_per_type` <Tag variant="new">2.1.5</Tag> | dict | Scores per entity label. Keyed by label, mapped to a dict of `p`, `r` and `f` scores. | +| `textcat_score` <Tag variant="new">2.2</Tag> | float | F-score on positive label for binary exclusive, macro-averaged F-score for 3+ exclusive, macro-averaged AUC ROC score for multilabel (`-1` if undefined). | +| `textcats_per_cat` <Tag variant="new">2.2</Tag> | dict | Scores per textcat label, keyed by label. | +| `scores` | dict | All scores, keyed by type. | From 3ba5238282d5ea84d5d2a71b5940de30fbaf3331 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Mon, 16 Sep 2019 15:16:12 +0200 Subject: [PATCH 166/207] Make "unnamed vectors" warning a real warning --- spacy/_ml.py | 5 +---- spacy/errors.py | 2 ++ 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index d81ceccc1..6104324ab 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -286,10 +286,7 @@ def link_vectors_to_models(vocab): if vectors.name is None: vectors.name = VECTORS_KEY if vectors.data.size != 0: - print( - "Warning: Unnamed vectors -- this won't allow multiple vectors " - "models to be loaded. 
(Shape: (%d, %d))" % vectors.data.shape - ) + user_warning(Warnings.W020.format(shape=vectors.data.shape)) ops = Model.ops for word in vocab: if word.orth in vectors.key2row: diff --git a/spacy/errors.py b/spacy/errors.py index 80c4c6f85..b2a201773 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -84,6 +84,8 @@ class Warnings(object): W018 = ("Entity '{entity}' already exists in the Knowledge base.") W019 = ("Changing vectors name from {old} to {new}, to avoid clash with " "previously loaded vectors. See Issue #3853.") + W020 = ("Unnamed vectors. This won't allow multiple vectors models to be " + "loaded. (Shape: {shape})") @add_codes From cb6c68a57371e9914271b57585b9e92e8cfb9fcf Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Mon, 16 Sep 2019 15:16:29 +0200 Subject: [PATCH 167/207] Pass vectors name correctly in prune_vectors --- spacy/vocab.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 021da02fc..869c92921 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -318,7 +318,7 @@ cdef class Vocab: keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64") keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]]) toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]]) - self.vectors = Vectors(data=keep, keys=keys) + self.vectors = Vectors(data=keep, keys=keys, name=self.vectors.name) syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size) remap = {} for i, key in enumerate(keys[nr_row:]): From bf06d9d537030799228d8fdc4b20a30a74c4e512 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Mon, 16 Sep 2019 15:16:41 +0200 Subject: [PATCH 168/207] Allow passing vectors_name to Vocab --- spacy/vocab.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 869c92921..9c9f85d05 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -33,7 +33,8 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab """ def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None, - strings=tuple(), lookups=None, oov_prob=-20., **deprecated_kwargs): + strings=tuple(), lookups=None, oov_prob=-20., vectors_name=None, + **deprecated_kwargs): """Create the vocabulary. 
lex_attr_getters (dict): A dictionary mapping attribute IDs to @@ -62,7 +63,7 @@ cdef class Vocab: _ = self[string] self.lex_attr_getters = lex_attr_getters self.morphology = Morphology(self.strings, tag_map, lemmatizer) - self.vectors = Vectors() + self.vectors = Vectors(name=vectors_name) self.lookups = lookups @property From 139428c20f357f7d5b474c1b996e9a8a79f636ed Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Mon, 16 Sep 2019 15:16:54 +0200 Subject: [PATCH 169/207] Set unique vector names in tests --- spacy/tests/regression/test_issue1501-2000.py | 2 +- spacy/tests/regression/test_issue2501-3000.py | 2 +- spacy/tests/vocab_vectors/test_vectors.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index 24f725ab8..520090bb4 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -187,7 +187,7 @@ def test_issue1799(): def test_issue1807(): """Test vocab.set_vector also adds the word to the vocab.""" - vocab = Vocab() + vocab = Vocab(vectors_name="test_issue1807") assert "hello" not in vocab vocab.set_vector("hello", numpy.ones((50,), dtype="f")) assert "hello" in vocab diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index cf29c2535..a0b1e2aac 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -184,7 +184,7 @@ def test_issue2833(en_vocab): def test_issue2871(): """Test that vectors recover the correct key for spaCy reserved words.""" words = ["dog", "cat", "SUFFIX"] - vocab = Vocab() + vocab = Vocab(vectors_name="test_issue2871") vocab.vectors.resize(shape=(3, 10)) vector_data = numpy.zeros((3, 10), dtype="f") for word in words: diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index 2a828de9c..4226bca3b 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -259,7 +259,7 @@ def test_vectors_doc_doc_similarity(vocab, text1, text2): def test_vocab_add_vector(): - vocab = Vocab() + vocab = Vocab(vectors_name="test_vocab_add_vector") data = numpy.ndarray((5, 3), dtype="f") data[0] = 1.0 data[1] = 2.0 @@ -272,7 +272,7 @@ def test_vocab_add_vector(): def test_vocab_prune_vectors(): - vocab = Vocab() + vocab = Vocab(vectors_name="test_vocab_prune_vectors") _ = vocab["cat"] # noqa: F841 _ = vocab["dog"] # noqa: F841 _ = vocab["kitten"] # noqa: F841 From a84025d70b276df3a84b31b024525c51c04f53cf Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Mon, 16 Sep 2019 23:32:41 +0200 Subject: [PATCH 170/207] Remove --no-deps from default pip args on download Add warning if user is executing spaCy without having it installed and add --no-deps to prevent the package from being redownloaded --- spacy/cli/download.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 8a993178a..64ab03a75 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -28,6 +28,16 @@ def download(model, direct=False, *pip_args): can be shortcut, model name or, if --direct flag is set, full model name with version. For direct downloads, the compatibility check will be skipped. 
""" + if not require_package("spacy") and "--no-deps" not in pip_args: + msg.warn( + "Skipping model package dependencies and setting `--no-deps`. " + "You don't seem to have the spaCy package itself installed " + "(maybe because you've built from source?), so installing the " + "model dependencies would cause spaCy to be downloaded, which " + "probably isn't what you want. If the model package has other " + "dependencies, you'll have to install them manually." + ) + pip_args = pip_args + ("--no-deps",) dl_tpl = "{m}-{v}/{m}-{v}.tar.gz#egg={m}=={v}" if direct: components = model.split("-") @@ -72,12 +82,15 @@ def download(model, direct=False, *pip_args): # is_package check currently fails, because pkg_resources.working_set # is not refreshed automatically (see #3923). We're trying to work # around this here be requiring the package explicitly. - try: - pkg_resources.working_set.require(model_name) - except: # noqa: E722 - # Maybe it's possible to remove this – mostly worried about cross- - # platform and cross-Python copmpatibility here - pass + require_package(model_name) + + +def require_package(name): + try: + pkg_resources.working_set.require(name) + return True + except: # noqa: E722 + return False def get_json(url, desc): @@ -117,7 +130,7 @@ def get_version(model, comp): def download_model(filename, user_pip_args=None): download_url = about.__download_url__ + "/" + filename - pip_args = ["--no-cache-dir", "--no-deps"] + pip_args = ["--no-cache-dir"] if user_pip_args: pip_args.extend(user_pip_args) cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url] From ea2a686cf71874257b76465d8335b37e60409e19 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Wed, 18 Sep 2019 11:42:45 +0200 Subject: [PATCH 171/207] Support new model sources format [ci skip] --- website/src/templates/models.js | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/website/src/templates/models.js b/website/src/templates/models.js index 29278e919..1c90c60b6 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -108,6 +108,17 @@ function formatModelMeta(data) { } } +function formatSources(data = []) { + const sources = Array.isArray(data) ? data.map(s => ({ name: s })) : data + return sources.map(({ name, url, author }, i) => ( + <> + {i > 0 && ', '} + {name && url ? <Link to={url}>{name}</Link> : name} + {author && ` (${author})`} + </> + )) +} + const Help = ({ children }) => ( <span data-tooltip={children}> <Icon name="help2" width={16} variant="subtle" inline /> @@ -142,7 +153,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl const releaseUrl = `https://github.com/${repo}/releases/${releaseTag}` const pipeline = meta.pipeline && join(meta.pipeline.map(p => <InlineCode key={p}>{p}</InlineCode>)) - const sources = meta.sources && join(meta.sources) + const sources = formatSources(meta.sources) const author = !meta.url ? meta.author : <Link to={meta.url}>{meta.author}</Link> const licenseUrl = licenses[meta.license] ? licenses[meta.license].url : null const license = licenseUrl ? 
<Link to={licenseUrl}>{meta.license}</Link> : meta.license From c922f8e8b058e29536b0c26682be92859dc9e00d Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Wed, 18 Sep 2019 12:09:21 +0200 Subject: [PATCH 172/207] Fix sources rendering [ci skip] --- website/src/templates/models.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/src/templates/models.js b/website/src/templates/models.js index 1c90c60b6..4a46be9cb 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -15,7 +15,7 @@ import Link from '../components/link' import Grid from '../components/grid' import Infobox from '../components/infobox' import Accordion from '../components/accordion' -import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util' +import { join, arrayToObj, abbrNum, markdownToReact, isString } from '../components/util' const MODEL_META = { core: 'Vocabulary, syntax, entities, vectors', @@ -109,7 +109,7 @@ function formatModelMeta(data) { } function formatSources(data = []) { - const sources = Array.isArray(data) ? data.map(s => ({ name: s })) : data + const sources = data.map(s => (isString(s) ? { name: s } : s)) return sources.map(({ name, url, author }, i) => ( <> {i > 0 && ', '} From 1c8de6b2e514e4d0f9a23173495b956509aadb3f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Wed, 18 Sep 2019 13:13:51 +0200 Subject: [PATCH 173/207] Rename DocBox->DocPallet --- spacy/tokens/__init__.py | 3 ++- spacy/tokens/_serialize.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/tokens/__init__.py b/spacy/tokens/__init__.py index 5722d45bc..28d86bb18 100644 --- a/spacy/tokens/__init__.py +++ b/spacy/tokens/__init__.py @@ -4,5 +4,6 @@ from __future__ import unicode_literals from .doc import Doc from .token import Token from .span import Span +from ._serialize import DocPallet -__all__ = ["Doc", "Token", "Span"] +__all__ = ["Doc", "Token", "Span", "DocPallet"] diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 41f524839..222806545 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -11,7 +11,7 @@ from ..tokens import Doc from ..attrs import SPACY, ORTH -class DocBox(object): +class DocPallet(object): """Serialize analyses from a collection of doc objects.""" def __init__(self, attrs=None, store_user_data=False): From 3507943b15e27cad91e55d2c3ff24ad0d959c683 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Wed, 18 Sep 2019 13:25:47 +0200 Subject: [PATCH 174/207] Add docstring for DocPallet --- spacy/tokens/_serialize.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 222806545..473d941b4 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -12,8 +12,34 @@ from ..attrs import SPACY, ORTH class DocPallet(object): - """Serialize analyses from a collection of doc objects.""" + """Pack Doc objects for export. + + The DocPallet class lets you efficiently serialize the information from a + collection of Doc objects. You can control which information is serialized + by passing a list of attribute IDs, and optionally also specify whether the + user data is serialized. The DocPallet is faster and produces smaller data + sizes than pickle, and allows you to deserialize without executing arbitrary + Python code. 
+ The serialization format is gzipped msgpack, where the msgpack object has + the following structure: + + { + "attrs": List[uint64], # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE] + "tokens": bytes, # Serialized numpy uint64 array with the token data + "spaces": bytes, # Serialized numpy boolean array with spaces data + "lengths": bytes, # Serialized numpy int32 array with the doc lengths + "strings": List[unicode] # List of unique strings in the token data + } + + Strings for the words, tags, labels etc are represented by 64-bit hashes in + the token data, and every string that occurs at least once is passed via the + strings object. This means the storage is more efficient if you pack more + documents together, because you have less duplication in the strings. + + A notable downside to this format is that you can't easily extract just one + document from the pallet. + """ def __init__(self, attrs=None, store_user_data=False): """Create a DocBox object, to hold serialized annotations. From 88a23cf49ab64df260079bbf022fc309f6d42540 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Wed, 18 Sep 2019 13:38:29 +0200 Subject: [PATCH 175/207] Fix name --- spacy/tokens/_serialize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 473d941b4..dfdad33e7 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -157,4 +157,4 @@ copy_reg.pickle(DocBox, pickle_box, unpickle_box) # Compatibility, as we had named it this previously. Binder = DocBox -__all__ = ["DocBox"] +__all__ = ["DocPallet"] From fa9a283128649c162acdfa82066ba330a1c7e9bb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Wed, 18 Sep 2019 13:40:03 +0200 Subject: [PATCH 176/207] Fix name --- spacy/tokens/_serialize.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index dfdad33e7..51d3fc558 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -41,7 +41,7 @@ class DocPallet(object): document from the pallet. """ def __init__(self, attrs=None, store_user_data=False): - """Create a DocBox object, to hold serialized annotations. + """Create a DocPallet object, to hold serialized annotations. attrs (list): List of attributes to serialize. 'orth' and 'spacy' are always serialized, so they're not required. Defaults to None. 
@@ -57,7 +57,7 @@ class DocPallet(object): self.store_user_data = store_user_data def add(self, doc): - """Add a doc's annotations to the DocBox for serialization.""" + """Add a doc's annotations to the DocPallet for serialization.""" array = doc.to_array(self.attrs) if len(array.shape) == 1: array = array.reshape((array.shape[0], 1)) @@ -86,7 +86,7 @@ class DocPallet(object): yield doc def merge(self, other): - """Extend the annotations of this DocBox with the annotations from another.""" + """Extend the annotations of this DocPallet with the annotations from another.""" assert self.attrs == other.attrs self.tokens.extend(other.tokens) self.spaces.extend(other.spaces) @@ -95,7 +95,7 @@ class DocPallet(object): self.user_data.extend(other.user_data) def to_bytes(self): - """Serialize the DocBox's annotations into a byte string.""" + """Serialize the DocPallet's annotations into a byte string.""" for tokens in self.tokens: assert len(tokens.shape) == 2, tokens.shape lengths = [len(tokens) for tokens in self.tokens] @@ -111,7 +111,7 @@ class DocPallet(object): return gzip.compress(srsly.msgpack_dumps(msg)) def from_bytes(self, string): - """Deserialize the DocBox's annotations from a byte string.""" + """Deserialize the DocPallet's annotations from a byte string.""" msg = srsly.msgpack_loads(gzip.decompress(string)) self.attrs = msg["attrs"] self.strings = set(msg["strings"]) @@ -134,7 +134,7 @@ def merge_boxes(boxes): merged = None for byte_string in boxes: if byte_string is not None: - box = DocBox(store_user_data=True).from_bytes(byte_string) + box = DocPallet(store_user_data=True).from_bytes(byte_string) if merged is None: merged = box else: @@ -150,11 +150,11 @@ def pickle_box(box): def unpickle_box(byte_string): - return DocBox().from_bytes(byte_string) + return DocPallet().from_bytes(byte_string) -copy_reg.pickle(DocBox, pickle_box, unpickle_box) +copy_reg.pickle(DocPallet, pickle_box, unpickle_box) # Compatibility, as we had named it this previously. -Binder = DocBox +Binder = DocPallet __all__ = ["DocPallet"] From f537cbeacc4bdea6546f9bae17617302d7a6a547 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Wed, 18 Sep 2019 14:07:55 +0200 Subject: [PATCH 177/207] Update v2-2 docs --- website/docs/usage/v2-2.md | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/website/docs/usage/v2-2.md b/website/docs/usage/v2-2.md index 2109ae812..8f339cb9b 100644 --- a/website/docs/usage/v2-2.md +++ b/website/docs/usage/v2-2.md @@ -98,9 +98,10 @@ on disk**. > #### Example > -> ```python -> scorer = nlp.evaluate(dev_data) -> print(scorer.textcat_scores, scorer.textcats_per_cat) +> ```bash +> spacy train en /path/to/output /path/to/train /path/to/dev \ +> --pipeline textcat \ +> --textcat-arch simple_cnn --textcat-multilabel > ``` When training your models using the `spacy train` command, you can now also @@ -117,6 +118,34 @@ classification. </Infobox> +### New DocPallet class to efficiently Doc collections + +> #### Example +> +> ```python +> from spacy.tokens import DocPallet +> pallet = DocPallet(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=False) +> for doc in nlp.pipe(texts): +> pallet.add(doc) +> byte_data = pallet.to_bytes() +> # Deserialize later, e.g. 
in a new process +> nlp = spacy.blank("en") +> pallet = DocPallet() +> docs = list(pallet.get_docs(nlp.vocab)) +> ``` + +If you're working with lots of data, you'll probably need to pass analyses +between machines, either to use something like Dask or Spark, or even just to +save out work to disk. Often it's sufficient to use the doc.to_array() +functionality for this, and just serialize the numpy arrays --- but other times +you want a more general way to save and restore `Doc` objects. + +The new `DocPallet` class makes it easy to serialize and deserialize +a collection of `Doc` objects together, and is much more efficient than +calling `doc.to_bytes()` on each individual `Doc` object. You can also control +what data gets saved, and you can merge pallets together for easy +map/reduce-style processing. + ### CLI command to debug and validate training data {#debug-data} > #### Example From e53b86751f2aa288fe8b83f6c5df2447e6b865e8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Wed, 18 Sep 2019 15:15:37 +0200 Subject: [PATCH 178/207] DocPallet -> DocBin --- spacy/tokens/__init__.py | 4 ++-- spacy/tokens/_serialize.py | 42 +++++++++++++++++++------------------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/spacy/tokens/__init__.py b/spacy/tokens/__init__.py index 28d86bb18..536ec8349 100644 --- a/spacy/tokens/__init__.py +++ b/spacy/tokens/__init__.py @@ -4,6 +4,6 @@ from __future__ import unicode_literals from .doc import Doc from .token import Token from .span import Span -from ._serialize import DocPallet +from ._serialize import DocBin -__all__ = ["Doc", "Token", "Span", "DocPallet"] +__all__ = ["Doc", "Token", "Span", "DocBin"] diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 51d3fc558..54078fe60 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -11,13 +11,13 @@ from ..tokens import Doc from ..attrs import SPACY, ORTH -class DocPallet(object): - """Pack Doc objects for export. +class DocBin(object): + """Pack Doc objects for binary serialization. - The DocPallet class lets you efficiently serialize the information from a + The DocBin class lets you efficiently serialize the information from a collection of Doc objects. You can control which information is serialized by passing a list of attribute IDs, and optionally also specify whether the - user data is serialized. The DocPallet is faster and produces smaller data + user data is serialized. The DocBin is faster and produces smaller data sizes than pickle, and allows you to deserialize without executing arbitrary Python code. @@ -41,7 +41,7 @@ class DocPallet(object): document from the pallet. """ def __init__(self, attrs=None, store_user_data=False): - """Create a DocPallet object, to hold serialized annotations. + """Create a DocBin object, to hold serialized annotations. attrs (list): List of attributes to serialize. 'orth' and 'spacy' are always serialized, so they're not required. Defaults to None. 
@@ -57,7 +57,7 @@ class DocPallet(object): self.store_user_data = store_user_data def add(self, doc): - """Add a doc's annotations to the DocPallet for serialization.""" + """Add a doc's annotations to the DocBin for serialization.""" array = doc.to_array(self.attrs) if len(array.shape) == 1: array = array.reshape((array.shape[0], 1)) @@ -86,7 +86,7 @@ class DocPallet(object): yield doc def merge(self, other): - """Extend the annotations of this DocPallet with the annotations from another.""" + """Extend the annotations of this DocBin with the annotations from another.""" assert self.attrs == other.attrs self.tokens.extend(other.tokens) self.spaces.extend(other.spaces) @@ -95,7 +95,7 @@ class DocPallet(object): self.user_data.extend(other.user_data) def to_bytes(self): - """Serialize the DocPallet's annotations into a byte string.""" + """Serialize the DocBin's annotations into a byte string.""" for tokens in self.tokens: assert len(tokens.shape) == 2, tokens.shape lengths = [len(tokens) for tokens in self.tokens] @@ -111,7 +111,7 @@ class DocPallet(object): return gzip.compress(srsly.msgpack_dumps(msg)) def from_bytes(self, string): - """Deserialize the DocPallet's annotations from a byte string.""" + """Deserialize the DocBin's annotations from a byte string.""" msg = srsly.msgpack_loads(gzip.decompress(string)) self.attrs = msg["attrs"] self.strings = set(msg["strings"]) @@ -130,31 +130,31 @@ class DocPallet(object): return self -def merge_boxes(boxes): +def merge_bins(bins): merged = None - for byte_string in boxes: + for byte_string in bins: if byte_string is not None: - box = DocPallet(store_user_data=True).from_bytes(byte_string) + doc_bin = DocBin(store_user_data=True).from_bytes(byte_string) if merged is None: - merged = box + merged = doc_bin else: - merged.merge(box) + merged.merge(doc_bin) if merged is not None: return merged.to_bytes() else: return b"" -def pickle_box(box): - return (unpickle_box, (box.to_bytes(),)) +def pickle_bin(docbin): + return (unpickle_bin, (bin_.to_bytes(),)) -def unpickle_box(byte_string): - return DocPallet().from_bytes(byte_string) +def unpickle_bin(byte_string): + return DocBin().from_bytes(byte_string) -copy_reg.pickle(DocPallet, pickle_box, unpickle_box) +copy_reg.pickle(DocBin, pickle_bin, unpickle_bin) # Compatibility, as we had named it this previously. -Binder = DocPallet +Binder = DocBin -__all__ = ["DocPallet"] +__all__ = ["DocBin"] From 931e96b6c7961df670ec84bec9f7b917d531bd55 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Wed, 18 Sep 2019 15:16:57 +0200 Subject: [PATCH 179/207] DocPallet->DocBin in docs --- website/docs/usage/v2-2.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/website/docs/usage/v2-2.md b/website/docs/usage/v2-2.md index 8f339cb9b..b6bd308e0 100644 --- a/website/docs/usage/v2-2.md +++ b/website/docs/usage/v2-2.md @@ -118,20 +118,20 @@ classification. </Infobox> -### New DocPallet class to efficiently Doc collections +### New DocBin class to efficiently serialize Doc collections > #### Example > > ```python -> from spacy.tokens import DocPallet -> pallet = DocPallet(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=False) +> from spacy.tokens import DocBin +> doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=False) > for doc in nlp.pipe(texts): -> pallet.add(doc) -> byte_data = pallet.to_bytes() +> doc_bin.add(doc) +> byte_data = docbin.to_bytes() > # Deserialize later, e.g. 
in a new process > nlp = spacy.blank("en") -> pallet = DocPallet() -> docs = list(pallet.get_docs(nlp.vocab)) +> doc_bin = DocBin() +> docs = list(doc_bin.get_docs(nlp.vocab)) > ``` If you're working with lots of data, you'll probably need to pass analyses @@ -140,7 +140,7 @@ save out work to disk. Often it's sufficient to use the doc.to_array() functionality for this, and just serialize the numpy arrays --- but other times you want a more general way to save and restore `Doc` objects. -The new `DocPallet` class makes it easy to serialize and deserialize +The new `DocBin` class makes it easy to serialize and deserialize a collection of `Doc` objects together, and is much more efficient than calling `doc.to_bytes()` on each individual `Doc` object. You can also control what data gets saved, and you can merge pallets together for easy From 0f7fe5e7a7fff016597a38e73b223e9170ab1ea8 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Wed, 18 Sep 2019 19:18:30 +0200 Subject: [PATCH 180/207] Auto-format and fix typo and consistency --- spacy/tokens/_serialize.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 54078fe60..8e4e24d46 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -13,7 +13,7 @@ from ..attrs import SPACY, ORTH class DocBin(object): """Pack Doc objects for binary serialization. - + The DocBin class lets you efficiently serialize the information from a collection of Doc objects. You can control which information is serialized by passing a list of attribute IDs, and optionally also specify whether the @@ -23,7 +23,7 @@ class DocBin(object): The serialization format is gzipped msgpack, where the msgpack object has the following structure: - + { "attrs": List[uint64], # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE] "tokens": bytes, # Serialized numpy uint64 array with the token data @@ -40,6 +40,7 @@ class DocBin(object): A notable downside to this format is that you can't easily extract just one document from the pallet. """ + def __init__(self, attrs=None, store_user_data=False): """Create a DocBin object, to hold serialized annotations. @@ -145,8 +146,8 @@ def merge_bins(bins): return b"" -def pickle_bin(docbin): - return (unpickle_bin, (bin_.to_bytes(),)) +def pickle_bin(doc_bin): + return (unpickle_bin, (doc_bin.to_bytes(),)) def unpickle_bin(byte_string): From bd435fadddfe21d6f3e94a32ed350ca4d49772b3 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Wed, 18 Sep 2019 19:56:43 +0200 Subject: [PATCH 181/207] Add note about usage docs [ci skip] --- website/docs/api/entityruler.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md index 006ba90e6..5b93fceac 100644 --- a/website/docs/api/entityruler.md +++ b/website/docs/api/entityruler.md @@ -10,7 +10,9 @@ token-based rules or exact phrase matches. It can be combined with the statistical [`EntityRecognizer`](/api/entityrecognizer) to boost accuracy, or used on its own to implement a purely rule-based entity recognition system. After initialization, the component is typically added to the processing -pipeline using [`nlp.add_pipe`](/api/language#add_pipe). +pipeline using [`nlp.add_pipe`](/api/language#add_pipe). For usage examples, see +the docs on +[rule-based entity recogntion](/usage/rule-based-matching#entityruler). 
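For readers who want a concrete picture of the workflow the paragraph above describes, a minimal sketch (the patterns and text below are invented for illustration and are not part of this patch):

```python
# Minimal EntityRuler sketch -- illustrative patterns only.
import spacy
from spacy.pipeline import EntityRuler

nlp = spacy.blank("en")
ruler = EntityRuler(nlp)
ruler.add_patterns([
    {"label": "ORG", "pattern": "spaCy"},                # phrase pattern
    {"label": "GPE", "pattern": [{"LOWER": "berlin"}]},  # token pattern
])
nlp.add_pipe(ruler)
doc = nlp("spaCy was built in Berlin.")
print([(ent.text, ent.label_) for ent in doc.ents])
# [('spaCy', 'ORG'), ('Berlin', 'GPE')]
```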
## EntityRuler.\_\_init\_\_ {#init tag="method"} From 1f648ecb76cd3fe5c78008d0d7db41eb6dd3b2bc Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Wed, 18 Sep 2019 19:56:55 +0200 Subject: [PATCH 182/207] Auto-format --- spacy/scorer.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/spacy/scorer.py b/spacy/scorer.py index 30ed389cd..19f157451 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -338,7 +338,7 @@ class Scorer(object): # c. Neither the name of the Scikit-learn Developers nor the names of # its contributors may be used to endorse or promote products # derived from this software without specific prior written -# permission. +# permission. # # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" @@ -353,6 +353,7 @@ class Scorer(object): # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH # DAMAGE. + def _roc_auc_score(y_true, y_score): """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores. @@ -490,19 +491,19 @@ def _binary_clf_curve(y_true, y_score): thresholds : array, shape = [n_thresholds] Decreasing score values. """ - pos_label = 1. + pos_label = 1.0 y_true = np.ravel(y_true) y_score = np.ravel(y_score) # make y_true a boolean vector - y_true = (y_true == pos_label) + y_true = y_true == pos_label # sort scores and corresponding truth values desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1] y_score = y_score[desc_score_indices] y_true = y_true[desc_score_indices] - weight = 1. + weight = 1.0 # y_score typically has many tied values. Here we extract # the indices associated with the distinct values. We also @@ -533,8 +534,11 @@ def _stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08): """ out = np.cumsum(arr, axis=axis, dtype=np.float64) expected = np.sum(arr, axis=axis, dtype=np.float64) - if not np.all(np.isclose(out.take(-1, axis=axis), expected, rtol=rtol, - atol=atol, equal_nan=True)): + if not np.all( + np.isclose( + out.take(-1, axis=axis), expected, rtol=rtol, atol=atol, equal_nan=True + ) + ): raise ValueError(Errors.E163) return out From 2e5ab5b59c29f2d879b4f2a7f820c872f798d0ab Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Wed, 18 Sep 2019 19:57:08 +0200 Subject: [PATCH 183/207] Make except more explicit --- spacy/scorer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/scorer.py b/spacy/scorer.py index 19f157451..9c057d0a3 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -60,7 +60,7 @@ class ROCAUCScore(object): self.saved_score = _roc_auc_score(self.golds, self.cands) # catch ValueError: Only one class present in y_true. # ROC AUC score is not defined in that case. - except: + except ValueError: self.saved_score = -float("inf") self.saved_score_at_len = len(self.golds) return self.saved_score From 7e810cced64dee089fecc987194b182800006b6d Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Wed, 18 Sep 2019 19:57:21 +0200 Subject: [PATCH 184/207] Add references to docs pages --- spacy/lookups.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/spacy/lookups.py b/spacy/lookups.py index 7d100520f..c1f351fe5 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -29,6 +29,8 @@ class Lookups(object): """Initialize the Lookups object. RETURNS (Lookups): The newly created object. 
+ + DOCS: https://spacy.io/api/lookups#init """ self._tables = OrderedDict() @@ -56,6 +58,8 @@ class Lookups(object): name (unicode): Unique name of table. data (dict): Optional data to add to the table. RETURNS (Table): The newly added table. + + DOCS: https://spacy.io/api/lookups#add_table """ if name in self.tables: raise ValueError(Errors.E158.format(name=name)) @@ -68,6 +72,8 @@ class Lookups(object): name (unicode): Name of the table. RETURNS (Table): The table. + + DOCS: https://spacy.io/api/lookups#get_table """ if name not in self._tables: raise KeyError(Errors.E159.format(name=name, tables=self.tables)) @@ -78,6 +84,8 @@ class Lookups(object): name (unicode): Name of the table to remove. RETURNS (Table): The removed table. + + DOCS: https://spacy.io/api/lookups#remove_table """ if name not in self._tables: raise KeyError(Errors.E159.format(name=name, tables=self.tables)) @@ -88,6 +96,8 @@ class Lookups(object): name (unicode): Name of the table. RETURNS (bool): Whether a table of that name exists. + + DOCS: https://spacy.io/api/lookups#has_table """ return name in self._tables @@ -95,6 +105,8 @@ class Lookups(object): """Serialize the lookups to a bytestring. RETURNS (bytes): The serialized Lookups. + + DOCS: https://spacy.io/api/lookups#to_bytes """ return srsly.msgpack_dumps(self._tables) @@ -103,6 +115,8 @@ class Lookups(object): bytes_data (bytes): The data to load. RETURNS (Lookups): The loaded Lookups. + + DOCS: https://spacy.io/api/lookups#from_bytes """ for key, value in srsly.msgpack_loads(bytes_data).items(): self._tables[key] = Table(key) @@ -114,6 +128,8 @@ class Lookups(object): directory, which will be created if it doesn't exist. path (unicode / Path): The file path. + + DOCS: https://spacy.io/api/lookups#to_disk """ if len(self._tables): path = ensure_path(path) @@ -129,6 +145,8 @@ class Lookups(object): path (unicode / Path): The directory path. RETURNS (Lookups): The loaded lookups. + + DOCS: https://spacy.io/api/lookups#from_disk """ path = ensure_path(path) filepath = path / "lookups.bin" @@ -153,6 +171,8 @@ class Table(OrderedDict): data (dict): The dictionary. name (unicode): Optional table name for reference. RETURNS (Table): The newly created object. + + DOCS: https://spacy.io/api/lookups#table.from_dict """ self = cls(name=name) self.update(data) @@ -164,6 +184,8 @@ class Table(OrderedDict): name (unicode): Optional table name for reference. data (dict): Initial data, used to hint Bloom Filter. RETURNS (Table): The newly created object. + + DOCS: https://spacy.io/api/lookups#table.init """ OrderedDict.__init__(self) self.name = name @@ -228,6 +250,8 @@ class Table(OrderedDict): """Serialize table to a bytestring. RETURNS (bytes): The serialized table. + + DOCS: https://spacy.io/api/lookups#table.to_bytes """ data = [ ("name", self.name), @@ -241,6 +265,8 @@ class Table(OrderedDict): bytes_data (bytes): The data to load. RETURNS (Table): The loaded table. 
+ + DOCS: https://spacy.io/api/lookups#table.from_bytes """ loaded = srsly.msgpack_loads(bytes_data) data = loaded.get("dict", {}) From d62690b3ba2f2f29b9dd844a547bf1056fb29f14 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Wed, 18 Sep 2019 19:57:36 +0200 Subject: [PATCH 185/207] Update examples --- website/docs/usage/v2-2.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/website/docs/usage/v2-2.md b/website/docs/usage/v2-2.md index b6bd308e0..868430908 100644 --- a/website/docs/usage/v2-2.md +++ b/website/docs/usage/v2-2.md @@ -14,7 +14,7 @@ menu: > #### Example > -> ```python +> ```bash > python -m spacy download nl_core_news_sm > python -m spacy download nb_core_news_sm > python -m spacy download nb_core_news_md @@ -99,9 +99,9 @@ on disk**. > #### Example > > ```bash -> spacy train en /path/to/output /path/to/train /path/to/dev \ -> --pipeline textcat \ -> --textcat-arch simple_cnn --textcat-multilabel +> $ python -m spacy train en /output /train /dev \\ +> --pipeline textcat --textcat-arch simple_cnn \\ +> --textcat-multilabel > ``` When training your models using the `spacy train` command, you can now also From dd1810f05ad2211c03aae94fc6e46a7462a7c952 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Wed, 18 Sep 2019 20:23:21 +0200 Subject: [PATCH 186/207] Update DocBin and add docs --- spacy/errors.py | 2 + spacy/tokens/_serialize.py | 71 ++++++++++--- website/docs/api/docbin.md | 149 +++++++++++++++++++++++++++ website/docs/usage/saving-loading.md | 33 ++++++ website/docs/usage/v2-2.md | 34 +++--- website/meta/sidebars.json | 3 +- 6 files changed, 262 insertions(+), 30 deletions(-) create mode 100644 website/docs/api/docbin.md diff --git a/spacy/errors.py b/spacy/errors.py index b2a201773..f3234a06b 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -467,6 +467,8 @@ class Errors(object): E164 = ("x is neither increasing nor decreasing: {}.") E165 = ("Only one class present in y_true. ROC AUC score is not defined in " "that case.") + E166 = ("Can only merge DocBins with the same pre-defined attributes.\n" + "Current DocBin: {current}\nOther DocBin: {other}") @add_codes diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 8e4e24d46..634d7450a 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -8,7 +8,8 @@ from thinc.neural.ops import NumpyOps from ..compat import copy_reg from ..tokens import Doc -from ..attrs import SPACY, ORTH +from ..attrs import SPACY, ORTH, intify_attrs +from ..errors import Errors class DocBin(object): @@ -38,33 +39,46 @@ class DocBin(object): documents together, because you have less duplication in the strings. A notable downside to this format is that you can't easily extract just one - document from the pallet. + document from the DocBin. """ def __init__(self, attrs=None, store_user_data=False): - """Create a DocBin object, to hold serialized annotations. + """Create a DocBin object to hold serialized annotations. attrs (list): List of attributes to serialize. 'orth' and 'spacy' are always serialized, so they're not required. Defaults to None. + store_user_data (bool): Whether to include the `Doc.user_data`. + RETURNS (DocBin): The newly constructed object. 
+ + DOCS: https://spacy.io/api/docbin#init """ attrs = attrs or [] - # Ensure ORTH is always attrs[0] + attrs = sorted(intify_attrs(attrs)) self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY] - self.attrs.insert(0, ORTH) + self.attrs.insert(0, ORTH) # Ensure ORTH is always attrs[0] self.tokens = [] self.spaces = [] self.user_data = [] self.strings = set() self.store_user_data = store_user_data + def __len__(self): + """RETURNS: The number of Doc objects added to the DocBin.""" + return len(self.tokens) + def add(self, doc): - """Add a doc's annotations to the DocBin for serialization.""" + """Add a Doc's annotations to the DocBin for serialization. + + doc (Doc): The Doc object to add. + + DOCS: https://spacy.io/api/docbin#add + """ array = doc.to_array(self.attrs) if len(array.shape) == 1: array = array.reshape((array.shape[0], 1)) self.tokens.append(array) spaces = doc.to_array(SPACY) - assert array.shape[0] == spaces.shape[0] + assert array.shape[0] == spaces.shape[0] # this should never happen spaces = spaces.reshape((spaces.shape[0], 1)) self.spaces.append(numpy.asarray(spaces, dtype=bool)) self.strings.update(w.text for w in doc) @@ -72,7 +86,13 @@ class DocBin(object): self.user_data.append(srsly.msgpack_dumps(doc.user_data)) def get_docs(self, vocab): - """Recover Doc objects from the annotations, using the given vocab.""" + """Recover Doc objects from the annotations, using the given vocab. + + vocab (Vocab): The shared vocab. + YIELDS (Doc): The Doc objects. + + DOCS: https://spacy.io/api/docbin#get_docs + """ for string in self.strings: vocab[string] orth_col = self.attrs.index(ORTH) @@ -87,8 +107,16 @@ class DocBin(object): yield doc def merge(self, other): - """Extend the annotations of this DocBin with the annotations from another.""" - assert self.attrs == other.attrs + """Extend the annotations of this DocBin with the annotations from + another. Will raise an error if the pre-defined attrs of the two + DocBins don't match. + + other (DocBin): The DocBin to merge into the current bin. + + DOCS: https://spacy.io/api/docbin#merge + """ + if self.attrs != other.attrs: + raise ValueError(Errors.E166.format(current=self.attrs, other=other.attrs)) self.tokens.extend(other.tokens) self.spaces.extend(other.spaces) self.strings.update(other.strings) @@ -96,9 +124,14 @@ class DocBin(object): self.user_data.extend(other.user_data) def to_bytes(self): - """Serialize the DocBin's annotations into a byte string.""" + """Serialize the DocBin's annotations to a bytestring. + + RETURNS (bytes): The serialized DocBin. + + DOCS: https://spacy.io/api/docbin#to_bytes + """ for tokens in self.tokens: - assert len(tokens.shape) == 2, tokens.shape + assert len(tokens.shape) == 2, tokens.shape # this should never happen lengths = [len(tokens) for tokens in self.tokens] msg = { "attrs": self.attrs, @@ -111,9 +144,15 @@ class DocBin(object): msg["user_data"] = self.user_data return gzip.compress(srsly.msgpack_dumps(msg)) - def from_bytes(self, string): - """Deserialize the DocBin's annotations from a byte string.""" - msg = srsly.msgpack_loads(gzip.decompress(string)) + def from_bytes(self, bytes_data): + """Deserialize the DocBin's annotations from a bytestring. + + bytes_data (bytes): The data to load from. + RETURNS (DocBin): The loaded DocBin. 
+ + DOCS: https://spacy.io/api/docbin#from_bytes + """ + msg = srsly.msgpack_loads(gzip.decompress(bytes_data)) self.attrs = msg["attrs"] self.strings = set(msg["strings"]) lengths = numpy.fromstring(msg["lengths"], dtype="int32") @@ -127,7 +166,7 @@ class DocBin(object): if self.store_user_data and "user_data" in msg: self.user_data = list(msg["user_data"]) for tokens in self.tokens: - assert len(tokens.shape) == 2, tokens.shape + assert len(tokens.shape) == 2, tokens.shape # this should never happen return self diff --git a/website/docs/api/docbin.md b/website/docs/api/docbin.md new file mode 100644 index 000000000..a4525906e --- /dev/null +++ b/website/docs/api/docbin.md @@ -0,0 +1,149 @@ +--- +title: DocBin +tag: class +new: 2.2 +teaser: Pack Doc objects for binary serialization +source: spacy/tokens/_serialize.py +--- + +The `DocBin` class lets you efficiently serialize the information from a +collection of `Doc` objects. You can control which information is serialized by +passing a list of attribute IDs, and optionally also specify whether the user +data is serialized. The `DocBin` is faster and produces smaller data sizes than +pickle, and allows you to deserialize without executing arbitrary Python code. A +notable downside to this format is that you can't easily extract just one +document from the `DocBin`. The serialization format is gzipped msgpack, where +the msgpack object has the following structure: + +```python +### msgpack object strcutrue +{ + "attrs": List[uint64], # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE] + "tokens": bytes, # Serialized numpy uint64 array with the token data + "spaces": bytes, # Serialized numpy boolean array with spaces data + "lengths": bytes, # Serialized numpy int32 array with the doc lengths + "strings": List[unicode] # List of unique strings in the token data +} +``` + +Strings for the words, tags, labels etc are represented by 64-bit hashes in the +token data, and every string that occurs at least once is passed via the strings +object. This means the storage is more efficient if you pack more documents +together, because you have less duplication in the strings. For usage examples, +see the docs on [serializing `Doc` objects](/usage/saving-loading#docs). + +## DocBin.\_\_init\_\_ {#init tag="method"} + +Create a `DocBin` object to hold serialized annotations. + +> #### Example +> +> ```python +> from spacy.tokens import DocBin +> doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"]) +> ``` + +| Argument | Type | Description | +| ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `attrs` | list | List of attributes to serialize. `orth` (hash of token text) and `spacy` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `None`. | +| `store_user_data` | bool | Whether to include the `Doc.user_data`. Defaults to `False`. | +| **RETURNS** | `DocBin` | The newly constructed object. | + +## DocBin.\_\len\_\_ {#len tag="method"} + +Get the number of `Doc` objects that were added to the `DocBin`. + +> #### Example +> +> ```python +> doc_bin = DocBin(attrs=["LEMMA"]) +> doc = nlp("This is a document to serialize.") +> doc_bin.add(doc) +> assert len(doc_bin) == 1 +> ``` + +| Argument | Type | Description | +| ----------- | ---- | ------------------------------------------- | +| **RETURNS** | int | The number of `Doc`s added to the `DocBin`. 
| + +## DocBin.add {#add tag="method"} + +Add a `Doc`'s annotations to the `DocBin` for serialization. + +> #### Example +> +> ```python +> doc_bin = DocBin(attrs=["LEMMA"]) +> doc = nlp("This is a document to serialize.") +> doc_bin.add(doc) +> ``` + +| Argument | Type | Description | +| -------- | ----- | ------------------------ | +| `doc` | `Doc` | The `Doc` object to add. | + +## DocBin.get_docs {#get_docs tag="method"} + +Recover `Doc` objects from the annotations, using the given vocab. + +> #### Example +> +> ```python +> docs = list(doc_bin.get_docs(nlp.vocab)) +> ``` + +| Argument | Type | Description | +| ---------- | ------- | ------------------ | +| `vocab` | `Vocab` | The shared vocab. | +| **YIELDS** | `Doc` | The `Doc` objects. | + +## DocBin.merge {#merge tag="method"} + +Extend the annotations of this `DocBin` with the annotations from another. Will +raise an error if the pre-defined attrs of the two `DocBin`s don't match. + +> #### Example +> +> ```python +> doc_bin1 = DocBin(attrs=["LEMMA", "POS"]) +> doc_bin1.add(nlp("Hello world")) +> doc_bin2 = DocBin(attrs=["LEMMA", "POS"]) +> doc_bin2.add(nlp("This is a sentence")) +> merged_bins = doc_bin1.merge(doc_bin2) +> assert len(merged_bins) == 2 +> ``` + +| Argument | Type | Description | +| -------- | -------- | ------------------------------------------- | +| `other` | `DocBin` | The `DocBin` to merge into the current bin. | + +## DocBin.to_bytes {#to_bytes tag="method"} + +Serialize the `DocBin`'s annotations to a bytestring. + +> #### Example +> +> ```python +> doc_bin = DocBin(attrs=["DEP", "HEAD"]) +> doc_bin_bytes = doc_bin.to_bytes() +> ``` + +| Argument | Type | Description | +| ----------- | ----- | ------------------------ | +| **RETURNS** | bytes | The serialized `DocBin`. | + +## DocBin.from_bytes {#from_bytes tag="method"} + +Deserialize the `DocBin`'s annotations from a bytestring. + +> #### Example +> +> ```python +> doc_bin_bytes = doc_bin.to_bytes() +> new_doc_bin = DocBin().from_bytes(doc_bin_bytes) +> ``` + +| Argument | Type | Description | +| ------------ | -------- | ---------------------- | +| `bytes_data` | bytes | The data to load from. | +| **RETURNS** | `DocBin` | The loaded `DocBin`. | diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index d592277aa..3d904f01a 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -59,6 +59,39 @@ initializes the language class, creates and adds the pipeline components and _then_ loads in the binary data. You can read more about this process [here](/usage/processing-pipelines#pipelines). +### Serializing Doc objects efficiently {#docs new="2.2"} + +If you're working with lots of data, you'll probably need to pass analyses +between machines, either to use something like [Dask](https://dask.org) or +[Spark](https://spark.apache.org), or even just to save out work to disk. Often +it's sufficient to use the [`Doc.to_array`](/api/doc#to_array) functionality for +this, and just serialize the numpy arrays – but other times you want a more +general way to save and restore `Doc` objects. + +The [`DocBin`](/api/docbin) class makes it easy to serialize and deserialize a +collection of `Doc` objects together, and is much more efficient than calling +[`Doc.to_bytes`](/api/doc#to_bytes) on each individual `Doc` object. You can +also control what data gets saved, and you can merge pallets together for easy +map/reduce-style processing. 
+ +```python +### {highlight="4,8,9,13,14"} +import spacy +from spacy.tokens import DocBin + +doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True) +texts = ["Some text", "Lots of texts...", "..."] +nlp = spacy.load("en_core_web_sm") +for doc in nlp.pipe(texts): + doc_bin.add(doc) +bytes_data = docbin.to_bytes() + +# Deserialize later, e.g. in a new process +nlp = spacy.blank("en") +doc_bin = DocBin().from_bytes(bytes_data) +docs = list(doc_bin.get_docs(nlp.vocab)) +``` + ### Using Pickle {#pickle} > #### Example diff --git a/website/docs/usage/v2-2.md b/website/docs/usage/v2-2.md index 868430908..376a9ae10 100644 --- a/website/docs/usage/v2-2.md +++ b/website/docs/usage/v2-2.md @@ -121,30 +121,38 @@ classification. ### New DocBin class to efficiently serialize Doc collections > #### Example -> +> > ```python > from spacy.tokens import DocBin -> doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=False) +> doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True) > for doc in nlp.pipe(texts): > doc_bin.add(doc) -> byte_data = docbin.to_bytes() +> bytes_data = doc_bin.to_bytes() > # Deserialize later, e.g. in a new process > nlp = spacy.blank("en") -> doc_bin = DocBin() +> doc_bin = DocBin().from_bytes(bytes_data) > docs = list(doc_bin.get_docs(nlp.vocab)) > ``` If you're working with lots of data, you'll probably need to pass analyses -between machines, either to use something like Dask or Spark, or even just to -save out work to disk. Often it's sufficient to use the doc.to_array() -functionality for this, and just serialize the numpy arrays --- but other times -you want a more general way to save and restore `Doc` objects. +between machines, either to use something like [Dask](https://dask.org) or +[Spark](https://spark.apache.org), or even just to save out work to disk. Often +it's sufficient to use the `Doc.to_array` functionality for this, and just +serialize the numpy arrays – but other times you want a more general way to save +and restore `Doc` objects. -The new `DocBin` class makes it easy to serialize and deserialize -a collection of `Doc` objects together, and is much more efficient than -calling `doc.to_bytes()` on each individual `Doc` object. You can also control -what data gets saved, and you can merge pallets together for easy -map/reduce-style processing. +The new `DocBin` class makes it easy to serialize and deserialize a collection +of `Doc` objects together, and is much more efficient than calling +`Doc.to_bytes` on each individual `Doc` object. You can also control what data +gets saved, and you can merge pallets together for easy map/reduce-style +processing. 
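For instance, the merge step of such a map/reduce flow might look roughly like the sketch below, assuming a blank pipeline and throwaway texts used purely as placeholders:

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
attrs = ["LEMMA", "ENT_IOB", "ENT_TYPE"]

# "Map": each worker packs its share of the corpus into its own DocBin
bin_one = DocBin(attrs=attrs)
bin_one.add(nlp("First batch of text."))
bin_two = DocBin(attrs=attrs)
bin_two.add(nlp("Second batch of text."))

# "Reduce": merging extends the first bin in place (attrs must match)
bin_one.merge(bin_two)
assert len(bin_one) == 2
docs = list(bin_one.get_docs(nlp.vocab))
```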
+ +<Infobox> + +**API:** [`DocBin`](/api/docbin) **Usage: ** +[Serializing Doc objects](/usage/saving-loading#docs) + +</Infobox> ### CLI command to debug and validate training data {#debug-data} diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 7c6affe70..68d46605f 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -95,7 +95,8 @@ { "text": "KnowledgeBase", "url": "/api/kb" }, { "text": "GoldParse", "url": "/api/goldparse" }, { "text": "GoldCorpus", "url": "/api/goldcorpus" }, - { "text": "Scorer", "url": "/api/scorer" } + { "text": "Scorer", "url": "/api/scorer" }, + { "text": "DocBin", "url": "/api/docbin" } ] }, { From f2c8b1e3629ba23f1db62352868cb95374d42cdb Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Wed, 18 Sep 2019 20:24:41 +0200 Subject: [PATCH 187/207] Simplify lookup hashing Just use get_string_id, which already does everything ensure_hash was supposed to do --- spacy/lookups.py | 15 ++++----------- spacy/tests/vocab_vectors/test_lookups.py | 13 +++++++------ 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/spacy/lookups.py b/spacy/lookups.py index c1f351fe5..05a60f289 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -7,16 +7,9 @@ from preshed.bloom import BloomFilter from .errors import Errors from .util import SimpleFrozenDict, ensure_path -from .compat import basestring_ from .strings import get_string_id -def ensure_hash(key): - if isinstance(key, basestring_): - return get_string_id(key) - return key - - class Lookups(object): """Container for large lookup tables and dictionaries, e.g. lemmatization data or tokenizer exception lists. Lookups are available via vocab.lookups, @@ -202,7 +195,7 @@ class Table(OrderedDict): key (unicode / int): The key to set. value: The value to set. """ - key = ensure_hash(key) + key = get_string_id(key) OrderedDict.__setitem__(self, key, value) self.bloom.add(key) @@ -221,7 +214,7 @@ class Table(OrderedDict): key (unicode / int): The key to get. RETURNS: The value. """ - key = ensure_hash(key) + key = get_string_id(key) return OrderedDict.__getitem__(self, key) def get(self, key, default=None): @@ -231,7 +224,7 @@ class Table(OrderedDict): default: The default value to return. RETURNS: The value. """ - key = ensure_hash(key) + key = get_string_id(key) return OrderedDict.get(self, key, default) def __contains__(self, key): @@ -240,7 +233,7 @@ class Table(OrderedDict): key (unicode / int): The key to check. RETURNS (bool): Whether the key is in the table. 
""" - key = ensure_hash(key) + key = get_string_id(key) # This can give a false positive, so we need to check it after if key not in self.bloom: return False diff --git a/spacy/tests/vocab_vectors/test_lookups.py b/spacy/tests/vocab_vectors/test_lookups.py index 02f25532a..f78dd33c4 100644 --- a/spacy/tests/vocab_vectors/test_lookups.py +++ b/spacy/tests/vocab_vectors/test_lookups.py @@ -2,7 +2,8 @@ from __future__ import unicode_literals import pytest -from spacy.lookups import Lookups, Table, ensure_hash +from spacy.lookups import Lookups, Table +from spacy.strings import get_string_id from spacy.vocab import Vocab from ..util import make_tempdir @@ -45,17 +46,17 @@ def test_table_api(): table = Table(name="table", data=data) assert len(table) == len(data) assert "foo" in table - assert ensure_hash("foo") in table + assert get_string_id("foo") in table assert table["foo"] == "bar" - assert table[ensure_hash("foo")] == "bar" + assert table[get_string_id("foo")] == "bar" assert table.get("foo") == "bar" assert table.get("abc") is None table["abc"] = 123 assert table["abc"] == 123 - assert table[ensure_hash("abc")] == 123 + assert table[get_string_id("abc")] == 123 table.set("def", 456) assert table["def"] == 456 - assert table[ensure_hash("def")] == 456 + assert table[get_string_id("def")] == 456 def test_table_api_to_from_bytes(): @@ -66,7 +67,7 @@ def test_table_api_to_from_bytes(): assert new_table.name == "table" assert len(new_table) == 3 assert new_table["foo"] == "bar" - assert new_table[ensure_hash("foo")] == "bar" + assert new_table[get_string_id("foo")] == "bar" new_table2 = Table(data={"def": 456}) new_table2.from_bytes(table_bytes) assert len(new_table2) == 3 From 00a8cbc306793d94d681de7dccd59065448bb05c Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Wed, 18 Sep 2019 20:27:03 +0200 Subject: [PATCH 188/207] Tidy up and auto-format --- spacy/cli/debug_data.py | 8 ++++---- spacy/cli/train.py | 14 ++++++++------ spacy/lang/char_classes.py | 4 +++- spacy/language.py | 1 - spacy/tests/test_scorer.py | 13 +++++-------- 5 files changed, 20 insertions(+), 20 deletions(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 979010533..1a0bb9c04 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -293,12 +293,13 @@ def debug_data( "The train and dev labels are not the same. " "Train labels: {}. " "Dev labels: {}.".format( - _format_labels(gold_train_data["cats"]), + _format_labels(gold_train_data["cats"]), _format_labels(gold_dev_data["cats"]), - ) + ) ) if gold_train_data["n_cats_multilabel"] > 0: - msg.info("The train data contains instances without " + msg.info( + "The train data contains instances without " "mutually-exclusive classes. Use '--textcat-multilabel' " "when training." 
) @@ -481,7 +482,6 @@ def debug_data( ) ) - msg.divider("Summary") good_counts = msg.counts[MESSAGES.GOOD] warn_counts = msg.counts[MESSAGES.WARN] diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 784a12320..e2bdddda3 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -182,7 +182,7 @@ def train( base_cfg = { "exclusive_classes": textcat_cfg["exclusive_classes"], "architecture": textcat_cfg["architecture"], - "positive_label": textcat_cfg["positive_label"] + "positive_label": textcat_cfg["positive_label"], } pipe_cfg = { "exclusive_classes": not textcat_multilabel, @@ -190,12 +190,13 @@ def train( "positive_label": textcat_positive_label, } if base_cfg != pipe_cfg: - msg.fail("The base textcat model configuration does" + msg.fail( + "The base textcat model configuration does" "not match the provided training options. " "Existing cfg: {}, provided cfg: {}".format( base_cfg, pipe_cfg ), - exits=1 + exits=1, ) else: msg.text("Starting with blank model '{}'".format(lang)) @@ -298,9 +299,10 @@ def train( break if base_model and set(textcat_labels) != train_labels: msg.fail( - "Cannot extend textcat model using data with different " - "labels. Base model labels: {}, training data labels: " - "{}.".format(textcat_labels, list(train_labels)), exits=1 + "Cannot extend textcat model using data with different " + "labels. Base model labels: {}, training data labels: " + "{}.".format(textcat_labels, list(train_labels)), + exits=1, ) if textcat_multilabel: msg.text( diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 131bdcd51..cb5b50ffc 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -201,7 +201,9 @@ _ukrainian = r"а-щюяіїєґА-ЩЮЯІЇЄҐ" _upper = LATIN_UPPER + _russian_upper + _tatar_upper + _greek_upper + _ukrainian_upper _lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower -_uncased = _bengali + _hebrew + _persian + _sinhala + _hindi + _kannada + _tamil + _telugu +_uncased = ( + _bengali + _hebrew + _persian + _sinhala + _hindi + _kannada + _tamil + _telugu +) ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _uncased) ALPHA_LOWER = group_chars(_lower + _uncased) diff --git a/spacy/language.py b/spacy/language.py index 4f94c39f6..da58d1e76 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -32,7 +32,6 @@ from .lang.tokenizer_exceptions import TOKEN_MATCH from .lang.tag_map import TAG_MAP from .lang.lex_attrs import LEX_ATTRS, is_stop from .errors import Errors, Warnings, deprecation_warning -from .strings import hash_string from . import util from . import about diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index 31d9cf875..9cc4f75b2 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -1,11 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals -import numpy as np from numpy.testing import assert_almost_equal, assert_array_almost_equal import pytest from pytest import approx -from spacy.errors import Errors from spacy.gold import GoldParse from spacy.scorer import Scorer, ROCAUCScore from spacy.scorer import _roc_auc_score, _roc_curve @@ -81,7 +79,7 @@ def test_roc_auc_score(): roc_auc = _roc_auc_score(y_true, y_score) assert_array_almost_equal(tpr, [0, 0, 1]) assert_array_almost_equal(fpr, [0, 1, 1]) - assert_almost_equal(roc_auc, 1.) 
+ assert_almost_equal(roc_auc, 1.0) y_true = [0, 1] y_score = [1, 0] @@ -89,7 +87,7 @@ def test_roc_auc_score(): roc_auc = _roc_auc_score(y_true, y_score) assert_array_almost_equal(tpr, [0, 1, 1]) assert_array_almost_equal(fpr, [0, 0, 1]) - assert_almost_equal(roc_auc, 0.) + assert_almost_equal(roc_auc, 0.0) y_true = [1, 0] y_score = [1, 1] @@ -105,7 +103,7 @@ def test_roc_auc_score(): roc_auc = _roc_auc_score(y_true, y_score) assert_array_almost_equal(tpr, [0, 0, 1]) assert_array_almost_equal(fpr, [0, 1, 1]) - assert_almost_equal(roc_auc, 1.) + assert_almost_equal(roc_auc, 1.0) y_true = [1, 0] y_score = [0.5, 0.5] @@ -113,14 +111,13 @@ def test_roc_auc_score(): roc_auc = _roc_auc_score(y_true, y_score) assert_array_almost_equal(tpr, [0, 1]) assert_array_almost_equal(fpr, [0, 1]) - assert_almost_equal(roc_auc, .5) + assert_almost_equal(roc_auc, 0.5) # same result as above with ROCAUCScore wrapper score = ROCAUCScore() score.score_set(0.5, 1) score.score_set(0.5, 0) - assert_almost_equal(score.score, .5) - + assert_almost_equal(score.score, 0.5) # check that errors are raised in undefined cases and score is -inf y_true = [0, 0] From 6ebdc5f7d2c49714b81ebb5e2066f2a2ea9b0aad Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Wed, 18 Sep 2019 21:21:39 +0200 Subject: [PATCH 189/207] Update download docs [ci skip] --- website/docs/api/cli.md | 80 ++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 0ce4475b7..fd4e49d4a 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -23,11 +23,11 @@ type `spacy --help`. ## Download {#download} Download [models](/usage/models) for spaCy. The downloader finds the -best-matching compatible version, uses pip to download the model as a package -and automatically creates a [shortcut link](/usage/models#usage) to load the -model by name. Direct downloads don't perform any compatibility checks and -require the model name to be specified with its version (e.g. -`en_core_web_sm-2.0.0`). +best-matching compatible version, uses `pip install` to download the model as a +package and creates a [shortcut link](/usage/models#usage) if the model was +downloaded via a shortcut. Direct downloads don't perform any compatibility +checks and require the model name to be specified with its version (e.g. +`en_core_web_sm-2.2.0`). > #### Downloading best practices > @@ -40,16 +40,16 @@ require the model name to be specified with its version (e.g. > also allow you to add it as a versioned package dependency to your project. ```bash -$ python -m spacy download [model] [--direct] +$ python -m spacy download [model] [--direct] [pip args] ``` -| Argument | Type | Description | -| ---------------------------------- | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `model` | positional | Model name or shortcut (`en`, `de`, `en_core_web_sm`). | -| `--direct`, `-d` | flag | Force direct download of exact model version. | -| other <Tag variant="new">2.1</Tag> | - | Additional installation options to be passed to `pip install` when installing the model package. For example, `--user` to install to the user home directory. | -| `--help`, `-h` | flag | Show help message and available arguments. 
| -| **CREATES** | directory, symlink | The installed model package in your `site-packages` directory and a shortcut link as a symlink in `spacy/data`. | +| Argument | Type | Description | +| ------------------------------------- | ------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `model` | positional | Model name or shortcut (`en`, `de`, `en_core_web_sm`). | +| `--direct`, `-d` | flag | Force direct download of exact model version. | +| pip args <Tag variant="new">2.1</Tag> | - | Additional installation options to be passed to `pip install` when installing the model package. For example, `--user` to install to the user home directory or `--no-deps` to not install model dependencies. | +| `--help`, `-h` | flag | Show help message and available arguments. | +| **CREATES** | directory, symlink | The installed model package in your `site-packages` directory and a shortcut link as a symlink in `spacy/data` if installed via shortcut. | ## Link {#link} @@ -368,33 +368,33 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path] [--verbose] ``` -| Argument | Type | Description | -| ----------------------------------------------------- | ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `lang` | positional | Model language. | -| `output_path` | positional | Directory to store model in. Will be created if it doesn't exist. | -| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. | -| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. | -| `--base-model`, `-b` <Tag variant="new">2.1</Tag> | option | Optional name of base model to update. Can be any loadable spaCy model. | -| `--pipeline`, `-p` <Tag variant="new">2.1</Tag> | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. | -| `--vectors`, `-v` | option | Model to load vectors from. | -| `--n-iter`, `-n` | option | Number of iterations (default: `30`). | -| `--n-early-stopping`, `-ne` | option | Maximum number of training epochs without dev accuracy improvement. | -| `--n-examples`, `-ns` | option | Number of examples to use (defaults to `0` for all examples). | -| `--use-gpu`, `-g` | option | Whether to use GPU. Can be either `0`, `1` or `-1`. | -| `--version`, `-V` | option | Model version. Will be written out to the model's `meta.json` after training. | -| `--meta-path`, `-m` <Tag variant="new">2</Tag> | option | Optional path to model [`meta.json`](/usage/training#models-generating). All relevant properties like `lang`, `pipeline` and `spacy_version` will be overwritten. | -| `--init-tok2vec`, `-t2v` <Tag variant="new">2.1</Tag> | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental. | -| `--parser-multitasks`, `-pt` | option | Side objectives for parser CNN, e.g. `'dep'` or `'dep,tag'` | -| `--entity-multitasks`, `-et` | option | Side objectives for NER CNN, e.g. `'dep'` or `'dep,tag'` | -| `--noise-level`, `-nl` | option | Float indicating the amount of corruption for data augmentation. | -| `--gold-preproc`, `-G` | flag | Use gold preprocessing. 
| -| `--learn-tokens`, `-T` | flag | Make parser learn gold-standard tokenization by merging ] subtokens. Typically used for languages like Chinese. | -| `--textcat-multilabel`, `-TML` <Tag variant="new">2.2</Tag> | flag | Text classification classes aren't mutually exclusive (multilabel). | -| `--textcat-arch`, `-ta` <Tag variant="new">2.2</Tag> | option | Text classification model architecture. Defaults to `"bow"`. | -| `--textcat-positive-label`, `-tpl` <Tag variant="new">2.2</Tag> | option |Text classification positive label for binary classes with two labels. | -| `--verbose`, `-VV` <Tag variant="new">2.0.13</Tag> | flag | Show more detailed messages during training. | -| `--help`, `-h` | flag | Show help message and available arguments. | -| **CREATES** | model, pickle | A spaCy model on each epoch. | +| Argument | Type | Description | +| --------------------------------------------------------------- | ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `lang` | positional | Model language. | +| `output_path` | positional | Directory to store model in. Will be created if it doesn't exist. | +| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. | +| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. | +| `--base-model`, `-b` <Tag variant="new">2.1</Tag> | option | Optional name of base model to update. Can be any loadable spaCy model. | +| `--pipeline`, `-p` <Tag variant="new">2.1</Tag> | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. | +| `--vectors`, `-v` | option | Model to load vectors from. | +| `--n-iter`, `-n` | option | Number of iterations (default: `30`). | +| `--n-early-stopping`, `-ne` | option | Maximum number of training epochs without dev accuracy improvement. | +| `--n-examples`, `-ns` | option | Number of examples to use (defaults to `0` for all examples). | +| `--use-gpu`, `-g` | option | Whether to use GPU. Can be either `0`, `1` or `-1`. | +| `--version`, `-V` | option | Model version. Will be written out to the model's `meta.json` after training. | +| `--meta-path`, `-m` <Tag variant="new">2</Tag> | option | Optional path to model [`meta.json`](/usage/training#models-generating). All relevant properties like `lang`, `pipeline` and `spacy_version` will be overwritten. | +| `--init-tok2vec`, `-t2v` <Tag variant="new">2.1</Tag> | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental. | +| `--parser-multitasks`, `-pt` | option | Side objectives for parser CNN, e.g. `'dep'` or `'dep,tag'` | +| `--entity-multitasks`, `-et` | option | Side objectives for NER CNN, e.g. `'dep'` or `'dep,tag'` | +| `--noise-level`, `-nl` | option | Float indicating the amount of corruption for data augmentation. | +| `--gold-preproc`, `-G` | flag | Use gold preprocessing. | +| `--learn-tokens`, `-T` | flag | Make parser learn gold-standard tokenization by merging ] subtokens. Typically used for languages like Chinese. | +| `--textcat-multilabel`, `-TML` <Tag variant="new">2.2</Tag> | flag | Text classification classes aren't mutually exclusive (multilabel). | +| `--textcat-arch`, `-ta` <Tag variant="new">2.2</Tag> | option | Text classification model architecture. Defaults to `"bow"`. 
| +| `--textcat-positive-label`, `-tpl` <Tag variant="new">2.2</Tag> | option | Text classification positive label for binary classes with two labels. | +| `--verbose`, `-VV` <Tag variant="new">2.0.13</Tag> | flag | Show more detailed messages during training. | +| `--help`, `-h` | flag | Show help message and available arguments. | +| **CREATES** | model, pickle | A spaCy model on each epoch. | ### Environment variables for hyperparameters {#train-hyperparams new="2"} From f873548f6c26a91715beaf91d04831e29e60df2f Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Wed, 18 Sep 2019 21:21:48 +0200 Subject: [PATCH 190/207] Add backwards incompatibility [ci skip] --- website/docs/usage/v2-2.md | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/website/docs/usage/v2-2.md b/website/docs/usage/v2-2.md index 376a9ae10..6c2d3c158 100644 --- a/website/docs/usage/v2-2.md +++ b/website/docs/usage/v2-2.md @@ -326,4 +326,33 @@ check if all of your models are up to date, you can run the </Infobox> -<!-- TODO: copy from release notes once they're ready --> +- The Dutch models have been trained on a new NER corpus (custom labelled UD + instead of WikiNER), so their predictions may be very different compared to + the previous version. The results should be significantly better and more + generalizable, though. +- The `spacy download` command does **not** set the `--no-deps` pip argument + anymore by default, meaning that model package dependencies (if available) + will now be also downloaded and installed. If spaCy (which is also a model + dependency) is not installed in the current environment, e.g. if a user has + built from source, `--no-deps` is added back automatically to prevent spaCy + from being downloaded and installed again from pip. +- The built-in `biluo_tags_from_offsets` converter is now stricter and will + raise an error if entities are overlapping (instead of silently skipping + them). If your data contains invalid entity annotations, make sure to clean it + and resolve conflicts. You can now also use the new `debug-data` command to + find problems in your data. +- The default punctuation in the `sentencizer` has been extended and now + includes more characters common in various languages. This also means that the + results it produces may change, depending on your text. If you want the + previous behaviour with limited characters, set `punct_chars=[".", "!", "?"]` + on initialization. +- Lemmatization tables (rules, exceptions, index and lookups) are now part of + the `Vocab` and serialized with it. This means that serialized objects (`nlp`, + pipeline components, vocab) will now include additional data, and models + written to disk will include additional files. +- The `Serbian` language class (introduced in v2.1.8) incorrectly used the + language code `rs` instead of `sr`. This has now been fixed, so `Serbian` is + now available via `spacy.lang.sr`. +- The `"sources"` in the `meta.json` have changed from a list of strings to a + list of dicts. This is mostly internals, but if your code used + `nlp.meta["sources"]`, you might have to update it. 
From 9c940eab94cd781ef7113b2ae3604e6585ad3d04 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Wed, 18 Sep 2019 21:23:26 +0200 Subject: [PATCH 191/207] Update version in examples [ci skip] --- README.md | 4 ++-- website/docs/usage/models.md | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index aa2d059c1..6bdbc7e46 100644 --- a/README.md +++ b/README.md @@ -175,8 +175,8 @@ python -m spacy download en_core_web_sm python -m spacy download en # pip install .tar.gz archive from path or URL -pip install /Users/you/en_core_web_sm-2.1.0.tar.gz -pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz +pip install /Users/you/en_core_web_sm-2.2.0.tar.gz +pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz ``` ### Loading and using models diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md index a8a478949..c9b22279d 100644 --- a/website/docs/usage/models.md +++ b/website/docs/usage/models.md @@ -106,7 +106,7 @@ python -m spacy download en_core_web_sm python -m spacy download en # Download exact model version (doesn't create shortcut link) -python -m spacy download en_core_web_sm-2.1.0 --direct +python -m spacy download en_core_web_sm-2.2.0 --direct ``` The download command will [install the model](/usage/models#download-pip) via @@ -145,10 +145,10 @@ click on the archive link and copy it to your clipboard. ```bash # With external URL -pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz +pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz # With local file -pip install /Users/you/en_core_web_sm-2.1.0.tar.gz +pip install /Users/you/en_core_web_sm-2.2.0.tar.gz ``` By default, this will install the model into your `site-packages` directory. You @@ -173,13 +173,13 @@ model data. ```yaml ### Directory structure {highlight="7"} -└── en_core_web_md-2.1.0.tar.gz # downloaded archive +└── en_core_web_md-2.2.0.tar.gz # downloaded archive ├── meta.json # model meta data ├── setup.py # setup file for pip installation └── en_core_web_md # 📦 model package ├── __init__.py # init for pip installation ├── meta.json # model meta data - └── en_core_web_md-2.1.0 # model data + └── en_core_web_md-2.2.0 # model data ``` You can place the **model package directory** anywhere on your local file @@ -325,8 +325,8 @@ URLs. 
```text ### requirements.txt -spacy>=2.0.0,<3.0.0 -https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz#egg=en_core_web_sm +spacy>=2.2.0,<3.0.0 +https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz#egg=en_core_web_sm ``` Specifying `#egg=` with the package name tells pip which package to expect from From d84763727c3d453c25be30007ef255ce1ab8008e Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Wed, 18 Sep 2019 21:24:14 +0200 Subject: [PATCH 192/207] Remove unused setting [ci skip] --- website/meta/site.json | 1 - 1 file changed, 1 deletion(-) diff --git a/website/meta/site.json b/website/meta/site.json index edb60ab0c..0325e78ca 100644 --- a/website/meta/site.json +++ b/website/meta/site.json @@ -23,7 +23,6 @@ "apiKey": "371e26ed49d29a27bd36273dfdaf89af", "indexName": "spacy" }, - "spacyVersion": "2.1", "binderUrl": "ines/spacy-io-binder", "binderBranch": "live", "binderVersion": "2.1.8", From 63a584c6d4e0ce8b17892d65a01281ba055c56fb Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Wed, 18 Sep 2019 21:34:24 +0200 Subject: [PATCH 193/207] Update README.md [ci skip] --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6bdbc7e46..fe138ae1c 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ It's commercial open-source software, released under the MIT license. | --------------- | -------------------------------------------------------------- | | [spaCy 101] | New to spaCy? Here's everything you need to know! | | [Usage Guides] | How to use spaCy and its features. | -| [New in v2.1] | New features, backwards incompatibilities and migration guide. | +| [New in v2.2] | New features, backwards incompatibilities and migration guide. | | [API Reference] | The detailed reference for spaCy's API. | | [Models] | Download statistical language models for spaCy. | | [Universe] | Libraries, extensions, demos, books and courses. | @@ -38,7 +38,7 @@ It's commercial open-source software, released under the MIT license. | [Contribute] | How to contribute to the spaCy project and code base. | [spacy 101]: https://spacy.io/usage/spacy-101 -[new in v2.1]: https://spacy.io/usage/v2-1 +[new in v2.2]: https://spacy.io/usage/v2-2 [usage guides]: https://spacy.io/usage/ [api reference]: https://spacy.io/api/ [models]: https://spacy.io/models From e2047576c483a419f02ac0a2a5e77cd52c075699 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Wed, 18 Sep 2019 21:42:11 +0200 Subject: [PATCH 194/207] Fix merge conflict --- website/docs/api/cli.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 322e1c043..5d42f6fb8 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -181,8 +181,6 @@ All output files generated by this command are compatible with | `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format. | | `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). 
| | `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). | -<<<<<<< HEAD - ## Debug data {#debug-data new="2.2"} Analyze, debug and validate your training and development data, get useful @@ -342,8 +340,6 @@ will not be available. ``` </Accordion> -======= ->>>>>>> master ## Train {#train} From 19d99fc9e7dfb9bbd5427cb439668ff618f67cb7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Wed, 18 Sep 2019 21:43:59 +0200 Subject: [PATCH 195/207] Set version to v2.2.0.dev7 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 4d3de2d40..f029af111 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -4,7 +4,7 @@ # fmt: off __title__ = "spacy" -__version__ = "2.2.0.dev6" +__version__ = "2.2.0.dev7" __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython" __uri__ = "https://spacy.io" __author__ = "Explosion AI" From 42df49133d5a2152e7d0eb02cd7884c4579660be Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Wed, 18 Sep 2019 21:54:51 +0200 Subject: [PATCH 196/207] Also lower-case in orth variants --- spacy/gold.pyx | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 4eaea80ed..b684c470c 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -272,12 +272,17 @@ class GoldCorpus(object): def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0): if random.random() >= orth_variant_level: return raw, paragraph_tuples + if random.random() >= 0.5: + lower = True + raw = raw.lower() ndsv = nlp.Defaults.single_orth_variants ndpv = nlp.Defaults.paired_orth_variants # modify words in paragraph_tuples variant_paragraph_tuples = [] for sent_tuples, brackets in paragraph_tuples: ids, words, tags, heads, labels, ner = sent_tuples + if lower: + words = [w.lower() for w in words] # single variants punct_choices = [random.choice(x["variants"]) for x in ndsv] for word_idx in range(len(words)): From 7d510c833e355952a00b15a8c238c11b12c3853d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 19 Sep 2019 00:03:24 +0200 Subject: [PATCH 197/207] Fix orth replacement --- spacy/gold.pyx | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index b684c470c..4cc44f757 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -274,13 +274,14 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0): return raw, paragraph_tuples if random.random() >= 0.5: lower = True - raw = raw.lower() + if raw is not None: + raw = raw.lower() ndsv = nlp.Defaults.single_orth_variants ndpv = nlp.Defaults.paired_orth_variants # modify words in paragraph_tuples variant_paragraph_tuples = [] for sent_tuples, brackets in paragraph_tuples: - ids, words, tags, heads, labels, ner = sent_tuples + ids, words, tags, heads, labels, ner, cats = sent_tuples if lower: words = [w.lower() for w in words] # single variants @@ -309,7 +310,7 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0): pair_idx = pair.index(words[word_idx]) words[word_idx] = punct_choices[punct_idx][pair_idx] - variant_paragraph_tuples.append(((ids, words, tags, heads, labels, ner), brackets)) + variant_paragraph_tuples.append(((ids, words, tags, 
heads, labels, ner, cats), brackets)) # modify raw to match variant_paragraph_tuples if raw is not None: variants = [] @@ -328,7 +329,7 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0): variant_raw += raw[raw_idx] raw_idx += 1 for sent_tuples, brackets in variant_paragraph_tuples: - ids, words, tags, heads, labels, ner = sent_tuples + ids, words, tags, heads, labels, ner, cats = sent_tuples for word in words: match_found = False # add identical word From e34b4a38b0d9c1257911e9de73285a51fed9bd29 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 19 Sep 2019 00:56:07 +0200 Subject: [PATCH 198/207] Fix set labels meta --- spacy/cli/train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index e2bdddda3..d32e4b79d 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -441,6 +441,7 @@ def train( } meta.setdefault("name", "model%d" % i) meta.setdefault("version", version) + meta["labels"] = nlp.meta["labels"] meta_loc = output_path / ("model%d" % i) / "meta.json" srsly.write_json(meta_loc, meta) util.set_env_log(verbose) From f52b8579534067ce536016622d844de88ea8c1f4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Thu, 19 Sep 2019 00:56:35 +0200 Subject: [PATCH 199/207] Update version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index f029af111..ac500fded 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -4,7 +4,7 @@ # fmt: off __title__ = "spacy" -__version__ = "2.2.0.dev7" +__version__ = "2.2.0.dev8" __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython" __uri__ = "https://spacy.io" __author__ = "Explosion AI" From ddc09b08eda69eb819fff7f267304a239a098462 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Thu, 19 Sep 2019 00:58:30 +0200 Subject: [PATCH 200/207] Update v2-2.md [ci skip] --- website/docs/usage/v2-2.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/website/docs/usage/v2-2.md b/website/docs/usage/v2-2.md index 6c2d3c158..e3907f8ea 100644 --- a/website/docs/usage/v2-2.md +++ b/website/docs/usage/v2-2.md @@ -341,6 +341,11 @@ check if all of your models are up to date, you can run the them). If your data contains invalid entity annotations, make sure to clean it and resolve conflicts. You can now also use the new `debug-data` command to find problems in your data. +- Pipeline components can now overwrite IOB tags of tokens that are not yet part + of an entity. Once a token has an `ent_iob` value set, it won't be reset to an + "unset" state and will always have at least `O` assigned. `list(doc.ents)` now + actually keeps the annotations on the token level consistent, instead of + resetting `O` to an empty string. - The default punctuation in the `sentencizer` has been extended and now includes more characters common in various languages. This also means that the results it produces may change, depending on your text. 
If you want the From 8cd37636781d0c1a5470de942161e260f8690b9e Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Thu, 19 Sep 2019 01:02:25 +0200 Subject: [PATCH 201/207] Update about.py [ci skip] --- spacy/about.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/spacy/about.py b/spacy/about.py index ac500fded..2fa3f95da 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,13 +1,9 @@ -# inspired from: -# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/ -# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py # fmt: off - __title__ = "spacy" __version__ = "2.2.0.dev8" -__summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython" +__summary__ = "Industrial-strength Natural Language Processing (NLP) in Python" __uri__ = "https://spacy.io" -__author__ = "Explosion AI" +__author__ = "Explosion" __email__ = "contact@explosion.ai" __license__ = "MIT" __release__ = False From 80d554f2e2813aea41b0889b39d8f30f648af1ad Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Thu, 19 Sep 2019 01:14:42 +0200 Subject: [PATCH 202/207] Remove unsupported version [ci skip] --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 79d5c3898..0c6269bef 100755 --- a/setup.py +++ b/setup.py @@ -283,7 +283,6 @@ def setup_package(): "Programming Language :: Python :: 2", "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.4", "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", From f2d224756b95e6351b4dbff3367a6f823156c010 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Thu, 19 Sep 2019 12:52:26 +0200 Subject: [PATCH 203/207] Update README.md [ci skip] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fe138ae1c..96a352028 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ state-of-the-art speed, convolutional **neural network models** for tagging, parsing and **named entity recognition** and easy **deep learning** integration. It's commercial open-source software, released under the MIT license. -💫 **Version 2.1 out now!** [Check out the release notes here.](https://github.com/explosion/spaCy/releases) +💫 **Version 2.2 out now!** [Check out the release notes here.](https://github.com/explosion/spaCy/releases) [](https://dev.azure.com/explosion-ai/public/_build?definitionId=8) [](https://travis-ci.org/explosion/spaCy) From 0f9e253a692d15bc47915fe0e6ceac50777fc6a6 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Thu, 19 Sep 2019 13:34:37 +0200 Subject: [PATCH 204/207] Update README.md [ci skip] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 96a352028..fe138ae1c 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ state-of-the-art speed, convolutional **neural network models** for tagging, parsing and **named entity recognition** and easy **deep learning** integration. It's commercial open-source software, released under the MIT license. 
-💫 **Version 2.2 out now!** [Check out the release notes here.](https://github.com/explosion/spaCy/releases) +💫 **Version 2.1 out now!** [Check out the release notes here.](https://github.com/explosion/spaCy/releases) [](https://dev.azure.com/explosion-ai/public/_build?definitionId=8) [](https://travis-ci.org/explosion/spaCy) From c1030b1ad26338f5d85f90bbd569c01e460536b5 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Thu, 19 Sep 2019 13:35:12 +0200 Subject: [PATCH 205/207] Update README.md [ci skip] --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fe138ae1c..104dc2bdf 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ It's commercial open-source software, released under the MIT license. | --------------- | -------------------------------------------------------------- | | [spaCy 101] | New to spaCy? Here's everything you need to know! | | [Usage Guides] | How to use spaCy and its features. | -| [New in v2.2] | New features, backwards incompatibilities and migration guide. | +| [New in v2.1] | New features, backwards incompatibilities and migration guide. | | [API Reference] | The detailed reference for spaCy's API. | | [Models] | Download statistical language models for spaCy. | | [Universe] | Libraries, extensions, demos, books and courses. | @@ -38,7 +38,7 @@ It's commercial open-source software, released under the MIT license. | [Contribute] | How to contribute to the spaCy project and code base. | [spacy 101]: https://spacy.io/usage/spacy-101 -[new in v2.2]: https://spacy.io/usage/v2-2 +[new in v2.2]: https://spacy.io/usage/v2-1 [usage guides]: https://spacy.io/usage/ [api reference]: https://spacy.io/api/ [models]: https://spacy.io/models From 197406de1d7ca17909591b30aebde4eb9ddcd339 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Thu, 19 Sep 2019 14:33:58 +0200 Subject: [PATCH 206/207] Update v2-2.md [ci skip] --- website/docs/usage/v2-2.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/website/docs/usage/v2-2.md b/website/docs/usage/v2-2.md index e3907f8ea..ded0404a3 100644 --- a/website/docs/usage/v2-2.md +++ b/website/docs/usage/v2-2.md @@ -8,7 +8,17 @@ menu: ## New Features {#features hidden="true"} -<!-- TODO: summary --> +spaCy v2.2 features improved statistical models, new pretrained models for +Norwegian and Lithuanian, better Dutch NER, as well as a new mechanism for +storing language data that makes the installation about **15× smaller** on +disk. We've also added a new API for **entity linking**, a new class to +efficiently **serialize annotations**, built-in scoring and **CLI training for +text classification** and a new command to analyze and **debug training data**. +For the full changelog, see the +[release notes on GitHub](https://github.com/explosion/spaCy/releases/tag/v2.2.0). + +<!-- For more details and a behind-the-scenes look at the new release, +[see our blog post](https://explosion.ai/blog/spacy-v2-2). 
--> ### Better pretrained models and more languages {#models} From 9bf69bfbb20aa8682d5de4b9464f074e0a4731e0 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Thu, 19 Sep 2019 17:38:41 +0200 Subject: [PATCH 207/207] Remove test --- spacy/tests/parser/test_ner.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 2c5185056..4dc7542ed 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -82,19 +82,6 @@ def test_get_oracle_moves_negative_O(tsys, vocab): assert names -def test_doc_add_entities_set_ents_iob(en_vocab): - doc = Doc(en_vocab, words=["This", "is", "a", "lion"]) - ner = EntityRecognizer(en_vocab) - ner.begin_training([]) - ner(doc) - assert len(list(doc.ents)) == 0 - assert [w.ent_iob_ for w in doc] == (["O"] * len(doc)) - doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)] - assert [w.ent_iob_ for w in doc] == ["", "", "", "B"] - doc.ents = [(doc.vocab.strings["WORD"], 0, 2)] - assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""] - - def test_oracle_moves_missing_B(en_vocab): words = ["B", "52", "Bomber"] biluo_tags = [None, None, "L-PRODUCT"]
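
The test removed above asserted the old behaviour, in which tokens outside an assigned entity were left with an empty, unset IOB value. Below is a minimal sketch of the new behaviour described in the v2-2.md entry added earlier in this series: assign `doc.ents` and read the per-token IOB tags back. The entity assignment mirrors the tuple form used in the removed test; the expected output is inferred from the changelog wording, not taken from any test in this patch set.

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
doc = Doc(nlp.vocab, words=["This", "is", "a", "lion"])

# Assign one entity span over token 3, using the same tuple form as the
# removed test: (label hash from the string store, start, end).
doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]

# Per the changelog wording, the other tokens should now carry an explicit "O"
# rather than being reset to an empty string (inferred expectation, not
# asserted by any test in this patch series).
print([t.ent_iob_ for t in doc])  # expected: ["O", "O", "O", "B"]
```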
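The two gold.pyx patches earlier in this series add optional lower-casing to the orthographic-variant augmentation. The sketch below restates just that step in plain Python, leaving out the single and paired punctuation variants. The tuple shape and the toy data are illustrative assumptions, and the `lower` flag is initialised explicitly so the later check is always defined.

```python
import random


def lowercase_variant(raw, paragraph_tuples, orth_variant_level=0.0):
    """Sketch of the lower-casing step added to make_orth_variants: when a
    paragraph is selected for variation at all, lower-case the raw text and
    the word column for roughly half of those paragraphs."""
    if random.random() >= orth_variant_level:
        return raw, paragraph_tuples
    lower = False  # initialised up front so the check below is always defined
    if random.random() >= 0.5:
        lower = True
        if raw is not None:
            raw = raw.lower()
    variant_paragraph_tuples = []
    for sent_tuples, brackets in paragraph_tuples:
        ids, words, tags, heads, labels, ner, cats = sent_tuples
        if lower:
            words = [w.lower() for w in words]
        variant_paragraph_tuples.append(
            ((ids, words, tags, heads, labels, ner, cats), brackets)
        )
    return raw, variant_paragraph_tuples


# Toy call with one single-token "sentence"; every annotation column is a dummy.
sent = (([0], ["Hello"], ["UH"], [0], ["ROOT"], ["O"], {}), [])
print(lowercase_variant("Hello", [sent], orth_variant_level=1.0))
```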