From b0f3ea2200ab62bae2482884dbcce8e8e376c1d1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 26 Oct 2017 12:38:23 +0200 Subject: [PATCH 1/6] Fix names of pipeline components NeuralDependencyParser --> DependencyParser NeuralEntityRecognizer --> EntityRecognizer TokenVectorEncoder --> Tensorizer NeuralLabeller --> MultitaskObjective --- spacy/language.py | 13 ++- spacy/pipeline.pxd | 21 ----- spacy/pipeline.pyx | 86 ++++--------------- spacy/tests/doc/test_add_entities.py | 3 +- spacy/tests/parser/test_add_label.py | 4 +- spacy/tests/parser/test_neural_parser.py | 2 +- spacy/tests/parser/test_preset_sbd.py | 4 +- spacy/tests/parser/test_to_from_bytes_disk.py | 6 +- .../serialize/test_serialize_parser_ner.py | 4 +- .../tests/serialize/test_serialize_tagger.py | 2 +- .../serialize/test_serialize_tensorizer.py | 2 +- 11 files changed, 35 insertions(+), 112 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 933ca772d..c4777898e 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -18,8 +18,8 @@ from .tagger import Tagger from .lemmatizer import Lemmatizer from .syntax.parser import get_templates -from .pipeline import NeuralDependencyParser, TokenVectorEncoder, NeuralTagger -from .pipeline import NeuralEntityRecognizer, SimilarityHook, TextCategorizer +from .pipeline import DependencyParser, Tensorizer, Tagger +from .pipeline import EntityRecognizer, SimilarityHook, TextCategorizer from .compat import json_dumps, izip, copy_reg from .scorer import Scorer @@ -75,9 +75,6 @@ class BaseDefaults(object): infixes = tuple(TOKENIZER_INFIXES) tag_map = dict(TAG_MAP) tokenizer_exceptions = {} - parser_features = get_templates('parser') - entity_features = get_templates('ner') - tagger_features = Tagger.feature_templates # TODO -- fix this stop_words = set() lemma_rules = {} lemma_exc = {} @@ -102,9 +99,9 @@ class Language(object): factories = { 'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp), 'tensorizer': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg), - 'tagger': lambda nlp, **cfg: NeuralTagger(nlp.vocab, **cfg), - 'parser': lambda nlp, **cfg: NeuralDependencyParser(nlp.vocab, **cfg), - 'ner': lambda nlp, **cfg: NeuralEntityRecognizer(nlp.vocab, **cfg), + 'tagger': lambda nlp, **cfg: Tagger(nlp.vocab, **cfg), + 'parser': lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg), + 'ner': lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg), 'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg), 'textcat': lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg) } diff --git a/spacy/pipeline.pxd b/spacy/pipeline.pxd index e9b7f0f73..e69de29bb 100644 --- a/spacy/pipeline.pxd +++ b/spacy/pipeline.pxd @@ -1,21 +0,0 @@ -from .syntax.parser cimport Parser -#from .syntax.beam_parser cimport BeamParser -from .syntax.ner cimport BiluoPushDown -from .syntax.arc_eager cimport ArcEager -from .tagger cimport Tagger - - -cdef class EntityRecognizer(Parser): - pass - - -cdef class DependencyParser(Parser): - pass - - -#cdef class BeamEntityRecognizer(BeamParser): -# pass -# -# -#cdef class BeamDependencyParser(BeamParser): -# pass diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 7c1976dfa..6e4ef2f3e 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -26,11 +26,8 @@ from thinc.neural.util import to_categorical from thinc.neural._classes.difference import Siamese, CauchySimilarity from .tokens.doc cimport Doc -from .syntax.parser cimport Parser as LinearParser -from .syntax.nn_parser cimport Parser as NeuralParser +from .syntax.nn_parser cimport Parser from .syntax import nonproj -from .syntax.parser import get_templates as get_feature_templates -from .syntax.beam_parser cimport BeamParser from .syntax.ner cimport BiluoPushDown from .syntax.arc_eager cimport ArcEager from .tagger import Tagger @@ -217,7 +214,7 @@ def _load_cfg(path): return {} -class TokenVectorEncoder(BaseThincComponent): +class Tensorizer(BaseThincComponent): """Assign position-sensitive vectors to tokens, using a CNN or RNN.""" name = 'tensorizer' @@ -329,7 +326,7 @@ class TokenVectorEncoder(BaseThincComponent): link_vectors_to_models(self.vocab) -class NeuralTagger(BaseThincComponent): +class Tagger(BaseThincComponent): name = 'tagger' def __init__(self, vocab, model=True, **cfg): self.vocab = vocab @@ -513,7 +510,11 @@ class NeuralTagger(BaseThincComponent): return self -class NeuralLabeller(NeuralTagger): +class MultitaskObjective(Tagger): + '''Assist training of a parser or tagger, by training a side-objective. + + Experimental + ''' name = 'nn_labeller' def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg): self.vocab = vocab @@ -532,7 +533,7 @@ class NeuralLabeller(NeuralTagger): self.make_label = target else: raise ValueError( - "NeuralLabeller target should be function or one of " + "MultitaskObjective target should be function or one of " "['dep', 'tag', 'ent', 'dep_tag_offset', 'ent_tag']") self.cfg = dict(cfg) self.cfg.setdefault('cnn_maxout_pieces', 2) @@ -752,45 +753,7 @@ class TextCategorizer(BaseThincComponent): link_vectors_to_models(self.vocab) -cdef class EntityRecognizer(LinearParser): - """Annotate named entities on Doc objects.""" - TransitionSystem = BiluoPushDown - - feature_templates = get_feature_templates('ner') - - def add_label(self, label): - LinearParser.add_label(self, label) - if isinstance(label, basestring): - label = self.vocab.strings[label] - - -cdef class BeamEntityRecognizer(BeamParser): - """Annotate named entities on Doc objects.""" - TransitionSystem = BiluoPushDown - - feature_templates = get_feature_templates('ner') - - def add_label(self, label): - LinearParser.add_label(self, label) - if isinstance(label, basestring): - label = self.vocab.strings[label] - - -cdef class DependencyParser(LinearParser): - TransitionSystem = ArcEager - feature_templates = get_feature_templates('basic') - - def add_label(self, label): - LinearParser.add_label(self, label) - if isinstance(label, basestring): - label = self.vocab.strings[label] - - @property - def postprocesses(self): - return [nonproj.deprojectivize] - - -cdef class NeuralDependencyParser(NeuralParser): +cdef class DependencyParser(Parser): name = 'parser' TransitionSystem = ArcEager @@ -800,17 +763,17 @@ cdef class NeuralDependencyParser(NeuralParser): def init_multitask_objectives(self, gold_tuples, pipeline, **cfg): for target in []: - labeller = NeuralLabeller(self.vocab, target=target) + labeller = MultitaskObjective(self.vocab, target=target) tok2vec = self.model[0] labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec) pipeline.append(labeller) self._multitasks.append(labeller) def __reduce__(self): - return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None) + return (DependencyParser, (self.vocab, self.moves, self.model), None, None) -cdef class NeuralEntityRecognizer(NeuralParser): +cdef class EntityRecognizer(Parser): name = 'ner' TransitionSystem = BiluoPushDown @@ -818,31 +781,14 @@ cdef class NeuralEntityRecognizer(NeuralParser): def init_multitask_objectives(self, gold_tuples, pipeline, **cfg): for target in []: - labeller = NeuralLabeller(self.vocab, target=target) + labeller = MultitaskObjective(self.vocab, target=target) tok2vec = self.model[0] labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec) pipeline.append(labeller) self._multitasks.append(labeller) def __reduce__(self): - return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None) + return (EntityRecognizer, (self.vocab, self.moves, self.model), None, None) -cdef class BeamDependencyParser(BeamParser): - TransitionSystem = ArcEager - - feature_templates = get_feature_templates('basic') - - def add_label(self, label): - Parser.add_label(self, label) - if isinstance(label, basestring): - label = self.vocab.strings[label] - - @property - def postprocesses(self): - return [nonproj.deprojectivize] - - - -__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'BeamDependencyParser', - 'BeamEntityRecognizer', 'TokenVectorEnoder'] +__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer'] diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index cc74aa0ae..cd444ba81 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ -10,7 +10,8 @@ import pytest def test_doc_add_entities_set_ents_iob(en_vocab): text = ["This", "is", "a", "lion"] doc = get_doc(en_vocab, text) - ner = EntityRecognizer(en_vocab, features=[(2,), (3,)]) + ner = EntityRecognizer(en_vocab) + ner.begin_training([]) ner(doc) assert len(list(doc.ents)) == 0 diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index 3fbfc96a6..c3bceb106 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -9,7 +9,7 @@ from ...attrs import NORM from ...gold import GoldParse from ...vocab import Vocab from ...tokens import Doc -from ...pipeline import NeuralDependencyParser +from ...pipeline import DependencyParser numpy.random.seed(0) @@ -21,7 +21,7 @@ def vocab(): @pytest.fixture def parser(vocab): - parser = NeuralDependencyParser(vocab) + parser = DependencyParser(vocab) parser.cfg['token_vector_width'] = 8 parser.cfg['hidden_width'] = 30 parser.cfg['hist_size'] = 0 diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py index ae20cd5f0..e85c61276 100644 --- a/spacy/tests/parser/test_neural_parser.py +++ b/spacy/tests/parser/test_neural_parser.py @@ -6,7 +6,7 @@ import numpy from ..._ml import chain, Tok2Vec, doc2feats from ...vocab import Vocab -from ...pipeline import TokenVectorEncoder +from ...pipeline import Tensorizer from ...syntax.arc_eager import ArcEager from ...syntax.nn_parser import Parser from ...tokens.doc import Doc diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index 4c973bd97..9b8c98735 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -8,7 +8,7 @@ from ...attrs import NORM from ...gold import GoldParse from ...vocab import Vocab from ...tokens import Doc -from ...pipeline import NeuralDependencyParser +from ...pipeline import DependencyParser @pytest.fixture def vocab(): @@ -16,7 +16,7 @@ def vocab(): @pytest.fixture def parser(vocab): - parser = NeuralDependencyParser(vocab) + parser = DependencyParser(vocab) parser.cfg['token_vector_width'] = 4 parser.cfg['hidden_width'] = 32 #parser.add_label('right') diff --git a/spacy/tests/parser/test_to_from_bytes_disk.py b/spacy/tests/parser/test_to_from_bytes_disk.py index b0a10fa8e..48c412b7a 100644 --- a/spacy/tests/parser/test_to_from_bytes_disk.py +++ b/spacy/tests/parser/test_to_from_bytes_disk.py @@ -1,11 +1,11 @@ import pytest -from ...pipeline import NeuralDependencyParser +from ...pipeline import DependencyParser @pytest.fixture def parser(en_vocab): - parser = NeuralDependencyParser(en_vocab) + parser = DependencyParser(en_vocab) parser.add_label('nsubj') parser.model, cfg = parser.Model(parser.moves.n_moves) parser.cfg.update(cfg) @@ -14,7 +14,7 @@ def parser(en_vocab): @pytest.fixture def blank_parser(en_vocab): - parser = NeuralDependencyParser(en_vocab) + parser = DependencyParser(en_vocab) return parser diff --git a/spacy/tests/serialize/test_serialize_parser_ner.py b/spacy/tests/serialize/test_serialize_parser_ner.py index ae9e23e9a..cbe97b716 100644 --- a/spacy/tests/serialize/test_serialize_parser_ner.py +++ b/spacy/tests/serialize/test_serialize_parser_ner.py @@ -2,8 +2,8 @@ from __future__ import unicode_literals from ..util import make_tempdir -from ...pipeline import NeuralDependencyParser as DependencyParser -from ...pipeline import NeuralEntityRecognizer as EntityRecognizer +from ...pipeline import DependencyParser +from ...pipeline import EntityRecognizer import pytest diff --git a/spacy/tests/serialize/test_serialize_tagger.py b/spacy/tests/serialize/test_serialize_tagger.py index 475be1cef..7b7dedae0 100644 --- a/spacy/tests/serialize/test_serialize_tagger.py +++ b/spacy/tests/serialize/test_serialize_tagger.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from ..util import make_tempdir -from ...pipeline import NeuralTagger as Tagger +from ...pipeline import Tagger import pytest diff --git a/spacy/tests/serialize/test_serialize_tensorizer.py b/spacy/tests/serialize/test_serialize_tensorizer.py index ba01a2fa6..bc751a686 100644 --- a/spacy/tests/serialize/test_serialize_tensorizer.py +++ b/spacy/tests/serialize/test_serialize_tensorizer.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from ..util import make_tempdir -from ...pipeline import TokenVectorEncoder as Tensorizer +from ...pipeline import Tensorizer import pytest From a8abc47811e732ac49c402b0a0b41ca585d584c8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 26 Oct 2017 12:40:40 +0200 Subject: [PATCH 2/6] Rename BaseThincComponent --> Pipe --- spacy/pipeline.pyx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 6e4ef2f3e..c52c29883 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -83,7 +83,7 @@ class SentenceSegmenter(object): yield doc[start : len(doc)] -class BaseThincComponent(object): +class Pipe(object): name = None @classmethod @@ -214,7 +214,7 @@ def _load_cfg(path): return {} -class Tensorizer(BaseThincComponent): +class Tensorizer(Pipe): """Assign position-sensitive vectors to tokens, using a CNN or RNN.""" name = 'tensorizer' @@ -326,7 +326,7 @@ class Tensorizer(BaseThincComponent): link_vectors_to_models(self.vocab) -class Tagger(BaseThincComponent): +class Tagger(Pipe): name = 'tagger' def __init__(self, vocab, model=True, **cfg): self.vocab = vocab @@ -623,7 +623,7 @@ class MultitaskObjective(Tagger): return '%s-%s' % (tags[i], ents[i]) -class SimilarityHook(BaseThincComponent): +class SimilarityHook(Pipe): """ Experimental @@ -675,7 +675,7 @@ class SimilarityHook(BaseThincComponent): link_vectors_to_models(self.vocab) -class TextCategorizer(BaseThincComponent): +class TextCategorizer(Pipe): name = 'textcat' @classmethod From 33f8c58782f96d787f862b32ead86f933a1a574e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 26 Oct 2017 12:42:05 +0200 Subject: [PATCH 3/6] Remove obsolete parser.pyx --- spacy/syntax/_parse_features.pxd | 259 --------------- spacy/syntax/_parse_features.pyx | 419 ------------------------ spacy/syntax/beam_parser.pxd | 10 - spacy/syntax/beam_parser.pyx | 239 -------------- spacy/syntax/parser.pxd | 24 -- spacy/syntax/parser.pyx | 526 ------------------------------- 6 files changed, 1477 deletions(-) delete mode 100644 spacy/syntax/_parse_features.pxd delete mode 100644 spacy/syntax/_parse_features.pyx delete mode 100644 spacy/syntax/beam_parser.pxd delete mode 100644 spacy/syntax/beam_parser.pyx delete mode 100644 spacy/syntax/parser.pxd delete mode 100644 spacy/syntax/parser.pyx diff --git a/spacy/syntax/_parse_features.pxd b/spacy/syntax/_parse_features.pxd deleted file mode 100644 index 0842e3504..000000000 --- a/spacy/syntax/_parse_features.pxd +++ /dev/null @@ -1,259 +0,0 @@ -from thinc.typedefs cimport atom_t - -from .stateclass cimport StateClass -from ._state cimport StateC - - -cdef int fill_context(atom_t* context, const StateC* state) nogil -# Context elements - -# Ensure each token's attributes are listed: w, p, c, c6, c4. The order -# is referenced by incrementing the enum... - -# Tokens are listed in left-to-right order. -#cdef size_t* SLOTS = [ -# S2w, S1w, -# S0l0w, S0l2w, S0lw, -# S0w, -# S0r0w, S0r2w, S0rw, -# N0l0w, N0l2w, N0lw, -# P2w, P1w, -# N0w, N1w, N2w, N3w, 0 -#] - -# NB: The order of the enum is _NOT_ arbitrary!! -cpdef enum: - S2w - S2W - S2p - S2c - S2c4 - S2c6 - S2L - S2_prefix - S2_suffix - S2_shape - S2_ne_iob - S2_ne_type - - S1w - S1W - S1p - S1c - S1c4 - S1c6 - S1L - S1_prefix - S1_suffix - S1_shape - S1_ne_iob - S1_ne_type - - S1rw - S1rW - S1rp - S1rc - S1rc4 - S1rc6 - S1rL - S1r_prefix - S1r_suffix - S1r_shape - S1r_ne_iob - S1r_ne_type - - S0lw - S0lW - S0lp - S0lc - S0lc4 - S0lc6 - S0lL - S0l_prefix - S0l_suffix - S0l_shape - S0l_ne_iob - S0l_ne_type - - S0l2w - S0l2W - S0l2p - S0l2c - S0l2c4 - S0l2c6 - S0l2L - S0l2_prefix - S0l2_suffix - S0l2_shape - S0l2_ne_iob - S0l2_ne_type - - S0w - S0W - S0p - S0c - S0c4 - S0c6 - S0L - S0_prefix - S0_suffix - S0_shape - S0_ne_iob - S0_ne_type - - S0r2w - S0r2W - S0r2p - S0r2c - S0r2c4 - S0r2c6 - S0r2L - S0r2_prefix - S0r2_suffix - S0r2_shape - S0r2_ne_iob - S0r2_ne_type - - S0rw - S0rW - S0rp - S0rc - S0rc4 - S0rc6 - S0rL - S0r_prefix - S0r_suffix - S0r_shape - S0r_ne_iob - S0r_ne_type - - N0l2w - N0l2W - N0l2p - N0l2c - N0l2c4 - N0l2c6 - N0l2L - N0l2_prefix - N0l2_suffix - N0l2_shape - N0l2_ne_iob - N0l2_ne_type - - N0lw - N0lW - N0lp - N0lc - N0lc4 - N0lc6 - N0lL - N0l_prefix - N0l_suffix - N0l_shape - N0l_ne_iob - N0l_ne_type - - N0w - N0W - N0p - N0c - N0c4 - N0c6 - N0L - N0_prefix - N0_suffix - N0_shape - N0_ne_iob - N0_ne_type - - N1w - N1W - N1p - N1c - N1c4 - N1c6 - N1L - N1_prefix - N1_suffix - N1_shape - N1_ne_iob - N1_ne_type - - N2w - N2W - N2p - N2c - N2c4 - N2c6 - N2L - N2_prefix - N2_suffix - N2_shape - N2_ne_iob - N2_ne_type - - P1w - P1W - P1p - P1c - P1c4 - P1c6 - P1L - P1_prefix - P1_suffix - P1_shape - P1_ne_iob - P1_ne_type - - P2w - P2W - P2p - P2c - P2c4 - P2c6 - P2L - P2_prefix - P2_suffix - P2_shape - P2_ne_iob - P2_ne_type - - E0w - E0W - E0p - E0c - E0c4 - E0c6 - E0L - E0_prefix - E0_suffix - E0_shape - E0_ne_iob - E0_ne_type - - E1w - E1W - E1p - E1c - E1c4 - E1c6 - E1L - E1_prefix - E1_suffix - E1_shape - E1_ne_iob - E1_ne_type - - # Misc features at the end - dist - N0lv - S0lv - S0rv - S1lv - S1rv - - S0_has_head - S1_has_head - S2_has_head - - CONTEXT_SIZE diff --git a/spacy/syntax/_parse_features.pyx b/spacy/syntax/_parse_features.pyx deleted file mode 100644 index 2e0db4877..000000000 --- a/spacy/syntax/_parse_features.pyx +++ /dev/null @@ -1,419 +0,0 @@ -""" -Fill an array, context, with every _atomic_ value our features reference. -We then write the _actual features_ as tuples of the atoms. The machinery -that translates from the tuples to feature-extractors (which pick the values -out of "context") is in features/extractor.pyx - -The atomic feature names are listed in a big enum, so that the feature tuples -can refer to them. -""" -# coding: utf-8 -from __future__ import unicode_literals - -from libc.string cimport memset -from itertools import combinations -from cymem.cymem cimport Pool - -from ..structs cimport TokenC -from .stateclass cimport StateClass -from ._state cimport StateC - - -cdef inline void fill_token(atom_t* context, const TokenC* token) nogil: - if token is NULL: - context[0] = 0 - context[1] = 0 - context[2] = 0 - context[3] = 0 - context[4] = 0 - context[5] = 0 - context[6] = 0 - context[7] = 0 - context[8] = 0 - context[9] = 0 - context[10] = 0 - context[11] = 0 - else: - context[0] = token.lex.orth - context[1] = token.lemma - context[2] = token.tag - context[3] = token.lex.cluster - # We've read in the string little-endian, so now we can take & (2**n)-1 - # to get the first n bits of the cluster. - # e.g. s = "1110010101" - # s = ''.join(reversed(s)) - # first_4_bits = int(s, 2) - # print first_4_bits - # 5 - # print "{0:b}".format(prefix).ljust(4, '0') - # 1110 - # What we're doing here is picking a number where all bits are 1, e.g. - # 15 is 1111, 63 is 111111 and doing bitwise AND, so getting all bits in - # the source that are set to 1. - context[4] = token.lex.cluster & 15 - context[5] = token.lex.cluster & 63 - context[6] = token.dep if token.head != 0 else 0 - context[7] = token.lex.prefix - context[8] = token.lex.suffix - context[9] = token.lex.shape - context[10] = token.ent_iob - context[11] = token.ent_type - -cdef int fill_context(atom_t* ctxt, const StateC* st) nogil: - # Take care to fill every element of context! - # We could memset, but this makes it very easy to have broken features that - # make almost no impact on accuracy. If instead they're unset, the impact - # tends to be dramatic, so we get an obvious regression to fix... - fill_token(&ctxt[S2w], st.S_(2)) - fill_token(&ctxt[S1w], st.S_(1)) - fill_token(&ctxt[S1rw], st.R_(st.S(1), 1)) - fill_token(&ctxt[S0lw], st.L_(st.S(0), 1)) - fill_token(&ctxt[S0l2w], st.L_(st.S(0), 2)) - fill_token(&ctxt[S0w], st.S_(0)) - fill_token(&ctxt[S0r2w], st.R_(st.S(0), 2)) - fill_token(&ctxt[S0rw], st.R_(st.S(0), 1)) - fill_token(&ctxt[N0lw], st.L_(st.B(0), 1)) - fill_token(&ctxt[N0l2w], st.L_(st.B(0), 2)) - fill_token(&ctxt[N0w], st.B_(0)) - fill_token(&ctxt[N1w], st.B_(1)) - fill_token(&ctxt[N2w], st.B_(2)) - fill_token(&ctxt[P1w], st.safe_get(st.B(0)-1)) - fill_token(&ctxt[P2w], st.safe_get(st.B(0)-2)) - - fill_token(&ctxt[E0w], st.E_(0)) - fill_token(&ctxt[E1w], st.E_(1)) - - if st.stack_depth() >= 1 and not st.eol(): - ctxt[dist] = min_(st.B(0) - st.E(0), 5) - else: - ctxt[dist] = 0 - ctxt[N0lv] = min_(st.n_L(st.B(0)), 5) - ctxt[S0lv] = min_(st.n_L(st.S(0)), 5) - ctxt[S0rv] = min_(st.n_R(st.S(0)), 5) - ctxt[S1lv] = min_(st.n_L(st.S(1)), 5) - ctxt[S1rv] = min_(st.n_R(st.S(1)), 5) - - ctxt[S0_has_head] = 0 - ctxt[S1_has_head] = 0 - ctxt[S2_has_head] = 0 - if st.stack_depth() >= 1: - ctxt[S0_has_head] = st.has_head(st.S(0)) + 1 - if st.stack_depth() >= 2: - ctxt[S1_has_head] = st.has_head(st.S(1)) + 1 - if st.stack_depth() >= 3: - ctxt[S2_has_head] = st.has_head(st.S(2)) + 1 - - -cdef inline int min_(int a, int b) nogil: - return a if a > b else b - - -ner = ( - (N0W,), - (P1W,), - (N1W,), - (P2W,), - (N2W,), - - (P1W, N0W,), - (N0W, N1W), - - (N0_prefix,), - (N0_suffix,), - - (P1_shape,), - (N0_shape,), - (N1_shape,), - (P1_shape, N0_shape,), - (N0_shape, P1_shape,), - (P1_shape, N0_shape, N1_shape), - (N2_shape,), - (P2_shape,), - - #(P2_norm, P1_norm, W_norm), - #(P1_norm, W_norm, N1_norm), - #(W_norm, N1_norm, N2_norm) - - (P2p,), - (P1p,), - (N0p,), - (N1p,), - (N2p,), - - (P1p, N0p), - (N0p, N1p), - (P2p, P1p, N0p), - (P1p, N0p, N1p), - (N0p, N1p, N2p), - - (P2c,), - (P1c,), - (N0c,), - (N1c,), - (N2c,), - - (P1c, N0c), - (N0c, N1c), - - (E0W,), - (E0c,), - (E0p,), - - (E0W, N0W), - (E0c, N0W), - (E0p, N0W), - - (E0p, P1p, N0p), - (E0c, P1c, N0c), - - (E0w, P1c), - (E0p, P1p), - (E0c, P1c), - (E0p, E1p), - (E0c, P1p), - - (E1W,), - (E1c,), - (E1p,), - - (E0W, E1W), - (E0W, E1p,), - (E0p, E1W,), - (E0p, E1W), - - (P1_ne_iob,), - (P1_ne_iob, P1_ne_type), - (N0w, P1_ne_iob, P1_ne_type), - - (N0_shape,), - (N1_shape,), - (N2_shape,), - (P1_shape,), - (P2_shape,), - - (N0_prefix,), - (N0_suffix,), - - (P1_ne_iob,), - (P2_ne_iob,), - (P1_ne_iob, P2_ne_iob), - (P1_ne_iob, P1_ne_type), - (P2_ne_iob, P2_ne_type), - (N0w, P1_ne_iob, P1_ne_type), - - (N0w, N1w), -) - - -unigrams = ( - (S2W, S2p), - (S2c6, S2p), - - (S1W, S1p), - (S1c6, S1p), - - (S0W, S0p), - (S0c6, S0p), - - (N0W, N0p), - (N0p,), - (N0c,), - (N0c6, N0p), - (N0L,), - - (N1W, N1p), - (N1c6, N1p), - - (N2W, N2p), - (N2c6, N2p), - - (S0r2W, S0r2p), - (S0r2c6, S0r2p), - (S0r2L,), - - (S0rW, S0rp), - (S0rc6, S0rp), - (S0rL,), - - (S0l2W, S0l2p), - (S0l2c6, S0l2p), - (S0l2L,), - - (S0lW, S0lp), - (S0lc6, S0lp), - (S0lL,), - - (N0l2W, N0l2p), - (N0l2c6, N0l2p), - (N0l2L,), - - (N0lW, N0lp), - (N0lc6, N0lp), - (N0lL,), -) - - -s0_n0 = ( - (S0W, S0p, N0W, N0p), - (S0c, S0p, N0c, N0p), - (S0c6, S0p, N0c6, N0p), - (S0c4, S0p, N0c4, N0p), - (S0p, N0p), - (S0W, N0p), - (S0p, N0W), - (S0W, N0c), - (S0c, N0W), - (S0p, N0c), - (S0c, N0p), - (S0W, S0rp, N0p), - (S0p, S0rp, N0p), - (S0p, N0lp, N0W), - (S0p, N0lp, N0p), - (S0L, N0p), - (S0p, S0rL, N0p), - (S0p, N0lL, N0p), - (S0p, S0rv, N0p), - (S0p, N0lv, N0p), - (S0c6, S0rL, S0r2L, N0p), - (S0p, N0lL, N0l2L, N0p), -) - - -s1_s0 = ( - (S1p, S0p), - (S1p, S0p, S0_has_head), - (S1W, S0p), - (S1W, S0p, S0_has_head), - (S1c, S0p), - (S1c, S0p, S0_has_head), - (S1p, S1rL, S0p), - (S1p, S1rL, S0p, S0_has_head), - (S1p, S0lL, S0p), - (S1p, S0lL, S0p, S0_has_head), - (S1p, S0lL, S0l2L, S0p), - (S1p, S0lL, S0l2L, S0p, S0_has_head), - (S1L, S0L, S0W), - (S1L, S0L, S0p), - (S1p, S1L, S0L, S0p), - (S1p, S0p), -) - - -s1_n0 = ( - (S1p, N0p), - (S1c, N0c), - (S1c, N0p), - (S1p, N0c), - (S1W, S1p, N0p), - (S1p, N0W, N0p), - (S1c6, S1p, N0c6, N0p), - (S1L, N0p), - (S1p, S1rL, N0p), - (S1p, S1rp, N0p), -) - - -s0_n1 = ( - (S0p, N1p), - (S0c, N1c), - (S0c, N1p), - (S0p, N1c), - (S0W, S0p, N1p), - (S0p, N1W, N1p), - (S0c6, S0p, N1c6, N1p), - (S0L, N1p), - (S0p, S0rL, N1p), -) - - -n0_n1 = ( - (N0W, N0p, N1W, N1p), - (N0W, N0p, N1p), - (N0p, N1W, N1p), - (N0c, N0p, N1c, N1p), - (N0c6, N0p, N1c6, N1p), - (N0c, N1c), - (N0p, N1c), -) - -tree_shape = ( - (dist,), - (S0p, S0_has_head, S1_has_head, S2_has_head), - (S0p, S0lv, S0rv), - (N0p, N0lv), -) - -trigrams = ( - (N0p, N1p, N2p), - (S0p, S0lp, S0l2p), - (S0p, S0rp, S0r2p), - (S0p, S1p, S2p), - (S1p, S0p, N0p), - (S0p, S0lp, N0p), - (S0p, N0p, N0lp), - (N0p, N0lp, N0l2p), - - (S0W, S0p, S0rL, S0r2L), - (S0p, S0rL, S0r2L), - - (S0W, S0p, S0lL, S0l2L), - (S0p, S0lL, S0l2L), - - (N0W, N0p, N0lL, N0l2L), - (N0p, N0lL, N0l2L), -) - - -words = ( - S2w, - S1w, - S1rw, - S0lw, - S0l2w, - S0w, - S0r2w, - S0rw, - N0lw, - N0l2w, - N0w, - N1w, - N2w, - P1w, - P2w -) - -tags = ( - S2p, - S1p, - S1rp, - S0lp, - S0l2p, - S0p, - S0r2p, - S0rp, - N0lp, - N0l2p, - N0p, - N1p, - N2p, - P1p, - P2p -) - -labels = ( - S2L, - S1L, - S1rL, - S0lL, - S0l2L, - S0L, - S0r2L, - S0rL, - N0lL, - N0l2L, - N0L, - N1L, - N2L, - P1L, - P2L -) diff --git a/spacy/syntax/beam_parser.pxd b/spacy/syntax/beam_parser.pxd deleted file mode 100644 index 35a60cbf3..000000000 --- a/spacy/syntax/beam_parser.pxd +++ /dev/null @@ -1,10 +0,0 @@ -from .parser cimport Parser -from ..structs cimport TokenC -from thinc.typedefs cimport weight_t - - -cdef class BeamParser(Parser): - cdef public int beam_width - cdef public weight_t beam_density - - cdef int _parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) except -1 diff --git a/spacy/syntax/beam_parser.pyx b/spacy/syntax/beam_parser.pyx deleted file mode 100644 index 68e9f27af..000000000 --- a/spacy/syntax/beam_parser.pyx +++ /dev/null @@ -1,239 +0,0 @@ -""" -MALT-style dependency parser -""" -# cython: profile=True -# cython: experimental_cpp_class_def=True -# cython: cdivision=True -# cython: infer_types=True -# coding: utf-8 - -from __future__ import unicode_literals, print_function -cimport cython - -from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF -from libc.stdint cimport uint32_t, uint64_t -from libc.string cimport memset, memcpy -from libc.stdlib cimport rand -from libc.math cimport log, exp, isnan, isinf -from cymem.cymem cimport Pool, Address -from murmurhash.mrmr cimport real_hash64 as hash64 -from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t -from thinc.linear.features cimport ConjunctionExtracter -from thinc.structs cimport FeatureC, ExampleC -from thinc.extra.search cimport Beam, MaxViolation -from thinc.extra.eg cimport Example -from thinc.extra.mb cimport Minibatch - -from ..structs cimport TokenC -from ..tokens.doc cimport Doc -from ..strings cimport StringStore -from .transition_system cimport TransitionSystem, Transition -from ..gold cimport GoldParse -from . import _parse_features -from ._parse_features cimport CONTEXT_SIZE -from ._parse_features cimport fill_context -from .stateclass cimport StateClass -from .parser cimport Parser - - -DEBUG = False -def set_debug(val): - global DEBUG - DEBUG = val - - -def get_templates(name): - pf = _parse_features - if name == 'ner': - return pf.ner - elif name == 'debug': - return pf.unigrams - else: - return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 + pf.n0_n1 + \ - pf.tree_shape + pf.trigrams) - - -cdef int BEAM_WIDTH = 16 -cdef weight_t BEAM_DENSITY = 0.001 - -cdef class BeamParser(Parser): - def __init__(self, *args, **kwargs): - self.beam_width = kwargs.get('beam_width', BEAM_WIDTH) - self.beam_density = kwargs.get('beam_density', BEAM_DENSITY) - Parser.__init__(self, *args, **kwargs) - - cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil: - with gil: - self._parseC(tokens, length, nr_feat, self.moves.n_moves) - - cdef int _parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) except -1: - cdef Beam beam = Beam(self.moves.n_moves, self.beam_width, min_density=self.beam_density) - # TODO: How do we handle new labels here? This increases nr_class - beam.initialize(self.moves.init_beam_state, length, tokens) - beam.check_done(_check_final_state, NULL) - if beam.is_done: - _cleanup(beam) - return 0 - while not beam.is_done: - self._advance_beam(beam, None, False) - state = beam.at(0) - self.moves.finalize_state(state.c) - for i in range(length): - tokens[i] = state.c._sent[i] - _cleanup(beam) - - def update(self, Doc tokens, GoldParse gold_parse, itn=0): - self.moves.preprocess_gold(gold_parse) - cdef Beam pred = Beam(self.moves.n_moves, self.beam_width) - pred.initialize(self.moves.init_beam_state, tokens.length, tokens.c) - pred.check_done(_check_final_state, NULL) - # Hack for NER - for i in range(pred.size): - stcls = pred.at(i) - self.moves.initialize_state(stcls.c) - - cdef Beam gold = Beam(self.moves.n_moves, self.beam_width, min_density=0.0) - gold.initialize(self.moves.init_beam_state, tokens.length, tokens.c) - gold.check_done(_check_final_state, NULL) - violn = MaxViolation() - while not pred.is_done and not gold.is_done: - # We search separately here, to allow for ambiguity in the gold parse. - self._advance_beam(pred, gold_parse, False) - self._advance_beam(gold, gold_parse, True) - violn.check_crf(pred, gold) - if pred.loss > 0 and pred.min_score > (gold.score + self.model.time): - break - else: - # The non-monotonic oracle makes it difficult to ensure final costs are - # correct. Therefore do final correction - for i in range(pred.size): - if self.moves.is_gold_parse(pred.at(i), gold_parse): - pred._states[i].loss = 0.0 - elif pred._states[i].loss == 0.0: - pred._states[i].loss = 1.0 - violn.check_crf(pred, gold) - if pred.size < 1: - raise Exception("No candidates", tokens.length) - if gold.size < 1: - raise Exception("No gold", tokens.length) - if pred.loss == 0: - self.model.update_from_histories(self.moves, tokens, [(0.0, [])]) - elif True: - #_check_train_integrity(pred, gold, gold_parse, self.moves) - histories = list(zip(violn.p_probs, violn.p_hist)) + \ - list(zip(violn.g_probs, violn.g_hist)) - self.model.update_from_histories(self.moves, tokens, histories, min_grad=0.001**(itn+1)) - else: - self.model.update_from_histories(self.moves, tokens, - [(1.0, violn.p_hist[0]), (-1.0, violn.g_hist[0])]) - _cleanup(pred) - _cleanup(gold) - return pred.loss - - def _advance_beam(self, Beam beam, GoldParse gold, bint follow_gold): - cdef atom_t[CONTEXT_SIZE] context - cdef Pool mem = Pool() - features = mem.alloc(self.model.nr_feat, sizeof(FeatureC)) - if False: - mb = Minibatch(self.model.widths, beam.size) - for i in range(beam.size): - stcls = beam.at(i) - if stcls.c.is_final(): - nr_feat = 0 - else: - nr_feat = self.model.set_featuresC(context, features, stcls.c) - self.moves.set_valid(beam.is_valid[i], stcls.c) - mb.c.push_back(features, nr_feat, beam.costs[i], beam.is_valid[i], 0) - self.model(mb) - for i in range(beam.size): - memcpy(beam.scores[i], mb.c.scores(i), mb.c.nr_out() * sizeof(beam.scores[i][0])) - else: - for i in range(beam.size): - stcls = beam.at(i) - if not stcls.is_final(): - nr_feat = self.model.set_featuresC(context, features, stcls.c) - self.moves.set_valid(beam.is_valid[i], stcls.c) - self.model.set_scoresC(beam.scores[i], features, nr_feat) - if gold is not None: - n_gold = 0 - lines = [] - for i in range(beam.size): - stcls = beam.at(i) - if not stcls.c.is_final(): - self.moves.set_costs(beam.is_valid[i], beam.costs[i], stcls, gold) - if follow_gold: - for j in range(self.moves.n_moves): - if beam.costs[i][j] >= 1: - beam.is_valid[i][j] = 0 - lines.append((stcls.B(0), stcls.B(1), - stcls.B_(0).ent_iob, stcls.B_(1).ent_iob, - stcls.B_(1).sent_start, - j, - beam.is_valid[i][j], 'set invalid', - beam.costs[i][j], self.moves.c[j].move, self.moves.c[j].label)) - n_gold += 1 if beam.is_valid[i][j] else 0 - if follow_gold and n_gold == 0: - raise Exception("No gold") - if follow_gold: - beam.advance(_transition_state, NULL, self.moves.c) - else: - beam.advance(_transition_state, _hash_state, self.moves.c) - beam.check_done(_check_final_state, NULL) - - -# These are passed as callbacks to thinc.search.Beam -cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1: - dest = _dest - src = _src - moves = _moves - dest.clone(src) - moves[clas].do(dest.c, moves[clas].label) - - -cdef int _check_final_state(void* _state, void* extra_args) except -1: - return (_state).is_final() - - -def _cleanup(Beam beam): - for i in range(beam.width): - Py_XDECREF(beam._states[i].content) - Py_XDECREF(beam._parents[i].content) - - -cdef hash_t _hash_state(void* _state, void* _) except 0: - state = _state - if state.c.is_final(): - return 1 - else: - return state.c.hash() - - -def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, TransitionSystem moves): - for i in range(pred.size): - if not pred._states[i].is_done or pred._states[i].loss == 0: - continue - state = pred.at(i) - if moves.is_gold_parse(state, gold_parse) == True: - for dep in gold_parse.orig_annot: - print(dep[1], dep[3], dep[4]) - print("Cost", pred._states[i].loss) - for j in range(gold_parse.length): - print(gold_parse.orig_annot[j][1], state.H(j), moves.strings[state.safe_get(j).dep]) - acts = [moves.c[clas].move for clas in pred.histories[i]] - labels = [moves.c[clas].label for clas in pred.histories[i]] - print([moves.move_name(move, label) for move, label in zip(acts, labels)]) - raise Exception("Predicted state is gold-standard") - for i in range(gold.size): - if not gold._states[i].is_done: - continue - state = gold.at(i) - if moves.is_gold(state, gold_parse) == False: - print("Truth") - for dep in gold_parse.orig_annot: - print(dep[1], dep[3], dep[4]) - print("Predicted good") - for j in range(gold_parse.length): - print(gold_parse.orig_annot[j][1], state.H(j), moves.strings[state.safe_get(j).dep]) - raise Exception("Gold parse is not gold-standard") - - diff --git a/spacy/syntax/parser.pxd b/spacy/syntax/parser.pxd deleted file mode 100644 index 95b6c3d3f..000000000 --- a/spacy/syntax/parser.pxd +++ /dev/null @@ -1,24 +0,0 @@ -from thinc.linear.avgtron cimport AveragedPerceptron -from thinc.typedefs cimport atom_t -from thinc.structs cimport FeatureC - -from .stateclass cimport StateClass -from .arc_eager cimport TransitionSystem -from ..vocab cimport Vocab -from ..tokens.doc cimport Doc -from ..structs cimport TokenC -from ._state cimport StateC - - -cdef class ParserModel(AveragedPerceptron): - cdef int set_featuresC(self, atom_t* context, FeatureC* features, - const StateC* state) nogil - - -cdef class Parser: - cdef readonly Vocab vocab - cdef readonly ParserModel model - cdef readonly TransitionSystem moves - cdef readonly object cfg - - cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx deleted file mode 100644 index 78698db12..000000000 --- a/spacy/syntax/parser.pyx +++ /dev/null @@ -1,526 +0,0 @@ -""" -MALT-style dependency parser -""" -# coding: utf-8 -# cython: infer_types=True -from __future__ import unicode_literals - -from collections import Counter -import ujson - -cimport cython -cimport cython.parallel - -import numpy.random - -from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF -from cpython.exc cimport PyErr_CheckSignals -from libc.stdint cimport uint32_t, uint64_t -from libc.string cimport memset, memcpy -from libc.stdlib cimport malloc, calloc, free -from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t -from thinc.linear.avgtron cimport AveragedPerceptron -from thinc.linalg cimport VecVec -from thinc.structs cimport SparseArrayC, FeatureC, ExampleC -from thinc.extra.eg cimport Example -from cymem.cymem cimport Pool, Address -from murmurhash.mrmr cimport hash64 -from preshed.maps cimport MapStruct -from preshed.maps cimport map_get - -from . import _parse_features -from ._parse_features cimport CONTEXT_SIZE -from ._parse_features cimport fill_context -from .stateclass cimport StateClass -from ._state cimport StateC -from .transition_system import OracleError -from .transition_system cimport TransitionSystem, Transition -from ..structs cimport TokenC -from ..tokens.doc cimport Doc -from ..strings cimport StringStore -from ..gold cimport GoldParse - - -USE_FTRL = True -DEBUG = False -def set_debug(val): - global DEBUG - DEBUG = val - - -def get_templates(name): - pf = _parse_features - if name == 'ner': - return pf.ner - elif name == 'debug': - return pf.unigrams - elif name.startswith('embed'): - return (pf.words, pf.tags, pf.labels) - else: - return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 + pf.n0_n1 + \ - pf.tree_shape + pf.trigrams) - - -cdef class ParserModel(AveragedPerceptron): - cdef int set_featuresC(self, atom_t* context, FeatureC* features, - const StateC* state) nogil: - fill_context(context, state) - nr_feat = self.extracter.set_features(features, context) - return nr_feat - - def update(self, Example eg, itn=0): - """ - Does regression on negative cost. Sort of cute? - """ - self.time += 1 - cdef int best = arg_max_if_gold(eg.c.scores, eg.c.costs, eg.c.nr_class) - cdef int guess = eg.guess - if guess == best or best == -1: - return 0.0 - cdef FeatureC feat - cdef int clas - cdef weight_t gradient - if USE_FTRL: - for feat in eg.c.features[:eg.c.nr_feat]: - for clas in range(eg.c.nr_class): - if eg.c.is_valid[clas] and eg.c.scores[clas] >= eg.c.scores[best]: - gradient = eg.c.scores[clas] + eg.c.costs[clas] - self.update_weight_ftrl(feat.key, clas, feat.value * gradient) - else: - for feat in eg.c.features[:eg.c.nr_feat]: - self.update_weight(feat.key, guess, feat.value * eg.c.costs[guess]) - self.update_weight(feat.key, best, -feat.value * eg.c.costs[guess]) - return eg.c.costs[guess] - - def update_from_histories(self, TransitionSystem moves, Doc doc, histories, weight_t min_grad=0.0): - cdef Pool mem = Pool() - features = mem.alloc(self.nr_feat, sizeof(FeatureC)) - - cdef StateClass stcls - - cdef class_t clas - self.time += 1 - cdef atom_t[CONTEXT_SIZE] atoms - histories = [(grad, hist) for grad, hist in histories if abs(grad) >= min_grad and hist] - if not histories: - return None - gradient = [Counter() for _ in range(max([max(h)+1 for _, h in histories]))] - for d_loss, history in histories: - stcls = StateClass.init(doc.c, doc.length) - moves.initialize_state(stcls.c) - for clas in history: - nr_feat = self.set_featuresC(atoms, features, stcls.c) - clas_grad = gradient[clas] - for feat in features[:nr_feat]: - clas_grad[feat.key] += d_loss * feat.value - moves.c[clas].do(stcls.c, moves.c[clas].label) - cdef feat_t key - cdef weight_t d_feat - for clas, clas_grad in enumerate(gradient): - for key, d_feat in clas_grad.items(): - if d_feat != 0: - self.update_weight_ftrl(key, clas, d_feat) - - -cdef class Parser: - """ - Base class of the DependencyParser and EntityRecognizer. - """ - @classmethod - def load(cls, path, Vocab vocab, TransitionSystem=None, require=False, **cfg): - """ - Load the statistical model from the supplied path. - - Arguments: - path (Path): - The path to load from. - vocab (Vocab): - The vocabulary. Must be shared by the documents to be processed. - require (bool): - Whether to raise an error if the files are not found. - Returns (Parser): - The newly constructed object. - """ - with (path / 'config.json').open() as file_: - cfg = ujson.load(file_) - # TODO: remove this shim when we don't have to support older data - if 'labels' in cfg and 'actions' not in cfg: - cfg['actions'] = cfg.pop('labels') - # TODO: remove this shim when we don't have to support older data - for action_name, labels in dict(cfg.get('actions', {})).items(): - # We need this to be sorted - if isinstance(labels, dict): - labels = list(sorted(labels.keys())) - cfg['actions'][action_name] = labels - self = cls(vocab, TransitionSystem=TransitionSystem, model=None, **cfg) - if (path / 'model').exists(): - self.model.load(str(path / 'model')) - elif require: - raise IOError( - "Required file %s/model not found when loading" % str(path)) - return self - - def __init__(self, Vocab vocab, TransitionSystem=None, ParserModel model=None, **cfg): - """ - Create a Parser. - - Arguments: - vocab (Vocab): - The vocabulary object. Must be shared with documents to be processed. - model (thinc.linear.AveragedPerceptron): - The statistical model. - Returns (Parser): - The newly constructed object. - """ - if TransitionSystem is None: - TransitionSystem = self.TransitionSystem - self.vocab = vocab - cfg['actions'] = TransitionSystem.get_actions(**cfg) - self.moves = TransitionSystem(vocab.strings, cfg['actions']) - # TODO: Remove this when we no longer need to support old-style models - if isinstance(cfg.get('features'), basestring): - cfg['features'] = get_templates(cfg['features']) - elif 'features' not in cfg: - cfg['features'] = self.feature_templates - - self.model = ParserModel(cfg['features']) - self.model.l1_penalty = cfg.get('L1', 0.0) - self.model.learn_rate = cfg.get('learn_rate', 0.001) - - self.cfg = cfg - # TODO: This is a pretty hacky fix to the problem of adding more - # labels. The issue is they come in out of order, if labels are - # added during training - for label in cfg.get('extra_labels', []): - self.add_label(label) - - def __reduce__(self): - return (Parser, (self.vocab, self.moves, self.model), None, None) - - def __call__(self, Doc tokens): - """ - Apply the entity recognizer, setting the annotations onto the Doc object. - - Arguments: - doc (Doc): The document to be processed. - Returns: - None - """ - cdef int nr_feat = self.model.nr_feat - with nogil: - status = self.parseC(tokens.c, tokens.length, nr_feat) - # Check for KeyboardInterrupt etc. Untested - PyErr_CheckSignals() - if status != 0: - raise ParserStateError(tokens) - self.moves.finalize_doc(tokens) - - def pipe(self, stream, int batch_size=1000, int n_threads=2): - """ - Process a stream of documents. - - Arguments: - stream: The sequence of documents to process. - batch_size (int): - The number of documents to accumulate into a working set. - n_threads (int): - The number of threads with which to work on the buffer in parallel. - Yields (Doc): Documents, in order. - """ - cdef Pool mem = Pool() - cdef TokenC** doc_ptr = mem.alloc(batch_size, sizeof(TokenC*)) - cdef int* lengths = mem.alloc(batch_size, sizeof(int)) - cdef Doc doc - cdef int i - cdef int nr_feat = self.model.nr_feat - cdef int status - queue = [] - for doc in stream: - doc_ptr[len(queue)] = doc.c - lengths[len(queue)] = doc.length - queue.append(doc) - if len(queue) == batch_size: - with nogil: - for i in cython.parallel.prange(batch_size, num_threads=n_threads): - status = self.parseC(doc_ptr[i], lengths[i], nr_feat) - if status != 0: - with gil: - raise ParserStateError(queue[i]) - PyErr_CheckSignals() - for doc in queue: - self.moves.finalize_doc(doc) - yield doc - queue = [] - batch_size = len(queue) - with nogil: - for i in cython.parallel.prange(batch_size, num_threads=n_threads): - status = self.parseC(doc_ptr[i], lengths[i], nr_feat) - if status != 0: - with gil: - raise ParserStateError(queue[i]) - PyErr_CheckSignals() - for doc in queue: - self.moves.finalize_doc(doc) - yield doc - - cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil: - state = new StateC(tokens, length) - # NB: This can change self.moves.n_moves! - # I think this causes memory errors if called by .pipe() - self.moves.initialize_state(state) - nr_class = self.moves.n_moves - - cdef ExampleC eg - eg.nr_feat = nr_feat - eg.nr_atom = CONTEXT_SIZE - eg.nr_class = nr_class - eg.features = calloc(sizeof(FeatureC), nr_feat) - eg.atoms = calloc(sizeof(atom_t), CONTEXT_SIZE) - eg.scores = calloc(sizeof(weight_t), nr_class) - eg.is_valid = calloc(sizeof(int), nr_class) - cdef int i - while not state.is_final(): - eg.nr_feat = self.model.set_featuresC(eg.atoms, eg.features, state) - self.moves.set_valid(eg.is_valid, state) - self.model.set_scoresC(eg.scores, eg.features, eg.nr_feat) - - guess = VecVec.arg_max_if_true(eg.scores, eg.is_valid, eg.nr_class) - if guess < 0: - return 1 - - action = self.moves.c[guess] - - action.do(state, action.label) - memset(eg.scores, 0, sizeof(eg.scores[0]) * eg.nr_class) - for i in range(eg.nr_class): - eg.is_valid[i] = 1 - self.moves.finalize_state(state) - for i in range(length): - tokens[i] = state._sent[i] - del state - free(eg.features) - free(eg.atoms) - free(eg.scores) - free(eg.is_valid) - return 0 - - def update(self, Doc tokens, GoldParse gold, itn=0, double drop=0.0): - """ - Update the statistical model. - - Arguments: - doc (Doc): - The example document for the update. - gold (GoldParse): - The gold-standard annotations, to calculate the loss. - Returns (float): - The loss on this example. - """ - self.moves.preprocess_gold(gold) - cdef StateClass stcls = StateClass.init(tokens.c, tokens.length) - self.moves.initialize_state(stcls.c) - cdef Pool mem = Pool() - cdef Example eg = Example( - nr_class=self.moves.n_moves, - nr_atom=CONTEXT_SIZE, - nr_feat=self.model.nr_feat) - cdef weight_t loss = 0 - cdef Transition action - cdef double dropout_rate = self.cfg.get('dropout', drop) - while not stcls.is_final(): - eg.c.nr_feat = self.model.set_featuresC(eg.c.atoms, eg.c.features, - stcls.c) - dropout(eg.c.features, eg.c.nr_feat, dropout_rate) - self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold) - self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat) - guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class) - self.model.update(eg) - - action = self.moves.c[guess] - action.do(stcls.c, action.label) - loss += eg.costs[guess] - eg.fill_scores(0, eg.c.nr_class) - eg.fill_costs(0, eg.c.nr_class) - eg.fill_is_valid(1, eg.c.nr_class) - - self.moves.finalize_state(stcls.c) - return loss - - def step_through(self, Doc doc, GoldParse gold=None): - """ - Set up a stepwise state, to introspect and control the transition sequence. - - Arguments: - doc (Doc): The document to step through. - gold (GoldParse): Optional gold parse - Returns (StepwiseState): - A state object, to step through the annotation process. - """ - return StepwiseState(self, doc, gold=gold) - - def from_transition_sequence(self, Doc doc, sequence): - """Control the annotations on a document by specifying a transition sequence - to follow. - - Arguments: - doc (Doc): The document to annotate. - sequence: A sequence of action names, as unicode strings. - Returns: None - """ - with self.step_through(doc) as stepwise: - for transition in sequence: - stepwise.transition(transition) - - def add_label(self, label): - # Doesn't set label into serializer -- subclasses override it to do that. - for action in self.moves.action_types: - added = self.moves.add_action(action, label) - if added: - # Important that the labels be stored as a list! We need the - # order, or the model goes out of synch - self.cfg.setdefault('extra_labels', []).append(label) - - -cdef int dropout(FeatureC* feats, int nr_feat, float prob) except -1: - if prob <= 0 or prob >= 1.: - return 0 - cdef double[::1] py_probs = numpy.random.uniform(0., 1., nr_feat) - cdef double* probs = &py_probs[0] - for i in range(nr_feat): - if probs[i] >= prob: - feats[i].value /= prob - else: - feats[i].value = 0. - - -cdef class StepwiseState: - cdef readonly StateClass stcls - cdef readonly Example eg - cdef readonly Doc doc - cdef readonly GoldParse gold - cdef readonly Parser parser - - def __init__(self, Parser parser, Doc doc, GoldParse gold=None): - self.parser = parser - self.doc = doc - if gold is not None: - self.gold = gold - self.parser.moves.preprocess_gold(self.gold) - else: - self.gold = GoldParse(doc) - self.stcls = StateClass.init(doc.c, doc.length) - self.parser.moves.initialize_state(self.stcls.c) - self.eg = Example( - nr_class=self.parser.moves.n_moves, - nr_atom=CONTEXT_SIZE, - nr_feat=self.parser.model.nr_feat) - - def __enter__(self): - return self - - def __exit__(self, type, value, traceback): - self.finish() - - @property - def is_final(self): - return self.stcls.is_final() - - @property - def stack(self): - return self.stcls.stack - - @property - def queue(self): - return self.stcls.queue - - @property - def heads(self): - return [self.stcls.H(i) for i in range(self.stcls.c.length)] - - @property - def deps(self): - return [self.doc.vocab.strings[self.stcls.c._sent[i].dep] - for i in range(self.stcls.c.length)] - - @property - def costs(self): - """ - Find the action-costs for the current state. - """ - if not self.gold: - raise ValueError("Can't set costs: No GoldParse provided") - self.parser.moves.set_costs(self.eg.c.is_valid, self.eg.c.costs, - self.stcls, self.gold) - costs = {} - for i in range(self.parser.moves.n_moves): - if not self.eg.c.is_valid[i]: - continue - transition = self.parser.moves.c[i] - name = self.parser.moves.move_name(transition.move, transition.label) - costs[name] = self.eg.c.costs[i] - return costs - - def predict(self): - self.eg.reset() - self.eg.c.nr_feat = self.parser.model.set_featuresC(self.eg.c.atoms, self.eg.c.features, - self.stcls.c) - self.parser.moves.set_valid(self.eg.c.is_valid, self.stcls.c) - self.parser.model.set_scoresC(self.eg.c.scores, - self.eg.c.features, self.eg.c.nr_feat) - - cdef Transition action = self.parser.moves.c[self.eg.guess] - return self.parser.moves.move_name(action.move, action.label) - - def transition(self, action_name=None): - if action_name is None: - action_name = self.predict() - moves = {'S': 0, 'D': 1, 'L': 2, 'R': 3} - if action_name == '_': - action_name = self.predict() - action = self.parser.moves.lookup_transition(action_name) - elif action_name == 'L' or action_name == 'R': - self.predict() - move = moves[action_name] - clas = _arg_max_clas(self.eg.c.scores, move, self.parser.moves.c, - self.eg.c.nr_class) - action = self.parser.moves.c[clas] - else: - action = self.parser.moves.lookup_transition(action_name) - action.do(self.stcls.c, action.label) - - def finish(self): - if self.stcls.is_final(): - self.parser.moves.finalize_state(self.stcls.c) - self.doc.set_parse(self.stcls.c._sent) - self.parser.moves.finalize_doc(self.doc) - - -class ParserStateError(ValueError): - def __init__(self, doc): - ValueError.__init__(self, - "Error analysing doc -- no valid actions available. This should " - "never happen, so please report the error on the issue tracker. " - "Here's the thread to do so --- reopen it if it's closed:\n" - "https://github.com/spacy-io/spaCy/issues/429\n" - "Please include the text that the parser failed on, which is:\n" - "%s" % repr(doc.text)) - -cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs, int n) nogil: - cdef int best = -1 - for i in range(n): - if costs[i] <= 0: - if best == -1 or scores[i] > scores[best]: - best = i - return best - - -cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions, - int nr_class) except -1: - cdef weight_t score = 0 - cdef int mode = -1 - cdef int i - for i in range(nr_class): - if actions[i].move == move and (mode == -1 or scores[i] >= score): - mode = i - score = scores[i] - return mode From 90d1d9b230522124eaefba5172ac28b5b708a215 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 26 Oct 2017 13:22:45 +0200 Subject: [PATCH 4/6] Remove obsolete parser code --- setup.py | 5 ----- spacy/language.py | 1 - spacy/syntax/nn_parser.pyx | 3 --- 3 files changed, 9 deletions(-) diff --git a/setup.py b/setup.py index 2e2b816b7..f7525a3ff 100755 --- a/setup.py +++ b/setup.py @@ -30,19 +30,14 @@ MOD_NAMES = [ 'spacy.syntax._state', 'spacy.syntax._beam_utils', 'spacy.tokenizer', - 'spacy._cfile', - 'spacy.syntax.parser', 'spacy.syntax.nn_parser', - 'spacy.syntax.beam_parser', 'spacy.syntax.nonproj', 'spacy.syntax.transition_system', 'spacy.syntax.arc_eager', - 'spacy.syntax._parse_features', 'spacy.gold', 'spacy.tokens.doc', 'spacy.tokens.span', 'spacy.tokens.token', - 'spacy.cfile', 'spacy.matcher', 'spacy.syntax.ner', 'spacy.symbols', diff --git a/spacy/language.py b/spacy/language.py index c4777898e..34bc49263 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -16,7 +16,6 @@ from .tokenizer import Tokenizer from .vocab import Vocab from .tagger import Tagger from .lemmatizer import Lemmatizer -from .syntax.parser import get_templates from .pipeline import DependencyParser, Tensorizer, Tagger from .pipeline import EntityRecognizer, SimilarityHook, TextCategorizer diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 913d2365f..c592cdc22 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -54,9 +54,6 @@ from .._ml import link_vectors_to_models from .._ml import HistoryFeatures from ..compat import json_dumps, copy_array -from . import _parse_features -from ._parse_features cimport CONTEXT_SIZE -from ._parse_features cimport fill_context from .stateclass cimport StateClass from ._state cimport StateC from . import nonproj From ea03f1ef6431791700aa8458d720de94a31cb68b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 26 Oct 2017 13:23:36 +0200 Subject: [PATCH 5/6] Remove obsolete cfile code --- spacy/_cfile.pxd | 26 ------------ spacy/_cfile.pyx | 88 ---------------------------------------- spacy/cfile.pxd | 33 --------------- spacy/cfile.pyx | 103 ----------------------------------------------- 4 files changed, 250 deletions(-) delete mode 100644 spacy/_cfile.pxd delete mode 100644 spacy/_cfile.pyx delete mode 100644 spacy/cfile.pxd delete mode 100644 spacy/cfile.pyx diff --git a/spacy/_cfile.pxd b/spacy/_cfile.pxd deleted file mode 100644 index cb0077587..000000000 --- a/spacy/_cfile.pxd +++ /dev/null @@ -1,26 +0,0 @@ -from libc.stdio cimport fopen, fclose, fread, fwrite, FILE -from cymem.cymem cimport Pool - -cdef class CFile: - cdef FILE* fp - cdef bint is_open - cdef Pool mem - cdef int size # For compatibility with subclass - cdef int _capacity # For compatibility with subclass - - cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1 - - cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1 - - cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except * - - - -cdef class StringCFile(CFile): - cdef unsigned char* data - - cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1 - - cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1 - - cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except * diff --git a/spacy/_cfile.pyx b/spacy/_cfile.pyx deleted file mode 100644 index ceebe2e59..000000000 --- a/spacy/_cfile.pyx +++ /dev/null @@ -1,88 +0,0 @@ -from libc.stdio cimport fopen, fclose, fread, fwrite, FILE -from libc.string cimport memcpy - - -cdef class CFile: - def __init__(self, loc, mode, on_open_error=None): - if isinstance(mode, unicode): - mode_str = mode.encode('ascii') - else: - mode_str = mode - if hasattr(loc, 'as_posix'): - loc = loc.as_posix() - self.mem = Pool() - cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc - self.fp = fopen(bytes_loc, mode_str) - if self.fp == NULL: - if on_open_error is not None: - on_open_error() - else: - raise IOError("Could not open binary file %s" % bytes_loc) - self.is_open = True - - def __dealloc__(self): - if self.is_open: - fclose(self.fp) - - def close(self): - fclose(self.fp) - self.is_open = False - - cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1: - st = fread(dest, elem_size, number, self.fp) - if st != number: - raise IOError - - cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1: - st = fwrite(src, elem_size, number, self.fp) - if st != number: - raise IOError - - cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *: - cdef void* dest = mem.alloc(number, elem_size) - self.read_into(dest, number, elem_size) - return dest - - def write_unicode(self, unicode value): - cdef bytes py_bytes = value.encode('utf8') - cdef char* chars = py_bytes - self.write(sizeof(char), len(py_bytes), chars) - - -cdef class StringCFile: - def __init__(self, mode, bytes data=b'', on_open_error=None): - self.mem = Pool() - self.is_open = 'w' in mode - self._capacity = max(len(data), 8) - self.size = len(data) - self.data = self.mem.alloc(1, self._capacity) - for i in range(len(data)): - self.data[i] = data[i] - - def close(self): - self.is_open = False - - def string_data(self): - return (self.data-self.size)[:self.size] - - cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1: - memcpy(dest, self.data, elem_size * number) - self.data += elem_size * number - - cdef int write_from(self, void* src, size_t elem_size, size_t number) except -1: - write_size = number * elem_size - if (self.size + write_size) >= self._capacity: - self._capacity = (self.size + write_size) * 2 - self.data = self.mem.realloc(self.data, self._capacity) - memcpy(&self.data[self.size], src, elem_size * number) - self.size += write_size - - cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *: - cdef void* dest = mem.alloc(number, elem_size) - self.read_into(dest, number, elem_size) - return dest - - def write_unicode(self, unicode value): - cdef bytes py_bytes = value.encode('utf8') - cdef char* chars = py_bytes - self.write(sizeof(char), len(py_bytes), chars) diff --git a/spacy/cfile.pxd b/spacy/cfile.pxd deleted file mode 100644 index b95fbb2be..000000000 --- a/spacy/cfile.pxd +++ /dev/null @@ -1,33 +0,0 @@ -from libc.stdio cimport fopen, fclose, fread, fwrite, FILE -from cymem.cymem cimport Pool - -cdef class CFile: - cdef FILE* fp - cdef unsigned char* data - cdef int is_open - cdef Pool mem - cdef int size # For compatibility with subclass - cdef int i # For compatibility with subclass - cdef int _capacity # For compatibility with subclass - - cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1 - - cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1 - - cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except * - - - -cdef class StringCFile: - cdef unsigned char* data - cdef int is_open - cdef Pool mem - cdef int size # For compatibility with subclass - cdef int i # For compatibility with subclass - cdef int _capacity # For compatibility with subclass - - cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1 - - cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1 - - cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except * diff --git a/spacy/cfile.pyx b/spacy/cfile.pyx deleted file mode 100644 index 006ff78ac..000000000 --- a/spacy/cfile.pyx +++ /dev/null @@ -1,103 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from libc.stdio cimport fopen, fclose, fread, fwrite -from libc.string cimport memcpy - - -cdef class CFile: - def __init__(self, loc, mode, on_open_error=None): - if isinstance(mode, unicode): - mode_str = mode.encode('ascii') - else: - mode_str = mode - if hasattr(loc, 'as_posix'): - loc = loc.as_posix() - self.mem = Pool() - cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc - self.fp = fopen(bytes_loc, mode_str) - if self.fp == NULL: - if on_open_error is not None: - on_open_error() - else: - raise IOError("Could not open binary file %s" % bytes_loc) - self.is_open = True - - def __dealloc__(self): - if self.is_open: - fclose(self.fp) - - def close(self): - fclose(self.fp) - self.is_open = False - - cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1: - st = fread(dest, elem_size, number, self.fp) - if st != number: - raise IOError - - cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1: - st = fwrite(src, elem_size, number, self.fp) - if st != number: - raise IOError - - cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *: - cdef void* dest = mem.alloc(number, elem_size) - self.read_into(dest, number, elem_size) - return dest - - def write_unicode(self, unicode value): - cdef bytes py_bytes = value.encode('utf8') - cdef char* chars = py_bytes - self.write(sizeof(char), len(py_bytes), chars) - - -cdef class StringCFile: - def __init__(self, bytes data, mode, on_open_error=None): - self.mem = Pool() - self.is_open = 1 if 'w' in mode else 0 - self._capacity = max(len(data), 8) - self.size = len(data) - self.i = 0 - self.data = self.mem.alloc(1, self._capacity) - for i in range(len(data)): - self.data[i] = data[i] - - def __dealloc__(self): - # Important to override this -- or - # we try to close a non-existant file pointer! - pass - - def close(self): - self.is_open = False - - def string_data(self): - cdef bytes byte_string = b'\0' * (self.size) - bytes_ptr = byte_string - for i in range(self.size): - bytes_ptr[i] = self.data[i] - print(byte_string) - return byte_string - - cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1: - if self.i+(number * elem_size) < self.size: - memcpy(dest, &self.data[self.i], elem_size * number) - self.i += elem_size * number - - cdef int write_from(self, void* src, size_t elem_size, size_t number) except -1: - write_size = number * elem_size - if (self.size + write_size) >= self._capacity: - self._capacity = (self.size + write_size) * 2 - self.data = self.mem.realloc(self.data, self._capacity) - memcpy(&self.data[self.size], src, write_size) - self.size += write_size - - cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *: - cdef void* dest = mem.alloc(number, elem_size) - self.read_into(dest, number, elem_size) - return dest - - def write_unicode(self, unicode value): - cdef bytes py_bytes = value.encode('utf8') - cdef char* chars = py_bytes - self.write(sizeof(char), len(py_bytes), chars) From c52671420c7b2554274009faa976a2788dc16d13 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 26 Oct 2017 13:28:19 +0200 Subject: [PATCH 6/6] Remove old cfile import --- spacy/vocab.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index bcd1f3c10..193509771 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -16,7 +16,6 @@ from .lexeme cimport EMPTY_LEXEME from .lexeme cimport Lexeme from .strings cimport hash_string from .typedefs cimport attr_t -from .cfile cimport CFile from .tokens.token cimport Token from .attrs cimport PROB, LANG from .structs cimport SerializedLexemeC