Fix names of pipeline components

NeuralDependencyParser --> DependencyParser
NeuralEntityRecognizer --> EntityRecognizer
TokenVectorEncoder     --> Tensorizer
NeuralLabeller         --> MultitaskObjective
Matthew Honnibal 2017-10-26 12:38:23 +02:00
parent b6b4f1aaf7
commit b0f3ea2200
11 changed files with 35 additions and 112 deletions
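For downstream code, the rename is a drop-in change: constructor signatures are unchanged, only the class names differ. A minimal migration sketch, assuming the public `spacy.pipeline` import path and a bare `Vocab()` for setup (the calls mirror the updated tests in this commit; exact behaviour is specific to spaCy at this revision):

```python
# Hedged sketch of updating caller code for the rename; only the class
# names change, the constructors keep their signatures.
from spacy.vocab import Vocab
from spacy.pipeline import DependencyParser   # was: NeuralDependencyParser
from spacy.pipeline import EntityRecognizer   # was: NeuralEntityRecognizer
from spacy.pipeline import Tensorizer         # was: TokenVectorEncoder
from spacy.pipeline import Tagger             # was: NeuralTagger

vocab = Vocab()                    # assumed setup; tests use a vocab fixture
parser = DependencyParser(vocab)   # same call as NeuralDependencyParser(vocab)
parser.add_label('nsubj')          # mirrors the parser test fixture in this diff
ner = EntityRecognizer(vocab)
ner.begin_training([])             # the updated NER test now calls this before use
```

MultitaskObjective (the former NeuralLabeller, pipeline name 'nn_labeller') is created internally by the parsers via init_multitask_objectives() and is not normally constructed directly.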

View File

@@ -18,8 +18,8 @@ from .tagger import Tagger
 from .lemmatizer import Lemmatizer
 from .syntax.parser import get_templates
-from .pipeline import NeuralDependencyParser, TokenVectorEncoder, NeuralTagger
-from .pipeline import NeuralEntityRecognizer, SimilarityHook, TextCategorizer
+from .pipeline import DependencyParser, Tensorizer, Tagger
+from .pipeline import EntityRecognizer, SimilarityHook, TextCategorizer
 from .compat import json_dumps, izip, copy_reg
 from .scorer import Scorer
@@ -75,9 +75,6 @@ class BaseDefaults(object):
     infixes = tuple(TOKENIZER_INFIXES)
     tag_map = dict(TAG_MAP)
     tokenizer_exceptions = {}
-    parser_features = get_templates('parser')
-    entity_features = get_templates('ner')
-    tagger_features = Tagger.feature_templates  # TODO -- fix this
     stop_words = set()
     lemma_rules = {}
     lemma_exc = {}
@@ -102,9 +99,9 @@ class Language(object):
     factories = {
         'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp),
         'tensorizer': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg),
-        'tagger': lambda nlp, **cfg: NeuralTagger(nlp.vocab, **cfg),
-        'parser': lambda nlp, **cfg: NeuralDependencyParser(nlp.vocab, **cfg),
-        'ner': lambda nlp, **cfg: NeuralEntityRecognizer(nlp.vocab, **cfg),
+        'tagger': lambda nlp, **cfg: Tagger(nlp.vocab, **cfg),
+        'parser': lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg),
+        'ner': lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg),
         'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
         'textcat': lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg)
     }

View File

@@ -1,21 +0,0 @@
-from .syntax.parser cimport Parser
-#from .syntax.beam_parser cimport BeamParser
-from .syntax.ner cimport BiluoPushDown
-from .syntax.arc_eager cimport ArcEager
-from .tagger cimport Tagger
-
-
-cdef class EntityRecognizer(Parser):
-    pass
-
-
-cdef class DependencyParser(Parser):
-    pass
-
-
-#cdef class BeamEntityRecognizer(BeamParser):
-#    pass
-#
-#
-#cdef class BeamDependencyParser(BeamParser):
-#    pass

View File

@@ -26,11 +26,8 @@ from thinc.neural.util import to_categorical
 from thinc.neural._classes.difference import Siamese, CauchySimilarity
 from .tokens.doc cimport Doc
-from .syntax.parser cimport Parser as LinearParser
-from .syntax.nn_parser cimport Parser as NeuralParser
+from .syntax.nn_parser cimport Parser
 from .syntax import nonproj
-from .syntax.parser import get_templates as get_feature_templates
-from .syntax.beam_parser cimport BeamParser
 from .syntax.ner cimport BiluoPushDown
 from .syntax.arc_eager cimport ArcEager
 from .tagger import Tagger
@@ -217,7 +214,7 @@ def _load_cfg(path):
     return {}


-class TokenVectorEncoder(BaseThincComponent):
+class Tensorizer(BaseThincComponent):
     """Assign position-sensitive vectors to tokens, using a CNN or RNN."""
     name = 'tensorizer'
@@ -329,7 +326,7 @@ class TokenVectorEncoder(BaseThincComponent):
         link_vectors_to_models(self.vocab)


-class NeuralTagger(BaseThincComponent):
+class Tagger(BaseThincComponent):
     name = 'tagger'

     def __init__(self, vocab, model=True, **cfg):
         self.vocab = vocab
@@ -513,7 +510,11 @@ class NeuralTagger(BaseThincComponent):
         return self


-class NeuralLabeller(NeuralTagger):
+class MultitaskObjective(Tagger):
+    '''Assist training of a parser or tagger, by training a side-objective.
+
+    Experimental
+    '''
     name = 'nn_labeller'

     def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
         self.vocab = vocab
@@ -532,7 +533,7 @@ class NeuralLabeller(NeuralTagger):
             self.make_label = target
         else:
             raise ValueError(
-                "NeuralLabeller target should be function or one of "
+                "MultitaskObjective target should be function or one of "
                 "['dep', 'tag', 'ent', 'dep_tag_offset', 'ent_tag']")
         self.cfg = dict(cfg)
         self.cfg.setdefault('cnn_maxout_pieces', 2)
@@ -752,45 +753,7 @@ class TextCategorizer(BaseThincComponent):
         link_vectors_to_models(self.vocab)


-cdef class EntityRecognizer(LinearParser):
-    """Annotate named entities on Doc objects."""
-    TransitionSystem = BiluoPushDown
-    feature_templates = get_feature_templates('ner')
-
-    def add_label(self, label):
-        LinearParser.add_label(self, label)
-        if isinstance(label, basestring):
-            label = self.vocab.strings[label]
-
-
-cdef class BeamEntityRecognizer(BeamParser):
-    """Annotate named entities on Doc objects."""
-    TransitionSystem = BiluoPushDown
-    feature_templates = get_feature_templates('ner')
-
-    def add_label(self, label):
-        LinearParser.add_label(self, label)
-        if isinstance(label, basestring):
-            label = self.vocab.strings[label]
-
-
-cdef class DependencyParser(LinearParser):
-    TransitionSystem = ArcEager
-    feature_templates = get_feature_templates('basic')
-
-    def add_label(self, label):
-        LinearParser.add_label(self, label)
-        if isinstance(label, basestring):
-            label = self.vocab.strings[label]
-
-    @property
-    def postprocesses(self):
-        return [nonproj.deprojectivize]
-
-
-cdef class NeuralDependencyParser(NeuralParser):
+cdef class DependencyParser(Parser):
     name = 'parser'
     TransitionSystem = ArcEager
@@ -800,17 +763,17 @@ cdef class NeuralDependencyParser(NeuralParser):
     def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
         for target in []:
-            labeller = NeuralLabeller(self.vocab, target=target)
+            labeller = MultitaskObjective(self.vocab, target=target)
             tok2vec = self.model[0]
             labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
             pipeline.append(labeller)
             self._multitasks.append(labeller)

     def __reduce__(self):
-        return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None)
+        return (DependencyParser, (self.vocab, self.moves, self.model), None, None)


-cdef class NeuralEntityRecognizer(NeuralParser):
+cdef class EntityRecognizer(Parser):
     name = 'ner'
     TransitionSystem = BiluoPushDown
@@ -818,31 +781,14 @@ cdef class NeuralEntityRecognizer(NeuralParser):
     def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
         for target in []:
-            labeller = NeuralLabeller(self.vocab, target=target)
+            labeller = MultitaskObjective(self.vocab, target=target)
             tok2vec = self.model[0]
             labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
             pipeline.append(labeller)
             self._multitasks.append(labeller)

     def __reduce__(self):
-        return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None)
+        return (EntityRecognizer, (self.vocab, self.moves, self.model), None, None)


-cdef class BeamDependencyParser(BeamParser):
-    TransitionSystem = ArcEager
-    feature_templates = get_feature_templates('basic')
-
-    def add_label(self, label):
-        Parser.add_label(self, label)
-        if isinstance(label, basestring):
-            label = self.vocab.strings[label]
-
-    @property
-    def postprocesses(self):
-        return [nonproj.deprojectivize]
-
-
-__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'BeamDependencyParser',
-           'BeamEntityRecognizer', 'TokenVectorEnoder']
+__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer']

View File

@@ -10,7 +10,8 @@ import pytest
 def test_doc_add_entities_set_ents_iob(en_vocab):
     text = ["This", "is", "a", "lion"]
     doc = get_doc(en_vocab, text)
-    ner = EntityRecognizer(en_vocab, features=[(2,), (3,)])
+    ner = EntityRecognizer(en_vocab)
+    ner.begin_training([])
     ner(doc)

     assert len(list(doc.ents)) == 0

View File

@@ -9,7 +9,7 @@ from ...attrs import NORM
 from ...gold import GoldParse
 from ...vocab import Vocab
 from ...tokens import Doc
-from ...pipeline import NeuralDependencyParser
+from ...pipeline import DependencyParser

 numpy.random.seed(0)
@@ -21,7 +21,7 @@ def vocab():
 @pytest.fixture
 def parser(vocab):
-    parser = NeuralDependencyParser(vocab)
+    parser = DependencyParser(vocab)
     parser.cfg['token_vector_width'] = 8
     parser.cfg['hidden_width'] = 30
     parser.cfg['hist_size'] = 0

View File

@@ -6,7 +6,7 @@ import numpy
 from ..._ml import chain, Tok2Vec, doc2feats
 from ...vocab import Vocab
-from ...pipeline import TokenVectorEncoder
+from ...pipeline import Tensorizer
 from ...syntax.arc_eager import ArcEager
 from ...syntax.nn_parser import Parser
 from ...tokens.doc import Doc

View File

@@ -8,7 +8,7 @@ from ...attrs import NORM
 from ...gold import GoldParse
 from ...vocab import Vocab
 from ...tokens import Doc
-from ...pipeline import NeuralDependencyParser
+from ...pipeline import DependencyParser


 @pytest.fixture
 def vocab():
@@ -16,7 +16,7 @@ def vocab():
 @pytest.fixture
 def parser(vocab):
-    parser = NeuralDependencyParser(vocab)
+    parser = DependencyParser(vocab)
     parser.cfg['token_vector_width'] = 4
     parser.cfg['hidden_width'] = 32
     #parser.add_label('right')

View File

@@ -1,11 +1,11 @@
 import pytest
-from ...pipeline import NeuralDependencyParser
+from ...pipeline import DependencyParser


 @pytest.fixture
 def parser(en_vocab):
-    parser = NeuralDependencyParser(en_vocab)
+    parser = DependencyParser(en_vocab)
     parser.add_label('nsubj')
     parser.model, cfg = parser.Model(parser.moves.n_moves)
     parser.cfg.update(cfg)
@@ -14,7 +14,7 @@ def parser(en_vocab):
 @pytest.fixture
 def blank_parser(en_vocab):
-    parser = NeuralDependencyParser(en_vocab)
+    parser = DependencyParser(en_vocab)
     return parser

View File

@@ -2,8 +2,8 @@
 from __future__ import unicode_literals

 from ..util import make_tempdir
-from ...pipeline import NeuralDependencyParser as DependencyParser
-from ...pipeline import NeuralEntityRecognizer as EntityRecognizer
+from ...pipeline import DependencyParser
+from ...pipeline import EntityRecognizer

 import pytest

View File

@@ -2,7 +2,7 @@
 from __future__ import unicode_literals

 from ..util import make_tempdir
-from ...pipeline import NeuralTagger as Tagger
+from ...pipeline import Tagger

 import pytest

View File

@@ -2,7 +2,7 @@
 from __future__ import unicode_literals

 from ..util import make_tempdir
-from ...pipeline import TokenVectorEncoder as Tensorizer
+from ...pipeline import Tensorizer

 import pytest