mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Merge branch 'develop' into feature/dot-underscore
This commit is contained in:
commit
e0ff145a8b
|
@ -311,7 +311,7 @@ def link_vectors_to_models(vocab):
|
||||||
|
|
||||||
def Tok2Vec(width, embed_size, **kwargs):
|
def Tok2Vec(width, embed_size, **kwargs):
|
||||||
pretrained_dims = kwargs.get('pretrained_dims', 0)
|
pretrained_dims = kwargs.get('pretrained_dims', 0)
|
||||||
cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 3)
|
cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2)
|
||||||
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
||||||
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add,
|
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add,
|
||||||
'*': reapply}):
|
'*': reapply}):
|
||||||
|
|
|
@ -68,6 +68,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
|
||||||
if not isinstance(meta, dict):
|
if not isinstance(meta, dict):
|
||||||
prints("Expected dict but got: {}".format(type(meta)),
|
prints("Expected dict but got: {}".format(type(meta)),
|
||||||
title="Not a valid meta.json format", exits=1)
|
title="Not a valid meta.json format", exits=1)
|
||||||
|
meta.setdefault('lang', lang)
|
||||||
|
meta.setdefault('name', 'unnamed')
|
||||||
|
|
||||||
pipeline = ['tagger', 'parser', 'ner']
|
pipeline = ['tagger', 'parser', 'ner']
|
||||||
if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger')
|
if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger')
|
||||||
|
@ -89,6 +91,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
|
||||||
|
|
||||||
lang_class = util.get_lang_class(lang)
|
lang_class = util.get_lang_class(lang)
|
||||||
nlp = lang_class()
|
nlp = lang_class()
|
||||||
|
meta['pipeline'] = pipeline
|
||||||
|
nlp.meta.update(meta)
|
||||||
if vectors:
|
if vectors:
|
||||||
util.load_model(vectors, vocab=nlp.vocab)
|
util.load_model(vectors, vocab=nlp.vocab)
|
||||||
for name in pipeline:
|
for name in pipeline:
|
||||||
|
|
|
@ -213,7 +213,7 @@ class GoldCorpus(object):
|
||||||
train_tuples = self.train_tuples
|
train_tuples = self.train_tuples
|
||||||
if projectivize:
|
if projectivize:
|
||||||
train_tuples = nonproj.preprocess_training_data(
|
train_tuples = nonproj.preprocess_training_data(
|
||||||
self.train_tuples)
|
self.train_tuples, label_freq_cutoff=100)
|
||||||
random.shuffle(train_tuples)
|
random.shuffle(train_tuples)
|
||||||
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
|
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
|
||||||
max_length=max_length,
|
max_length=max_length,
|
||||||
|
|
|
@ -24,6 +24,8 @@ class Lemmatizer(object):
|
||||||
univ_pos = 'adj'
|
univ_pos = 'adj'
|
||||||
elif univ_pos == PUNCT:
|
elif univ_pos == PUNCT:
|
||||||
univ_pos = 'punct'
|
univ_pos = 'punct'
|
||||||
|
else:
|
||||||
|
return set([string.lower()])
|
||||||
# See Issue #435 for an example of where this logic is required.
|
# See Issue #435 for an example of where this logic is required.
|
||||||
if self.is_base_form(univ_pos, morphology):
|
if self.is_base_form(univ_pos, morphology):
|
||||||
return set([string.lower()])
|
return set([string.lower()])
|
||||||
|
|
|
@ -35,6 +35,8 @@ cdef class Morphology:
|
||||||
cdef RichTagC* rich_tags
|
cdef RichTagC* rich_tags
|
||||||
cdef PreshMapArray _cache
|
cdef PreshMapArray _cache
|
||||||
|
|
||||||
|
cdef int assign_untagged(self, TokenC* token) except -1
|
||||||
|
|
||||||
cdef int assign_tag(self, TokenC* token, tag) except -1
|
cdef int assign_tag(self, TokenC* token, tag) except -1
|
||||||
|
|
||||||
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
|
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
|
||||||
|
|
|
@ -42,7 +42,7 @@ cdef class Morphology:
|
||||||
self.tag_names = tuple(sorted(tag_map.keys()))
|
self.tag_names = tuple(sorted(tag_map.keys()))
|
||||||
self.reverse_index = {}
|
self.reverse_index = {}
|
||||||
|
|
||||||
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
|
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC))
|
||||||
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
|
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
|
||||||
self.tag_map[tag_str] = dict(attrs)
|
self.tag_map[tag_str] = dict(attrs)
|
||||||
attrs = _normalize_props(attrs)
|
attrs = _normalize_props(attrs)
|
||||||
|
@ -52,6 +52,10 @@ cdef class Morphology:
|
||||||
self.rich_tags[i].morph = 0
|
self.rich_tags[i].morph = 0
|
||||||
self.rich_tags[i].pos = attrs[POS]
|
self.rich_tags[i].pos = attrs[POS]
|
||||||
self.reverse_index[self.rich_tags[i].name] = i
|
self.reverse_index[self.rich_tags[i].name] = i
|
||||||
|
# Add a 'null' tag, which we can reference when assigning morphology to
|
||||||
|
# untagged tokens.
|
||||||
|
self.rich_tags[self.n_tags].id = self.n_tags
|
||||||
|
|
||||||
self._cache = PreshMapArray(self.n_tags)
|
self._cache = PreshMapArray(self.n_tags)
|
||||||
self.exc = {}
|
self.exc = {}
|
||||||
if exc is not None:
|
if exc is not None:
|
||||||
|
@ -62,6 +66,11 @@ cdef class Morphology:
|
||||||
return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
|
return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
|
||||||
self.exc), None, None)
|
self.exc), None, None)
|
||||||
|
|
||||||
|
cdef int assign_untagged(self, TokenC* token) except -1:
|
||||||
|
'''Set morphological attributes on a token without a POS tag.'''
|
||||||
|
if token.lemma == 0:
|
||||||
|
token.lemma = self.lemmatize(0, token.lex.orth, {})
|
||||||
|
|
||||||
cdef int assign_tag(self, TokenC* token, tag) except -1:
|
cdef int assign_tag(self, TokenC* token, tag) except -1:
|
||||||
if isinstance(tag, basestring):
|
if isinstance(tag, basestring):
|
||||||
tag = self.strings.add(tag)
|
tag = self.strings.add(tag)
|
||||||
|
@ -72,7 +81,7 @@ cdef class Morphology:
|
||||||
token.tag = tag
|
token.tag = tag
|
||||||
|
|
||||||
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
|
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
|
||||||
if tag_id >= self.n_tags:
|
if tag_id > self.n_tags:
|
||||||
raise ValueError("Unknown tag ID: %s" % tag_id)
|
raise ValueError("Unknown tag ID: %s" % tag_id)
|
||||||
# TODO: It's pretty arbitrary to put this logic here. I guess the justification
|
# TODO: It's pretty arbitrary to put this logic here. I guess the justification
|
||||||
# is that this is where the specific word and the tag interact. Still,
|
# is that this is where the specific word and the tag interact. Still,
|
||||||
|
@ -151,8 +160,6 @@ cdef class Morphology:
|
||||||
cdef unicode py_string = self.strings[orth]
|
cdef unicode py_string = self.strings[orth]
|
||||||
if self.lemmatizer is None:
|
if self.lemmatizer is None:
|
||||||
return self.strings.add(py_string.lower())
|
return self.strings.add(py_string.lower())
|
||||||
if univ_pos not in (NOUN, VERB, ADJ, PUNCT):
|
|
||||||
return self.strings.add(py_string.lower())
|
|
||||||
cdef set lemma_strings
|
cdef set lemma_strings
|
||||||
cdef unicode lemma_string
|
cdef unicode lemma_string
|
||||||
lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
|
lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
|
||||||
|
|
|
@ -239,13 +239,13 @@ cdef class Parser:
|
||||||
"""
|
"""
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, nr_class, **cfg):
|
def Model(cls, nr_class, **cfg):
|
||||||
depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 0))
|
depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1))
|
||||||
token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 128))
|
token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 64))
|
||||||
hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 128))
|
hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 64))
|
||||||
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 3))
|
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 2))
|
||||||
embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000))
|
embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000))
|
||||||
hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0))
|
hist_size = util.env_opt('history_feats', cfg.get('hist_size', 4))
|
||||||
hist_width = util.env_opt('history_width', cfg.get('hist_width', 0))
|
hist_width = util.env_opt('history_width', cfg.get('hist_width', 16))
|
||||||
if hist_size >= 1 and depth == 0:
|
if hist_size >= 1 and depth == 0:
|
||||||
raise ValueError("Inconsistent hyper-params: "
|
raise ValueError("Inconsistent hyper-params: "
|
||||||
"history_feats >= 1 but parser_hidden_depth==0")
|
"history_feats >= 1 but parser_hidden_depth==0")
|
||||||
|
@ -800,16 +800,25 @@ cdef class Parser:
|
||||||
if self.model not in (True, False, None) and resized:
|
if self.model not in (True, False, None) and resized:
|
||||||
# Weights are stored in (nr_out, nr_in) format, so we're basically
|
# Weights are stored in (nr_out, nr_in) format, so we're basically
|
||||||
# just adding rows here.
|
# just adding rows here.
|
||||||
smaller = self.model[-1]._layers[-1]
|
if self.model[-1].is_noop:
|
||||||
larger = Affine(self.moves.n_moves, smaller.nI)
|
smaller = self.model[1]
|
||||||
copy_array(larger.W[:smaller.nO], smaller.W)
|
dims = dict(self.model[1]._dims)
|
||||||
copy_array(larger.b[:smaller.nO], smaller.b)
|
dims['nO'] = self.moves.n_moves
|
||||||
self.model[-1]._layers[-1] = larger
|
larger = self.model[1].__class__(**dims)
|
||||||
|
copy_array(larger.W[:, :smaller.nO], smaller.W)
|
||||||
|
copy_array(larger.b[:smaller.nO], smaller.b)
|
||||||
|
self.model = (self.model[0], larger, self.model[2])
|
||||||
|
else:
|
||||||
|
smaller = self.model[-1]._layers[-1]
|
||||||
|
larger = Affine(self.moves.n_moves, smaller.nI)
|
||||||
|
copy_array(larger.W[:smaller.nO], smaller.W)
|
||||||
|
copy_array(larger.b[:smaller.nO], smaller.b)
|
||||||
|
self.model[-1]._layers[-1] = larger
|
||||||
|
|
||||||
def begin_training(self, gold_tuples, pipeline=None, **cfg):
|
def begin_training(self, gold_tuples, pipeline=None, **cfg):
|
||||||
if 'model' in cfg:
|
if 'model' in cfg:
|
||||||
self.model = cfg['model']
|
self.model = cfg['model']
|
||||||
gold_tuples = nonproj.preprocess_training_data(gold_tuples)
|
gold_tuples = nonproj.preprocess_training_data(gold_tuples, label_freq_cutoff=100)
|
||||||
actions = self.moves.get_actions(gold_parses=gold_tuples)
|
actions = self.moves.get_actions(gold_parses=gold_tuples)
|
||||||
for action, labels in actions.items():
|
for action, labels in actions.items():
|
||||||
for label in labels:
|
for label in labels:
|
||||||
|
|
37
spacy/tests/doc/test_creation.py
Normal file
37
spacy/tests/doc/test_creation.py
Normal file
|
@ -0,0 +1,37 @@
|
||||||
|
'''Test Doc sets up tokens correctly.'''
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from ...vocab import Vocab
|
||||||
|
from ...tokens.doc import Doc
|
||||||
|
from ...lemmatizerlookup import Lemmatizer
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def lemmatizer():
|
||||||
|
return Lemmatizer({'dogs': 'dog', 'boxen': 'box', 'mice': 'mouse'})
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def vocab(lemmatizer):
|
||||||
|
return Vocab(lemmatizer=lemmatizer)
|
||||||
|
|
||||||
|
|
||||||
|
def test_empty_doc(vocab):
|
||||||
|
doc = Doc(vocab)
|
||||||
|
assert len(doc) == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_single_word(vocab):
|
||||||
|
doc = Doc(vocab, words=['a'])
|
||||||
|
assert doc.text == 'a '
|
||||||
|
doc = Doc(vocab, words=['a'], spaces=[False])
|
||||||
|
assert doc.text == 'a'
|
||||||
|
|
||||||
|
|
||||||
|
def test_lookup_lemmatization(vocab):
|
||||||
|
doc = Doc(vocab, words=['dogs', 'dogses'])
|
||||||
|
assert doc[0].text == 'dogs'
|
||||||
|
assert doc[0].lemma_ == 'dog'
|
||||||
|
assert doc[1].text == 'dogses'
|
||||||
|
assert doc[1].lemma_ == 'dogses'
|
|
@ -22,14 +22,14 @@ def vocab():
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def parser(vocab):
|
def parser(vocab):
|
||||||
parser = NeuralDependencyParser(vocab)
|
parser = NeuralDependencyParser(vocab)
|
||||||
parser.cfg['token_vector_width'] = 4
|
parser.cfg['token_vector_width'] = 8
|
||||||
parser.cfg['hidden_width'] = 6
|
parser.cfg['hidden_width'] = 30
|
||||||
parser.cfg['hist_size'] = 0
|
parser.cfg['hist_size'] = 0
|
||||||
parser.add_label('left')
|
parser.add_label('left')
|
||||||
parser.begin_training([], **parser.cfg)
|
parser.begin_training([], **parser.cfg)
|
||||||
sgd = Adam(NumpyOps(), 0.001)
|
sgd = Adam(NumpyOps(), 0.001)
|
||||||
|
|
||||||
for i in range(30):
|
for i in range(10):
|
||||||
losses = {}
|
losses = {}
|
||||||
doc = Doc(vocab, words=['a', 'b', 'c', 'd'])
|
doc = Doc(vocab, words=['a', 'b', 'c', 'd'])
|
||||||
gold = GoldParse(doc, heads=[1, 1, 3, 3],
|
gold = GoldParse(doc, heads=[1, 1, 3, 3],
|
||||||
|
@ -37,6 +37,8 @@ def parser(vocab):
|
||||||
parser.update([doc], [gold], sgd=sgd, losses=losses)
|
parser.update([doc], [gold], sgd=sgd, losses=losses)
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
|
def test_init_parser(parser):
|
||||||
|
pass
|
||||||
|
|
||||||
def test_add_label(parser):
|
def test_add_label(parser):
|
||||||
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
|
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
|
||||||
|
|
|
@ -64,7 +64,7 @@ def test_sents_1_3(parser):
|
||||||
doc[1].sent_start = True
|
doc[1].sent_start = True
|
||||||
doc[3].sent_start = True
|
doc[3].sent_start = True
|
||||||
doc = parser(doc)
|
doc = parser(doc)
|
||||||
assert len(list(doc.sents)) == 4
|
assert len(list(doc.sents)) >= 3
|
||||||
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
|
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
|
||||||
doc[1].sent_start = True
|
doc[1].sent_start = True
|
||||||
doc[2].sent_start = False
|
doc[2].sent_start = False
|
||||||
|
|
|
@ -533,6 +533,8 @@ cdef class Doc:
|
||||||
assert t.lex.orth != 0
|
assert t.lex.orth != 0
|
||||||
t.spacy = has_space
|
t.spacy = has_space
|
||||||
self.length += 1
|
self.length += 1
|
||||||
|
# Set morphological attributes, e.g. by lemma, if possible
|
||||||
|
self.vocab.morphology.assign_untagged(t)
|
||||||
self._py_tokens.append(None)
|
self._py_tokens.append(None)
|
||||||
return t.idx + t.lex.length + t.spacy
|
return t.idx + t.lex.length + t.spacy
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user