Merge github.com:explosion/spaCy into dutch

Janneke van der Zwaan 2016-11-30 17:13:39 +01:00
commit 88869e0e07
26 changed files with 681 additions and 188 deletions

.gitignore vendored
View File

@@ -93,6 +93,9 @@ coverage.xml
# Mac OS X
*.DS_Store
# Temporary files / Dropbox hack
*.~*
# Komodo project files
*.komodoproject

View File

@@ -14,9 +14,11 @@ This is a list of everyone who has made significant contributions to spaCy, in a
* Kendrick Tan, [@kendricktan](https://github.com/kendricktan)
* Kyle P. Johnson, [@kylepjohnson](https://github.com/kylepjohnson)
* Liling Tan, [@alvations](https://github.com/alvations)
* Mark Amery, [@ExplodingCabbage](https://github.com/ExplodingCabbage)
* Matthew Honnibal, [@honnibal](https://github.com/honnibal)
* Maxim Samsonov, [@maxirmx](https://github.com/maxirmx)
* Oleg Zd, [@olegzd](https://github.com/olegzd)
* Pokey Rule, [@pokey](https://github.com/pokey)
* Sam Bozek, [@sambozek](https://github.com/sambozek)
* Sasho Savkov, [@savkov](https://github.com/savkov)
* Tiago Rodrigues, [@TiagoMRodrigues](https://github.com/TiagoMRodrigues)

View File

@@ -100,7 +100,7 @@ def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False
nlp.entity(tokens)
else:
tokens = nlp(raw_text)
gold = GoldParse(tokens, annot_tuples)
gold = GoldParse.from_annot_tuples(tokens, annot_tuples)
scorer.score(tokens, gold, verbose=verbose)
return scorer

View File

@@ -1,3 +1,4 @@
from __future__ import unicode_literals
import plac
import json
from os import path
@@ -5,106 +6,25 @@ import shutil
import os
import random
import io
import pathlib
from spacy.syntax.util import Config
from spacy.tokens import Doc
from spacy.syntax.nonproj import PseudoProjectivity
from spacy.language import Language
from spacy.gold import GoldParse
from spacy.tokenizer import Tokenizer
from spacy.vocab import Vocab
from spacy.tagger import Tagger
from spacy.syntax.parser import Parser
from spacy.syntax.arc_eager import ArcEager
from spacy.pipeline import DependencyParser
from spacy.syntax.parser import get_templates
from spacy.syntax.arc_eager import ArcEager
from spacy.scorer import Scorer
import spacy.attrs
from spacy.language import Language
from spacy.tagger import W_orth
TAGGER_TEMPLATES = (
(W_orth,),
)
try:
from codecs import open
except ImportError:
pass
import io
class TreebankParser(object):
@staticmethod
def setup_model_dir(model_dir, labels, templates, feat_set='basic', seed=0):
dep_model_dir = path.join(model_dir, 'deps')
pos_model_dir = path.join(model_dir, 'pos')
if path.exists(dep_model_dir):
shutil.rmtree(dep_model_dir)
if path.exists(pos_model_dir):
shutil.rmtree(pos_model_dir)
os.mkdir(dep_model_dir)
os.mkdir(pos_model_dir)
Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
labels=labels)
@classmethod
def from_dir(cls, tag_map, model_dir):
vocab = Vocab(tag_map=tag_map, get_lex_attr=Language.default_lex_attrs())
vocab.get_lex_attr[spacy.attrs.LANG] = lambda _: 0
tokenizer = Tokenizer(vocab, {}, None, None, None)
tagger = Tagger.blank(vocab, TAGGER_TEMPLATES)
cfg = Config.read(path.join(model_dir, 'deps'), 'config')
parser = Parser.from_dir(path.join(model_dir, 'deps'), vocab.strings, ArcEager)
return cls(vocab, tokenizer, tagger, parser)
def __init__(self, vocab, tokenizer, tagger, parser):
self.vocab = vocab
self.tokenizer = tokenizer
self.tagger = tagger
self.parser = parser
def train(self, words, tags, heads, deps):
tokens = self.tokenizer.tokens_from_list(list(words))
self.tagger.train(tokens, tags)
tokens = self.tokenizer.tokens_from_list(list(words))
ids = range(len(words))
ner = ['O'] * len(words)
gold = GoldParse(tokens, ((ids, words, tags, heads, deps, ner)),
make_projective=False)
self.tagger(tokens)
if gold.is_projective:
try:
self.parser.train(tokens, gold)
except:
for id_, word, head, dep in zip(ids, words, heads, deps):
print(id_, word, head, dep)
raise
def __call__(self, words, tags=None):
tokens = self.tokenizer.tokens_from_list(list(words))
if tags is None:
self.tagger(tokens)
else:
self.tagger.tag_from_strings(tokens, tags)
self.parser(tokens)
return tokens
def end_training(self, data_dir):
self.parser.model.end_training()
self.parser.model.dump(path.join(data_dir, 'deps', 'model'))
self.tagger.model.end_training()
self.tagger.model.dump(path.join(data_dir, 'pos', 'model'))
strings_loc = path.join(data_dir, 'vocab', 'strings.json')
with io.open(strings_loc, 'w', encoding='utf8') as file_:
self.vocab.strings.dump(file_)
self.vocab.dump(path.join(data_dir, 'vocab', 'lexemes.bin'))
def read_conllx(loc):
with open(loc, 'r', 'utf8') as file_:
with io.open(loc, 'r', encoding='utf8') as file_:
text = file_.read()
for sent in text.strip().split('\n\n'):
lines = sent.strip().split('\n')
@@ -113,24 +33,31 @@ def read_conllx(loc):
lines.pop(0)
tokens = []
for line in lines:
id_, word, lemma, pos, tag, morph, head, dep, _1, _2 = line.split()
id_, word, lemma, tag, pos, morph, head, dep, _1, _2 = line.split()
if '-' in id_:
continue
id_ = int(id_) - 1
head = (int(head) - 1) if head != '0' else id_
dep = 'ROOT' if dep == 'root' else dep
tokens.append((id_, word, tag, head, dep, 'O'))
tuples = zip(*tokens)
yield (None, [(tuples, [])])
try:
id_ = int(id_) - 1
head = (int(head) - 1) if head != '0' else id_
dep = 'ROOT' if dep == 'root' else dep
tokens.append((id_, word, tag, head, dep, 'O'))
except:
print(line)
raise
tuples = [list(t) for t in zip(*tokens)]
yield (None, [[tuples, []]])
def score_model(nlp, gold_docs, verbose=False):
def score_model(vocab, tagger, parser, gold_docs, verbose=False):
scorer = Scorer()
for _, gold_doc in gold_docs:
for annot_tuples, _ in gold_doc:
tokens = nlp(list(annot_tuples[1]), tags=list(annot_tuples[2]))
gold = GoldParse(tokens, annot_tuples)
scorer.score(tokens, gold, verbose=verbose)
for (ids, words, tags, heads, deps, entities), _ in gold_doc:
doc = Doc(vocab, words=words)
tagger(doc)
parser(doc)
PseudoProjectivity.deprojectivize(doc)
gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
scorer.score(doc, gold, verbose=verbose)
return scorer
@@ -138,22 +65,45 @@ def main(train_loc, dev_loc, model_dir, tag_map_loc):
with open(tag_map_loc) as file_:
tag_map = json.loads(file_.read())
train_sents = list(read_conllx(train_loc))
labels = ArcEager.get_labels(train_sents)
templates = get_templates('basic')
train_sents = PseudoProjectivity.preprocess_training_data(train_sents)
TreebankParser.setup_model_dir(model_dir, labels, templates)
actions = ArcEager.get_actions(gold_parses=train_sents)
features = get_templates('basic')
nlp = TreebankParser.from_dir(tag_map, model_dir)
model_dir = pathlib.Path(model_dir)
with (model_dir / 'deps' / 'config.json').open('w') as file_:
json.dump({'pseudoprojective': True, 'labels': actions, 'features': features}, file_)
vocab = Vocab(lex_attr_getters=Language.Defaults.lex_attr_getters, tag_map=tag_map)
# Populate vocab
for _, doc_sents in train_sents:
for (ids, words, tags, heads, deps, ner), _ in doc_sents:
for word in words:
_ = vocab[word]
for dep in deps:
_ = vocab[dep]
for tag in tags:
_ = vocab[tag]
for tag in tags:
assert tag in tag_map, repr(tag)
tagger = Tagger(vocab, tag_map=tag_map)
parser = DependencyParser(vocab, actions=actions, features=features)
for itn in range(15):
for _, doc_sents in train_sents:
for (ids, words, tags, heads, deps, ner), _ in doc_sents:
nlp.train(words, tags, heads, deps)
doc = Doc(vocab, words=words)
gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
tagger(doc)
parser.update(doc, gold)
doc = Doc(vocab, words=words)
tagger.update(doc, gold)
random.shuffle(train_sents)
scorer = score_model(nlp, read_conllx(dev_loc))
scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
print('%d:\t%.3f\t%.3f' % (itn, scorer.uas, scorer.tags_acc))
nlp = Language(vocab=vocab, tagger=tagger, parser=parser)
nlp.end_training(model_dir)
scorer = score_model(nlp, read_conllx(dev_loc))
scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))

View File

@@ -86,5 +86,48 @@ IDS = {
"LANG": LANG,
}
# ATTR IDs, in order of the symbol
NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
'''Normalize a dictionary of attributes, converting them to ints.
Arguments:
stringy_attrs (dict):
Dictionary keyed by attribute string names. Values can be ints or strings.
strings_map (StringStore):
Defaults to None. If provided, encodes string values into ints.
Returns:
inty_attrs (dict):
Attributes dictionary with keys and optionally values converted to
ints.
'''
inty_attrs = {}
if _do_deprecated:
if 'F' in stringy_attrs:
stringy_attrs["ORTH"] = stringy_attrs.pop("F")
if 'L' in stringy_attrs:
stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
if 'pos' in stringy_attrs:
stringy_attrs["TAG"] = stringy_attrs.pop("pos")
if 'morph' in stringy_attrs:
morphs = stringy_attrs.pop('morph')
if 'number' in stringy_attrs:
stringy_attrs.pop('number')
if 'tenspect' in stringy_attrs:
stringy_attrs.pop('tenspect')
# for name, value in morphs.items():
# stringy_attrs[name] = value
for name, value in stringy_attrs.items():
if isinstance(name, int):
int_key = name
else:
int_key = IDS[name.upper()]
if strings_map is not None and isinstance(value, basestring):
value = strings_map[value]
inty_attrs[int_key] = value
return inty_attrs
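A minimal usage sketch of the new helper, mirroring the tests added in this commit (intify_attrs and the ORTH/LEMMA ID constants come from spacy.attrs):

from spacy.attrs import intify_attrs, ORTH, LEMMA

# String keys are normalized to integer attribute IDs; values stay as-is.
assert intify_attrs({'ORTH': 'dog'}) == {ORTH: 'dog'}
# With a strings_map, string values are encoded to ints as well.
assert intify_attrs({'lemma': 'dog'}, strings_map={'dog': 10}) == {LEMMA: 10}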

View File

@@ -19,6 +19,7 @@ cdef class GoldParse:
cdef int length
cdef readonly int loss
cdef readonly list words
cdef readonly list tags
cdef readonly list heads
cdef readonly list labels

View File

@@ -19,6 +19,8 @@ def tags_to_entities(tags):
entities = []
start = None
for i, tag in enumerate(tags):
if tag is None:
continue
if tag.startswith('O'):
# TODO: We shouldn't be getting these malformed inputs. Fix this.
if start is not None:
@@ -249,7 +251,7 @@ cdef class GoldParse:
if deps is None:
deps = [None for _ in doc]
if entities is None:
entities = [None for _ in doc]
entities = ['-' for _ in doc]
elif len(entities) == 0:
entities = ['O' for _ in doc]
elif not isinstance(entities[0], basestring):
@@ -266,6 +268,7 @@
self.c.labels = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
self.words = [None] * len(doc)
self.tags = [None] * len(doc)
self.heads = [None] * len(doc)
self.labels = [''] * len(doc)
@@ -279,6 +282,7 @@
for i, gold_i in enumerate(self.cand_to_gold):
if doc[i].text.isspace():
self.words[i] = doc[i].text
self.tags[i] = 'SP'
self.heads[i] = None
self.labels[i] = None
@@ -286,6 +290,7 @@
if gold_i is None:
pass
else:
self.words[i] = words[gold_i]
self.tags[i] = tags[gold_i]
self.heads[i] = self.gold_to_cand[heads[gold_i]]
self.labels[i] = deps[gold_i]

View File

@@ -5,10 +5,7 @@ import pathlib
from contextlib import contextmanager
import shutil
try:
import ujson as json
except ImportError:
import json
import ujson as json
try:
@@ -31,6 +28,8 @@ from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD, PROB, LANG, IS_STOP
from .syntax.parser import get_templates
from .syntax.nonproj import PseudoProjectivity
from .pipeline import DependencyParser, EntityRecognizer
from .syntax.arc_eager import ArcEager
from .syntax.ner import BiluoPushDown
class BaseDefaults(object):
@@ -80,7 +79,7 @@ class BaseDefaults(object):
else:
infix_finditer = None
vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
return Tokenizer(nlp.vocab, rules=rules,
return Tokenizer(vocab, rules=rules,
prefix_search=prefix_search, suffix_search=suffix_search,
infix_finditer=infix_finditer)
@@ -96,26 +95,27 @@ class BaseDefaults(object):
return Tagger.load(nlp.path / 'pos', nlp.vocab)
@classmethod
def create_parser(cls, nlp=None):
def create_parser(cls, nlp=None, **cfg):
if nlp is None:
return DependencyParser(cls.create_vocab(), features=cls.parser_features)
return DependencyParser(cls.create_vocab(), features=cls.parser_features,
**cfg)
elif nlp.path is False:
return DependencyParser(nlp.vocab, features=cls.parser_features)
return DependencyParser(nlp.vocab, features=cls.parser_features, **cfg)
elif nlp.path is None or not (nlp.path / 'deps').exists():
return None
else:
return DependencyParser.load(nlp.path / 'deps', nlp.vocab)
return DependencyParser.load(nlp.path / 'deps', nlp.vocab, **cfg)
@classmethod
def create_entity(cls, nlp=None):
def create_entity(cls, nlp=None, **cfg):
if nlp is None:
return EntityRecognizer(cls.create_vocab(), features=cls.entity_features)
return EntityRecognizer(cls.create_vocab(), features=cls.entity_features, **cfg)
elif nlp.path is False:
return EntityRecognizer(nlp.vocab, features=cls.entity_features)
return EntityRecognizer(nlp.vocab, features=cls.entity_features, **cfg)
elif nlp.path is None or not (nlp.path / 'ner').exists():
return None
else:
return EntityRecognizer.load(nlp.path / 'ner', nlp.vocab)
return EntityRecognizer.load(nlp.path / 'ner', nlp.vocab, **cfg)
@classmethod
def create_matcher(cls, nlp=None):
@@ -216,14 +216,14 @@ class Language(object):
# preprocess training data here before ArcEager.get_labels() is called
gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)
parser_cfg['labels'] = ArcEager.get_labels(gold_tuples)
entity_cfg['labels'] = BiluoPushDown.get_labels(gold_tuples)
parser_cfg['actions'] = ArcEager.get_actions(gold_parses=gold_tuples)
entity_cfg['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples)
with (dep_model_dir / 'config.json').open('wb') as file_:
with (dep_model_dir / 'config.json').open('w') as file_:
json.dump(parser_cfg, file_)
with (ner_model_dir / 'config.json').open('wb') as file_:
with (ner_model_dir / 'config.json').open('w') as file_:
json.dump(entity_cfg, file_)
with (pos_model_dir / 'config.json').open('wb') as file_:
with (pos_model_dir / 'config.json').open('w') as file_:
json.dump(tagger_cfg, file_)
self = cls(
@@ -238,15 +238,12 @@ class Language(object):
vectors=False,
pipeline=False)
self.defaults.parser_labels = parser_cfg['labels']
self.defaults.entity_labels = entity_cfg['labels']
self.vocab = self.defaults.Vocab()
self.tokenizer = self.defaults.Tokenizer(self.vocab)
self.tagger = self.defaults.Tagger(self.vocab, **tagger_cfg)
self.parser = self.defaults.Parser(self.vocab, **parser_cfg)
self.entity = self.defaults.Entity(self.vocab, **entity_cfg)
self.pipeline = self.defaults.Pipeline(self)
self.vocab = self.Defaults.create_vocab(self)
self.tokenizer = self.Defaults.create_tokenizer(self)
self.tagger = self.Defaults.create_tagger(self)
self.parser = self.Defaults.create_parser(self)
self.entity = self.Defaults.create_entity(self)
self.pipeline = self.Defaults.create_pipeline(self)
yield Trainer(self, gold_tuples)
self.end_training()
@@ -267,7 +264,7 @@ class Language(object):
add_vectors = self.Defaults.add_vectors(self) \
if 'add_vectors' not in overrides \
else overrides['add_vectors']
if add_vectors:
if self.vocab and add_vectors:
add_vectors(self.vocab)
self.tokenizer = self.Defaults.create_tokenizer(self) \
if 'tokenizer' not in overrides \
@@ -387,7 +384,7 @@ class Language(object):
else:
entity_iob_freqs = []
entity_type_freqs = []
with (path / 'vocab' / 'serializer.json').open('wb') as file_:
with (path / 'vocab' / 'serializer.json').open('w') as file_:
file_.write(
json.dumps([
(TAG, tagger_freqs),

View File

@@ -87,7 +87,7 @@ class Scorer(object):
gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot]))
for id_, word, tag, head, dep, ner in gold.orig_annot:
gold_tags.add((id_, tag))
if dep.lower() not in punct_labels:
if dep is not None and dep.lower() not in punct_labels:
gold_deps.add((id_, head, dep.lower()))
cand_deps = set()
cand_tags = set()

View File

@@ -439,7 +439,7 @@ cdef class ArcEager(TransitionSystem):
if move_costs[move] == -1:
move_costs[move] = move_cost_funcs[move](stcls, &gold.c)
costs[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label)
n_gold += costs[i] == 0
n_gold += costs[i] <= 0
else:
is_valid[i] = False
costs[i] = 9000
@@ -456,8 +456,14 @@ cdef class ArcEager(TransitionSystem):
"before training and after parsing. Either pass make_projective=True "
"to the GoldParse class, or use PseudoProjectivity.preprocess_training_data")
else:
print(gold.words)
print(gold.heads)
print(gold.labels)
raise ValueError(
"Could not find a gold-standard action to supervise the dependency "
"parser.\n"
"The GoldParse was projective.")
"The GoldParse was projective.\n"
"The transition system has %d actions.\n"
"State at failure:\n"
"%s" % (self.n_moves, stcls.print_state(gold.words)))
assert n_gold >= 1

View File

@@ -1,13 +1,16 @@
from spacy.parts_of_speech cimport NOUN, PROPN, PRON
def english_noun_chunks(doc):
def english_noun_chunks(obj):
'''Detect base noun phrases from a dependency parse.
Works on both Doc and Span.'''
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
'attr', 'ROOT', 'root']
doc = obj.doc # Ensure works on both Doc and Span.
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings['conj']
np_label = doc.vocab.strings['NP']
for i, word in enumerate(doc):
for i, word in enumerate(obj):
if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
yield word.left_edge.i, word.i+1, np_label
elif word.pos == NOUN and word.dep == conj:
@@ -25,14 +28,15 @@ def english_noun_chunks(doc):
# extended to the right of the NOUN
# example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
# just "eine Tasse", same for "das Thema Familie"
def german_noun_chunks(doc):
def german_noun_chunks(obj):
labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'ROOT', 'root', 'cj', 'pd', 'og', 'app']
doc = obj.doc # Ensure works on both Doc and Span.
np_label = doc.vocab.strings['NP']
np_deps = set(doc.vocab.strings[label] for label in labels)
close_app = doc.vocab.strings['nk']
rbracket = 0
for i, word in enumerate(doc):
for i, word in enumerate(obj):
if i < rbracket:
continue
if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:

View File

@@ -65,7 +65,7 @@ cdef class BiluoPushDown(TransitionSystem):
for action in (BEGIN, IN, LAST, UNIT):
actions[action][entity_type] = True
moves = ('M', 'B', 'I', 'L', 'U')
for raw_text, sents in kwargs.get('gold_tuples', []):
for raw_text, sents in kwargs.get('gold_parses', []):
for (ids, words, tags, heads, labels, biluo), _ in sents:
for i, ner_tag in enumerate(biluo):
if ner_tag != 'O' and ner_tag != '-':

View File

@@ -1,3 +1,4 @@
from __future__ import unicode_literals
from copy import copy
from ..tokens.doc cimport Doc

View File

@@ -76,7 +76,7 @@ cdef class ParserModel(AveragedPerceptron):
cdef class Parser:
"""Base class of the DependencyParser and EntityRecognizer."""
@classmethod
def load(cls, path, Vocab vocab, TransitionSystem=None, require=False):
def load(cls, path, Vocab vocab, TransitionSystem=None, require=False, **cfg):
"""Load the statistical model from the supplied path.
Arguments:
@@ -92,7 +92,7 @@ cdef class Parser:
with (path / 'config.json').open() as file_:
cfg = json.load(file_)
# TODO: remove this shim when we don't have to support older data
if 'labels' in cfg:
if 'labels' in cfg and 'actions' not in cfg:
cfg['actions'] = cfg.pop('labels')
self = cls(vocab, TransitionSystem=TransitionSystem, model=None, **cfg)
if (path / 'model').exists():
@@ -266,7 +266,7 @@ cdef class Parser:
loss += eg.costs[eg.guess]
eg.fill_scores(0, eg.nr_class)
eg.fill_costs(0, eg.nr_class)
eg.fill_is_valid(0, eg.nr_class)
eg.fill_is_valid(1, eg.nr_class)
return loss
def step_through(self, Doc doc):

View File

@@ -0,0 +1,32 @@
from ...attrs import *
def test_key_no_value():
int_attrs = intify_attrs({"ORTH": "dog"})
assert int_attrs == {ORTH: "dog"}
def test_lower_key():
int_attrs = intify_attrs({"norm": "dog"})
assert int_attrs == {NORM: "dog"}
def test_lower_key_value():
vals = {'dog': 10}
int_attrs = intify_attrs({"lemma": "dog"}, strings_map=vals)
assert int_attrs == {LEMMA: 10}
def test_idempotence():
vals = {'dog': 10}
int_attrs = intify_attrs({"lemma": "dog", 'is_alpha': True}, strings_map=vals)
int_attrs = intify_attrs(int_attrs)
assert int_attrs == {LEMMA: 10, IS_ALPHA: True}
def test_do_deprecated():
vals = {'dog': 10}
int_attrs = intify_attrs({"F": "dog", 'is_alpha': True}, strings_map=vals,
_do_deprecated=True)
assert int_attrs == {ORTH: 10, IS_ALPHA: True}

View File

@@ -0,0 +1,48 @@
from __future__ import unicode_literals
import pytest
import re
from ...vocab import Vocab
from ...tokenizer import Tokenizer
@pytest.fixture
def vocab():
return Vocab(tag_map={'NN': {'pos': 'NOUN'}})
@pytest.fixture
def rules():
return {}
@pytest.fixture
def prefix_search():
return None
@pytest.fixture
def suffix_search():
return None
@pytest.fixture
def infix_finditer():
return None
@pytest.fixture
def tokenizer(vocab, rules, prefix_search, suffix_search, infix_finditer):
return Tokenizer(vocab, rules, prefix_search, suffix_search, infix_finditer)
def test_add_special_case(tokenizer):
tokenizer.add_special_case('dog', [{'orth': 'd'}, {'orth': 'og'}])
doc = tokenizer('dog')
assert doc[0].text == 'd'
assert doc[1].text == 'og'
def test_special_case_tag(tokenizer):
tokenizer.add_special_case('dog', [{'orth': 'd', 'tag': 'NN'}, {'orth': 'og'}])
doc = tokenizer('dog')
assert doc[0].text == 'd'
assert doc[0].tag_ == 'NN'
assert doc[0].pos_ == 'NOUN'
assert doc[1].text == 'og'

View File

@@ -223,6 +223,10 @@ cdef class Doc:
def __repr__(self):
return self.__str__()
@property
def doc(self):
return self
def similarity(self, other):
'''Make a semantic similarity estimate. The default estimate is cosine
similarity using an average of word vectors.
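As a rough sketch of the default estimate this docstring describes (an illustration only, not the actual implementation; it assumes two token sequences whose tokens expose a .vector attribute):

import numpy

def average_vector_similarity(doc1, doc2):
    # Average the word vectors of each sequence, then take the cosine.
    v1 = numpy.mean([t.vector for t in doc1], axis=0)
    v2 = numpy.mean([t.vector for t in doc2], axis=0)
    return numpy.dot(v1, v2) / (numpy.linalg.norm(v1) * numpy.linalg.norm(v2))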

View File

@@ -190,6 +190,31 @@ cdef class Span:
def __get__(self):
return u''.join([t.text_with_ws for t in self])
property noun_chunks:
'''
Yields base noun-phrase #[code Span] objects, if the document
has been syntactically parsed. A base noun phrase, or
'NP chunk', is a noun phrase that does not permit other NPs to
be nested within it, so no NP-level coordination, no prepositional
phrases, and no relative clauses.
'''
def __get__(self):
if not self.doc.is_parsed:
raise ValueError(
"noun_chunks requires the dependency parse, which "
"requires data to be installed. If you haven't done so, run: "
"\npython -m spacy.%s.download all\n"
"to install the data" % self.vocab.lang)
# Accumulate the result before beginning to iterate over it. This prevents
# the tokenisation from being changed out from under us during the iteration.
# The tricky thing here is that Span accepts its tokenisation changing,
# so it's okay once we have the Span objects. See Issue #375
spans = []
for start, end, label in self.doc.noun_chunks_iterator(self):
spans.append(Span(self, start, end, label=label))
for span in spans:
yield span
property root:
"""The token within the span that's highest in the parse tree. If there's a tie, the earlist is prefered.

View File

@@ -1,8 +1,11 @@
from numpy cimport ndarray
from ..vocab cimport Vocab
from ..structs cimport TokenC
from ..attrs cimport attr_id_t
from ..attrs cimport *
from ..typedefs cimport attr_t, flags_t
from ..parts_of_speech cimport univ_pos_t
from .doc cimport Doc
from ..lexeme cimport Lexeme
cdef class Token:
@@ -22,4 +25,51 @@ cdef class Token:
doc._py_tokens[offset] = self
return self
#cdef inline TokenC struct_from_attrs(Vocab vocab, attrs):
# cdef TokenC token
# attrs = normalize_attrs(attrs)
cpdef bint check_flag(self, attr_id_t flag_id) except -1
@staticmethod
cdef inline attr_t get_struct_attr(const TokenC* token, attr_id_t feat_name) nogil:
if feat_name < (sizeof(flags_t) * 8):
return Lexeme.c_check_flag(token.lex, feat_name)
elif feat_name == LEMMA:
return token.lemma
elif feat_name == POS:
return token.pos
elif feat_name == TAG:
return token.tag
elif feat_name == DEP:
return token.dep
elif feat_name == HEAD:
return token.head
elif feat_name == SPACY:
return token.spacy
elif feat_name == ENT_IOB:
return token.ent_iob
elif feat_name == ENT_TYPE:
return token.ent_type
else:
return Lexeme.get_struct_attr(token.lex, feat_name)
@staticmethod
cdef inline attr_t set_struct_attr(TokenC* token, attr_id_t feat_name,
attr_t value) nogil:
if feat_name == LEMMA:
token.lemma = value
elif feat_name == POS:
token.pos = <univ_pos_t>value
elif feat_name == TAG:
token.tag = value
elif feat_name == DEP:
token.dep = value
elif feat_name == HEAD:
token.head = value
elif feat_name == SPACY:
token.spacy = value
elif feat_name == ENT_IOB:
token.ent_iob = value
elif feat_name == ENT_TYPE:
token.ent_type = value

View File

@@ -14,22 +14,31 @@ class Trainer(object):
self.gold_tuples = gold_tuples
def epochs(self, nr_epoch, augment_data=None, gold_preproc=False):
def _epoch():
for raw_text, paragraph_tuples in self.gold_tuples:
cached_golds = {}
def _epoch(indices):
for i in indices:
raw_text, paragraph_tuples = self.gold_tuples[i]
if gold_preproc:
raw_text = None
else:
paragraph_tuples = merge_sents(paragraph_tuples)
if augment_data is not None:
if augment_data is None:
docs = self.make_docs(raw_text, paragraph_tuples)
if i in cached_golds:
golds = cached_golds[i]
else:
golds = self.make_golds(docs, paragraph_tuples)
else:
raw_text, paragraph_tuples = augment_data(raw_text, paragraph_tuples)
docs = self.make_docs(raw_text, paragraph_tuples)
golds = self.make_golds(docs, paragraph_tuples)
docs = self.make_docs(raw_text, paragraph_tuples)
golds = self.make_golds(docs, paragraph_tuples)
for doc, gold in zip(docs, golds):
yield doc, gold
indices = list(range(len(self.gold_tuples)))
for itn in range(nr_epoch):
random.shuffle(self.gold_tuples)
yield _epoch()
random.shuffle(indices)
yield _epoch(indices)
def update(self, doc, gold):
for process in self.nlp.pipeline:
@@ -62,8 +71,8 @@ class Trainer(object):
def make_golds(self, docs, paragraph_tuples):
if len(docs) == 1:
return [GoldParse(docs[0], sent_tuples[0])
return [GoldParse.from_annot_tuples(docs[0], sent_tuples[0])
for sent_tuples in paragraph_tuples]
else:
return [GoldParse(doc, sent_tuples[0])
return [GoldParse.from_annot_tuples(doc, sent_tuples[0])
for doc, sent_tuples in zip(docs, paragraph_tuples)]

View File

@@ -20,6 +20,8 @@ from .orth cimport word_shape
from .typedefs cimport attr_t
from .cfile cimport CFile
from .lemmatizer import Lemmatizer
from .attrs import intify_attrs
from .tokens.token cimport Token
from . import attrs
from . import symbols
@@ -336,16 +338,14 @@ cdef class Vocab:
cdef int i
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
for i, props in enumerate(substrings):
props = intify_attrs(props, strings_map=self.strings, _do_deprecated=True)
token = &tokens[i]
# Set the special tokens up to have morphology and lemmas if
# specified, otherwise use the part-of-speech tag (if specified)
token.lex = <LexemeC*>self.get(self.mem, props['F'])
if 'pos' in props:
self.morphology.assign_tag(token, props['pos'])
if 'L' in props:
tokens[i].lemma = self.strings[props['L']]
for feature, value in props.get('morph', {}).items():
self.morphology.assign_feature(&token.morph, feature, value)
# Set the special tokens up to have arbitrary attributes
token.lex = <LexemeC*>self.get_by_orth(self.mem, props[attrs.ORTH])
if attrs.TAG in props:
self.morphology.assign_tag(token, props[attrs.TAG])
for attr_id, value in props.items():
Token.set_struct_attr(token, attr_id, value)
return tokens
def dump(self, loc):

View File

@@ -21,7 +21,8 @@
"SOCIAL": {
"twitter": "spacy_io",
"github": "explosion",
"reddit": "spacynlp"
"reddit": "spacynlp",
"codepen": "explosion"
},
"NAVIGATION": {

View File

@@ -90,6 +90,19 @@ mixin code(label, language)
block
//- CodePen embed
slug - [string] ID of CodePen demo (taken from URL)
height - [integer] height of demo embed iframe
default_tab - [string] code tab(s) visible on load (default: "result")
mixin codepen(slug, height, default_tab)
figure.o-block(style="min-height: #{height}px")&attributes(attributes)
.codepen(data-height=height data-theme-id="26467" data-slug-hash=slug data-default-tab=(default_tab || "result") data-embed-version="2" data-user=SOCIAL.codepen)
+a("https://codepen.io/" + SOCIAL.codepen + "/" + slug) View on CodePen
script(async src="https://assets.codepen.io/assets/embed/ei.js")
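//- Usage sketch (as on the entity recognition page added in this commit):
//-     +codepen("ALxpQO", 450)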
//- Images / figures
url - [string] url or path to image
width - [integer] image width in px, for better rendering (default: 500)

View File

@@ -71,7 +71,7 @@ p
+h(2, "named-entities") Named Entity Recognition
+table(["Entity Type", "Description"])
+table([ "Type", "Description" ])
+row
+cell #[code PERSON]
+cell People, including fictional.
@@ -81,8 +81,8 @@
+cell Nationalities or religious or political groups.
+row
+cell #[code FAC]
+cell Facilities, such as buildings, airports, highways, bridges, etc.
+cell #[code FACILITY]
+cell Buildings, airports, highways, bridges, etc.
+row
+cell #[code ORG]
@@ -98,7 +98,7 @@
+row
+cell #[code PRODUCT]
+cell Vehicles, weapons, foods, etc. (Not services)
+cell Objects, vehicles, foods, etc. (Not services.)
+row
+cell #[code EVENT]
@@ -108,41 +108,37 @@
+cell #[code WORK_OF_ART]
+cell Titles of books, songs, etc.
+row
+cell #[code LAW]
+cell Named documents made into laws
+row
+cell #[code LANGUAGE]
+cell Any named language
+cell Any named language.
p The following values are also annotated in a style similar to names:
+table(["Entity Type", "Description"])
+table([ "Type", "Description" ])
+row
+cell #[code DATE]
+cell Absolute or relative dates or periods
+cell Absolute or relative dates or periods.
+row
+cell #[code TIME]
+cell Times smaller than a day
+cell Times smaller than a day.
+row
+cell #[code PERCENT]
+cell Percentage (including “%”)
+cell Percentage, including "%".
+row
+cell #[code MONEY]
+cell Monetary values, including unit
+cell Monetary values, including unit.
+row
+cell #[code QUANTITY]
+cell Measurements, as of weight or distance
+cell Measurements, as of weight or distance.
+row
+cell #[code ORDINAL]
+cell "first", "second"
+cell "first", "second", etc.
+row
+cell #[code CARDINAL]
+cell Numerals that do not fall under another type
+cell Numerals that do not fall under another type.

View File

@@ -9,6 +9,7 @@
"Processing text": "processing-text",
"spaCy's data model": "data-model",
"Using the parse": "dependency-parse",
"Entity recognition": "entity-recognition",
"Custom pipelines": "customizing-pipeline",
"Rule-based matching": "rule-based-matching",
"Word vectors": "word-vectors-similarities",
@@ -51,7 +52,13 @@
},
"dependency-parse": {
"title": "Using the dependency parse"
"title": "Using the dependency parse",
"next": "entity-recognition"
},
"entity-recognition": {
"title": "Entity recognition",
"next": "rule-based-matching"
},
"rule-based-matching": {
@@ -232,6 +239,12 @@
},
"deep_dives": {
"Modern NLP in Python What you can learn about food by analyzing a million Yelp reviews": {
"url": "http://nbviewer.jupyter.org/github/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb",
"author": "Patrick Harrison (S&P Global)",
"tags": [ "jupyter", "gensim" ]
},
"Deep Learning with custom pipelines and Keras": {
"url": "https://explosion.ai/blog/spacy-deep-learning-keras",
"author": "Matthew Honnibal",

View File

@@ -0,0 +1,290 @@
//- 💫 DOCS > USAGE > NAMED ENTITY RECOGNITION
include ../../_includes/_mixins
p
| spaCy features an extremely fast statistical entity recognition system
| that assigns labels to contiguous spans of tokens. The default model
| identifies a variety of named and numeric entities, including companies,
| locations, organizations and products. You can add arbitrary classes to
| the entity recognition system, and update the model with new examples.
+aside-code("Example").
import spacy
nlp = spacy.load('en')
doc = nlp(u'London is a big city in the United Kingdom.')
for ent in doc.ents:
print(ent.label_, ent.text)
# GPE London
# GPE United Kingdom
p
| The standard way to access entity annotations is the
| #[+api("doc#ents") #[code doc.ents]] property, which produces a sequence
| of #[+api("span") #[code Span]] objects. The entity type is accessible
| either as an integer ID or as a string, using the attributes
| #[code ent.label] and #[code ent.label_]. The #[code Span] object acts
| as a sequence of tokens, so you can iterate over the entity or index into
| it. You can also get the text form of the whole entity, as though it were
| a single token. See the #[+api("span") API reference] for more details.
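p
| For instance, this short sketch (reusing the example document from
| above) indexes into an entity and reads the annotations just described:
+code("Example").
doc = nlp(u'London is a big city in the United Kingdom.')
ent = doc.ents[0]
assert ent.text == u'London'     # text of the whole entity
assert ent.label_ == u'GPE'      # string form of the entity type
assert ent[0].text == u'London'  # a Span indexes like a sequence of tokens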
p
| You can access token entity annotations using the #[code token.ent_iob]
| and #[code token.ent_type] attributes. The #[code token.ent_iob]
| attribute indicates whether an entity starts, continues or ends on the
| token (In, Begin, Out).
+code("Example").
doc = nlp(u'London is a big city in the United Kingdom.')
print(doc[0].text, doc[0].ent_iob, doc[0].ent_type_)
# (u'London', 2, u'GPE')
print(doc[1].text, doc[1].ent_iob, doc[1].ent_type_)
# (u'is', 3, u'')
+h(2, "setting") Setting entity annotations
p
| To ensure that the sequence of token annotations remains consistent, you
| have to set entity annotations at the document level — you can't write
| directly to the #[code token.ent_iob] or #[code token.ent_type]
| attributes. The easiest way to set entities is to assign to the
| #[code doc.ents] attribute.
+code("Example").
from spacy.tokens import Span
doc = nlp(u'London is a big city in the United Kingdom.')
doc.ents = []
assert doc[0].ent_type_ == ''
doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings[u'GPE'])]
assert doc[0].ent_type_ == 'GPE'
doc.ents = []
doc.ents = [(u'LondonCity', 0, 1, u'GPE')]
p
| The value you assign should be a sequence, the values of which
| can either be #[code Span] objects, or #[code (ent_id, ent_type, start, end)]
| tuples, where #[code start] and #[code end] are token offsets that
| describe the slice of the document that should be annotated.
p
| You can also assign entity annotations using the #[code doc.from_array()]
| method. To do this, you should include both the #[code ENT_TYPE] and the
| #[code ENT_IOB] attributes in the array you're importing from.
+code("Example").
from spacy.attrs import ENT_IOB, ENT_TYPE
import numpy
doc = nlp.make_doc(u'London is a big city in the United Kingdom.')
assert list(doc.ents) == []
header = [ENT_IOB, ENT_TYPE]
attr_array = numpy.zeros((len(doc), len(header)))
attr_array[0, 0] = 2 # B
attr_array[0, 1] = doc.vocab.strings[u'GPE']
doc.from_array(header, attr_array)
assert list(doc.ents)[0].text == u'London'
p
| Finally, you can always write to the underlying struct, if you compile
| a Cython function. This is easy to do, and allows you to write efficient
| native code.
+code("Example").
# cython: infer_types=True
from spacy.tokens.doc cimport Doc
cpdef set_entity(Doc doc, int start, int end, int ent_type):
for i in range(start, end):
doc.c[i].ent_type = ent_type
doc.c[start].ent_iob = 3
for i in range(start+1, end):
doc.c[i].ent_iob = 2
p
| Obviously, if you write directly to the array of #[code TokenC*] structs,
| you'll have responsibility for ensuring that the data is left in a
| consistent state.
+h(2, "displacy") The displaCy #[sup ENT] visualizer
p
| The #[+a(DEMOS_URL + "/displacy-ent/") displaCy #[sup ENT] visualizer]
| lets you explore an entity recognition model's behaviour interactively.
| If you're training a model, it's very useful to run the visualization
| server yourself. To help you do that, we've open-sourced both the
| #[+a(gh("spacy-services")) back-end service] and the
| #[+a(gh("displacy-ent")) front-end client].
+codepen("ALxpQO", 450)
+h(2, "entity-types") Built-in entity types
+h(3, "entity-types-named") Named types
+table([ "Type", "Description" ])
+row
+cell #[code PERSON]
+cell People, including fictional.
+row
+cell #[code NORP]
+cell Nationalities or religious or political groups.
+row
+cell #[code FACILITY]
+cell Buildings, airports, highways, bridges, etc.
+row
+cell #[code ORG]
+cell Companies, agencies, institutions, etc.
+row
+cell #[code GPE]
+cell Countries, cities, states.
+row
+cell #[code LOC]
+cell Non-GPE locations, mountain ranges, bodies of water.
+row
+cell #[code PRODUCT]
+cell Objects, vehicles, foods, etc. (Not services.)
+row
+cell #[code EVENT]
+cell Named hurricanes, battles, wars, sports events, etc.
+row
+cell #[code WORK_OF_ART]
+cell Titles of books, songs, etc.
+row
+cell #[code LANGUAGE]
+cell Any named language.
+h(3, "entity-types-numeric") Numeric types
+table([ "Type", "Description" ])
+row
+cell #[code DATE]
+cell Absolute or relative dates or periods.
+row
+cell #[code TIME]
+cell Times smaller than a day.
+row
+cell #[code PERCENT]
+cell Percentage, including "%".
+row
+cell #[code MONEY]
+cell Monetary values, including unit.
+row
+cell #[code QUANTITY]
+cell Measurements, as of weight or distance.
+row
+cell #[code ORDINAL]
+cell "first", "second", etc.
+row
+cell #[code CARDINAL]
+cell Numerals that do not fall under another type.
+aside("Install")
| The #[+api("load") spacy.load()] function configures a pipeline that
| includes all of the available annotators for the given ID. In the example
| above, the #[code 'en'] ID tells spaCy to load the default English
| pipeline. If you have installed the data with
| #[code python -m spacy.en.download] this will include the entity
| recognition model.
+h(2, "updating") Training and updating
p
| To provide training examples to the entity recogniser, you'll first need
| to create an instance of the #[code GoldParse] class. You can specify
| your annotations in a stand-off format or as token tags.
+code.
import random
import spacy
from spacy.gold import GoldParse
from spacy.pipeline import EntityRecognizer

train_data = [
    ('Who is Chaka Khan?', [(7, 17, 'PERSON')]),
    ('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])
]
nlp = spacy.load('en', entity=False, parser=False)
ner = EntityRecognizer(nlp.vocab, entity_types=['PERSON', 'LOC'])
for itn in range(5):
    random.shuffle(train_data)
    for raw_text, entity_offsets in train_data:
        doc = nlp.make_doc(raw_text)
        gold = GoldParse(doc, entities=entity_offsets)
        nlp.tagger(doc)
        ner.update(doc, gold)
ner.model.end_training()
p
| If a character offset in your entity annotations doesn't fall on a token
| boundary, the #[code GoldParse] class will treat that annotation as a
| missing value. This allows for more realistic training, because the
| entity recogniser is allowed to learn from examples that may feature
| tokenizer errors.
+aside-code("Example").
doc = Doc(nlp.vocab, words=[u'rats', u'make', u'good', u'pets'])
gold = GoldParse(doc, entities=[u'U-ANIMAL', u'O', u'O', u'O'])
ner = EntityRecognizer(nlp.vocab, entity_types=['ANIMAL'])
ner.update(doc, gold)
p
| You can also provide token-level entity annotation, using the
| following tagging scheme to describe the entity boundaries (a short
| example follows the table):
+table([ "Tag", "Description" ])
+row
+cell #[code #[span.u-color-theme B] EGIN]
+cell The first token of a multi-token entity.
+row
+cell #[code #[span.u-color-theme I] N]
+cell An inner token of a multi-token entity.
+row
+cell #[code #[span.u-color-theme L] AST]
+cell The final token of a multi-token entity.
+row
+cell #[code #[span.u-color-theme U] NIT]
+cell A single-token entity.
+row
+cell #[code #[span.u-color-theme O] UT]
+cell A non-entity token.
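p
| For example, given a hypothetical document tokenized as
| #[code [u'Who', u'is', u'Chaka', u'Khan', u'?']], the two-token name is
| marked with #[code B] and #[code L] tags and everything else is out:
+code("Example").
entities = [u'O', u'O', u'B-PERSON', u'L-PERSON', u'O']
gold = GoldParse(doc, entities=entities)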
+aside("Why BILUO, not IOB?")
| There are several coding schemes for encoding entity annotations as
| token tags. These coding schemes are equally expressive, but not
| necessarily equally learnable.
| #[+a("http://www.aclweb.org/anthology/W09-1119") Ratinov and Roth]
| showed that the minimal #[strong Begin], #[strong In], #[strong Out]
| scheme was more difficult to learn than the #[strong BILUO] scheme that
| we use, which explicitly marks boundary tokens.
p
| spaCy translates the character offsets into this scheme, in order to
| decide the cost of each action given the current state of the entity
| recogniser. The costs are then used to calculate the gradient of the
| loss, to train the model. The exact algorithm is a pastiche of
| well-known methods, and is not currently described in any single
| publication. The model is a greedy transition-based parser guided by a
| linear model whose weights are learned using the averaged perceptron
| loss, via the #[+a("http://www.aclweb.org/anthology/C12-1059") dynamic oracle]
| imitation learning strategy. The transition system is equivalent to the
| BILUO tagging scheme.