Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-26 17:24:41 +03:00)

Commit 88869e0e07: Merge github.com:explosion/spaCy into dutch

.gitignore (vendored): 3 changes
@@ -93,6 +93,9 @@ coverage.xml

# Mac OS X
*.DS_Store

# Temporary files / Dropbox hack
*.~*

# Komodo project files
*.komodoproject
@@ -14,9 +14,11 @@ This is a list of everyone who has made significant contributions to spaCy, in a
* Kendrick Tan, [@kendricktan](https://github.com/kendricktan)
* Kyle P. Johnson, [@kylepjohnson](https://github.com/kylepjohnson)
* Liling Tan, [@alvations](https://github.com/alvations)
* Mark Amery, [@ExplodingCabbage](https://github.com/ExplodingCabbage)
* Matthew Honnibal, [@honnibal](https://github.com/honnibal)
* Maxim Samsonov, [@maxirmx](https://github.com/maxirmx)
* Oleg Zd, [@olegzd](https://github.com/olegzd)
* Pokey Rule, [@pokey](https://github.com/pokey)
* Sam Bozek, [@sambozek](https://github.com/sambozek)
* Sasho Savkov, [@savkov](https://github.com/savkov)
* Tiago Rodrigues, [@TiagoMRodrigues](https://github.com/TiagoMRodrigues)
@@ -100,7 +100,7 @@ def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False
                nlp.entity(tokens)
            else:
                tokens = nlp(raw_text)
            gold = GoldParse(tokens, annot_tuples)
            gold = GoldParse.from_annot_tuples(tokens, annot_tuples)
            scorer.score(tokens, gold, verbose=verbose)
    return scorer
@ -1,3 +1,4 @@
|
|||
from __future__ import unicode_literals
|
||||
import plac
|
||||
import json
|
||||
from os import path
|
||||
|
@ -5,106 +6,25 @@ import shutil
|
|||
import os
|
||||
import random
|
||||
import io
|
||||
import pathlib
|
||||
|
||||
from spacy.syntax.util import Config
|
||||
from spacy.tokens import Doc
|
||||
from spacy.syntax.nonproj import PseudoProjectivity
|
||||
from spacy.language import Language
|
||||
from spacy.gold import GoldParse
|
||||
from spacy.tokenizer import Tokenizer
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.tagger import Tagger
|
||||
from spacy.syntax.parser import Parser
|
||||
from spacy.syntax.arc_eager import ArcEager
|
||||
from spacy.pipeline import DependencyParser
|
||||
from spacy.syntax.parser import get_templates
|
||||
from spacy.syntax.arc_eager import ArcEager
|
||||
from spacy.scorer import Scorer
|
||||
import spacy.attrs
|
||||
|
||||
from spacy.language import Language
|
||||
|
||||
from spacy.tagger import W_orth
|
||||
|
||||
TAGGER_TEMPLATES = (
|
||||
(W_orth,),
|
||||
)
|
||||
|
||||
try:
|
||||
from codecs import open
|
||||
except ImportError:
|
||||
pass
|
||||
import io
|
||||
|
||||
|
||||
class TreebankParser(object):
|
||||
@staticmethod
|
||||
def setup_model_dir(model_dir, labels, templates, feat_set='basic', seed=0):
|
||||
dep_model_dir = path.join(model_dir, 'deps')
|
||||
pos_model_dir = path.join(model_dir, 'pos')
|
||||
if path.exists(dep_model_dir):
|
||||
shutil.rmtree(dep_model_dir)
|
||||
if path.exists(pos_model_dir):
|
||||
shutil.rmtree(pos_model_dir)
|
||||
os.mkdir(dep_model_dir)
|
||||
os.mkdir(pos_model_dir)
|
||||
|
||||
Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
|
||||
labels=labels)
|
||||
|
||||
@classmethod
|
||||
def from_dir(cls, tag_map, model_dir):
|
||||
vocab = Vocab(tag_map=tag_map, get_lex_attr=Language.default_lex_attrs())
|
||||
vocab.get_lex_attr[spacy.attrs.LANG] = lambda _: 0
|
||||
tokenizer = Tokenizer(vocab, {}, None, None, None)
|
||||
tagger = Tagger.blank(vocab, TAGGER_TEMPLATES)
|
||||
|
||||
cfg = Config.read(path.join(model_dir, 'deps'), 'config')
|
||||
parser = Parser.from_dir(path.join(model_dir, 'deps'), vocab.strings, ArcEager)
|
||||
return cls(vocab, tokenizer, tagger, parser)
|
||||
|
||||
def __init__(self, vocab, tokenizer, tagger, parser):
|
||||
self.vocab = vocab
|
||||
self.tokenizer = tokenizer
|
||||
self.tagger = tagger
|
||||
self.parser = parser
|
||||
|
||||
def train(self, words, tags, heads, deps):
|
||||
tokens = self.tokenizer.tokens_from_list(list(words))
|
||||
self.tagger.train(tokens, tags)
|
||||
|
||||
tokens = self.tokenizer.tokens_from_list(list(words))
|
||||
ids = range(len(words))
|
||||
ner = ['O'] * len(words)
|
||||
gold = GoldParse(tokens, ((ids, words, tags, heads, deps, ner)),
|
||||
make_projective=False)
|
||||
self.tagger(tokens)
|
||||
if gold.is_projective:
|
||||
try:
|
||||
self.parser.train(tokens, gold)
|
||||
except:
|
||||
for id_, word, head, dep in zip(ids, words, heads, deps):
|
||||
print(id_, word, head, dep)
|
||||
raise
|
||||
|
||||
def __call__(self, words, tags=None):
|
||||
tokens = self.tokenizer.tokens_from_list(list(words))
|
||||
if tags is None:
|
||||
self.tagger(tokens)
|
||||
else:
|
||||
self.tagger.tag_from_strings(tokens, tags)
|
||||
self.parser(tokens)
|
||||
return tokens
|
||||
|
||||
def end_training(self, data_dir):
|
||||
self.parser.model.end_training()
|
||||
self.parser.model.dump(path.join(data_dir, 'deps', 'model'))
|
||||
self.tagger.model.end_training()
|
||||
self.tagger.model.dump(path.join(data_dir, 'pos', 'model'))
|
||||
strings_loc = path.join(data_dir, 'vocab', 'strings.json')
|
||||
with io.open(strings_loc, 'w', encoding='utf8') as file_:
|
||||
self.vocab.strings.dump(file_)
|
||||
self.vocab.dump(path.join(data_dir, 'vocab', 'lexemes.bin'))
|
||||
|
||||
|
||||
|
||||
|
||||
def read_conllx(loc):
|
||||
with open(loc, 'r', 'utf8') as file_:
|
||||
with io.open(loc, 'r', encoding='utf8') as file_:
|
||||
text = file_.read()
|
||||
for sent in text.strip().split('\n\n'):
|
||||
lines = sent.strip().split('\n')
|
||||
|
@ -113,24 +33,31 @@ def read_conllx(loc):
|
|||
lines.pop(0)
|
||||
tokens = []
|
||||
for line in lines:
|
||||
id_, word, lemma, pos, tag, morph, head, dep, _1, _2 = line.split()
|
||||
id_, word, lemma, tag, pos, morph, head, dep, _1, _2 = line.split()
|
||||
if '-' in id_:
|
||||
continue
|
||||
id_ = int(id_) - 1
|
||||
head = (int(head) - 1) if head != '0' else id_
|
||||
dep = 'ROOT' if dep == 'root' else dep
|
||||
tokens.append((id_, word, tag, head, dep, 'O'))
|
||||
tuples = zip(*tokens)
|
||||
yield (None, [(tuples, [])])
|
||||
try:
|
||||
id_ = int(id_) - 1
|
||||
head = (int(head) - 1) if head != '0' else id_
|
||||
dep = 'ROOT' if dep == 'root' else dep
|
||||
tokens.append((id_, word, tag, head, dep, 'O'))
|
||||
except:
|
||||
print(line)
|
||||
raise
|
||||
tuples = [list(t) for t in zip(*tokens)]
|
||||
yield (None, [[tuples, []]])
|
||||
|
||||
|
||||
def score_model(nlp, gold_docs, verbose=False):
|
||||
def score_model(vocab, tagger, parser, gold_docs, verbose=False):
|
||||
scorer = Scorer()
|
||||
for _, gold_doc in gold_docs:
|
||||
for annot_tuples, _ in gold_doc:
|
||||
tokens = nlp(list(annot_tuples[1]), tags=list(annot_tuples[2]))
|
||||
gold = GoldParse(tokens, annot_tuples)
|
||||
scorer.score(tokens, gold, verbose=verbose)
|
||||
for (ids, words, tags, heads, deps, entities), _ in gold_doc:
|
||||
doc = Doc(vocab, words=words)
|
||||
tagger(doc)
|
||||
parser(doc)
|
||||
PseudoProjectivity.deprojectivize(doc)
|
||||
gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
|
||||
scorer.score(doc, gold, verbose=verbose)
|
||||
return scorer
|
||||
|
||||
|
||||
|
@ -138,22 +65,45 @@ def main(train_loc, dev_loc, model_dir, tag_map_loc):
|
|||
with open(tag_map_loc) as file_:
|
||||
tag_map = json.loads(file_.read())
|
||||
train_sents = list(read_conllx(train_loc))
|
||||
labels = ArcEager.get_labels(train_sents)
|
||||
templates = get_templates('basic')
|
||||
train_sents = PseudoProjectivity.preprocess_training_data(train_sents)
|
||||
|
||||
TreebankParser.setup_model_dir(model_dir, labels, templates)
|
||||
actions = ArcEager.get_actions(gold_parses=train_sents)
|
||||
features = get_templates('basic')
|
||||
|
||||
nlp = TreebankParser.from_dir(tag_map, model_dir)
|
||||
model_dir = pathlib.Path(model_dir)
|
||||
with (model_dir / 'deps' / 'config.json').open('w') as file_:
|
||||
json.dump({'pseudoprojective': True, 'labels': actions, 'features': features}, file_)
|
||||
|
||||
vocab = Vocab(lex_attr_getters=Language.Defaults.lex_attr_getters, tag_map=tag_map)
|
||||
# Populate vocab
|
||||
for _, doc_sents in train_sents:
|
||||
for (ids, words, tags, heads, deps, ner), _ in doc_sents:
|
||||
for word in words:
|
||||
_ = vocab[word]
|
||||
for dep in deps:
|
||||
_ = vocab[dep]
|
||||
for tag in tags:
|
||||
_ = vocab[tag]
|
||||
for tag in tags:
|
||||
assert tag in tag_map, repr(tag)
|
||||
tagger = Tagger(vocab, tag_map=tag_map)
|
||||
parser = DependencyParser(vocab, actions=actions, features=features)
|
||||
|
||||
for itn in range(15):
|
||||
for _, doc_sents in train_sents:
|
||||
for (ids, words, tags, heads, deps, ner), _ in doc_sents:
|
||||
nlp.train(words, tags, heads, deps)
|
||||
doc = Doc(vocab, words=words)
|
||||
gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
|
||||
tagger(doc)
|
||||
parser.update(doc, gold)
|
||||
doc = Doc(vocab, words=words)
|
||||
tagger.update(doc, gold)
|
||||
random.shuffle(train_sents)
|
||||
scorer = score_model(nlp, read_conllx(dev_loc))
|
||||
scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
|
||||
print('%d:\t%.3f\t%.3f' % (itn, scorer.uas, scorer.tags_acc))
|
||||
nlp = Language(vocab=vocab, tagger=tagger, parser=parser)
|
||||
nlp.end_training(model_dir)
|
||||
scorer = score_model(nlp, read_conllx(dev_loc))
|
||||
scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
|
||||
print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
|
||||
|
||||
|
||||
|
|
|
@@ -86,5 +86,48 @@ IDS = {
    "LANG": LANG,
}


# ATTR IDs, in order of the symbol
NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]


def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
    '''Normalize a dictionary of attributes, converting them to ints.

    Arguments:
        stringy_attrs (dict):
            Dictionary keyed by attribute string names. Values can be ints or strings.

        strings_map (StringStore):
            Defaults to None. If provided, encodes string values into ints.

    Returns:
        inty_attrs (dict):
            Attributes dictionary with keys and optionally values converted to
            ints.
    '''
    inty_attrs = {}
    if _do_deprecated:
        if 'F' in stringy_attrs:
            stringy_attrs["ORTH"] = stringy_attrs.pop("F")
        if 'L' in stringy_attrs:
            stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
        if 'pos' in stringy_attrs:
            stringy_attrs["TAG"] = stringy_attrs.pop("pos")
        if 'morph' in stringy_attrs:
            morphs = stringy_attrs.pop('morph')
        if 'number' in stringy_attrs:
            stringy_attrs.pop('number')
        if 'tenspect' in stringy_attrs:
            stringy_attrs.pop('tenspect')
        # for name, value in morphs.items():
        #     stringy_attrs[name] = value
    for name, value in stringy_attrs.items():
        if isinstance(name, int):
            int_key = name
        else:
            int_key = IDS[name.upper()]
        if strings_map is not None and isinstance(value, basestring):
            value = strings_map[value]
        inty_attrs[int_key] = value
    return inty_attrs
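For quick orientation, the new intify_attrs() helper can be exercised roughly as in the unit tests added later in this commit (spacy/tests/unit/test_attrs.py); a minimal sketch, assuming the attribute IDs are imported from spacy.attrs:

# Sketch based on the tests in this commit; not an exhaustive API description.
from spacy.attrs import intify_attrs, ORTH, LEMMA

# String keys are upper-cased and mapped to integer attribute IDs.
assert intify_attrs({"ORTH": "dog"}) == {ORTH: "dog"}

# With a strings_map, string values are encoded to integers as well.
vals = {'dog': 10}
assert intify_attrs({"lemma": "dog"}, strings_map=vals) == {LEMMA: 10}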
@@ -19,6 +19,7 @@ cdef class GoldParse:

    cdef int length
    cdef readonly int loss
    cdef readonly list words
    cdef readonly list tags
    cdef readonly list heads
    cdef readonly list labels
@@ -19,6 +19,8 @@ def tags_to_entities(tags):
    entities = []
    start = None
    for i, tag in enumerate(tags):
        if tag is None:
            continue
        if tag.startswith('O'):
            # TODO: We shouldn't be getting these malformed inputs. Fix this.
            if start is not None:

@@ -249,7 +251,7 @@ cdef class GoldParse:
        if deps is None:
            deps = [None for _ in doc]
        if entities is None:
            entities = [None for _ in doc]
            entities = ['-' for _ in doc]
        elif len(entities) == 0:
            entities = ['O' for _ in doc]
        elif not isinstance(entities[0], basestring):

@@ -266,6 +268,7 @@ cdef class GoldParse:
        self.c.labels = <int*>self.mem.alloc(len(doc), sizeof(int))
        self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))

        self.words = [None] * len(doc)
        self.tags = [None] * len(doc)
        self.heads = [None] * len(doc)
        self.labels = [''] * len(doc)

@@ -279,6 +282,7 @@ cdef class GoldParse:

        for i, gold_i in enumerate(self.cand_to_gold):
            if doc[i].text.isspace():
                self.words[i] = doc[i].text
                self.tags[i] = 'SP'
                self.heads[i] = None
                self.labels[i] = None

@@ -286,6 +290,7 @@ cdef class GoldParse:
            if gold_i is None:
                pass
            else:
                self.words[i] = words[gold_i]
                self.tags[i] = tags[gold_i]
                self.heads[i] = self.gold_to_cand[heads[gold_i]]
                self.labels[i] = deps[gold_i]
|
@ -5,10 +5,7 @@ import pathlib
|
|||
from contextlib import contextmanager
|
||||
import shutil
|
||||
|
||||
try:
|
||||
import ujson as json
|
||||
except ImportError:
|
||||
import json
|
||||
import ujson as json
|
||||
|
||||
|
||||
try:
|
||||
|
@ -31,6 +28,8 @@ from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD, PROB, LANG, IS_STOP
|
|||
from .syntax.parser import get_templates
|
||||
from .syntax.nonproj import PseudoProjectivity
|
||||
from .pipeline import DependencyParser, EntityRecognizer
|
||||
from .syntax.arc_eager import ArcEager
|
||||
from .syntax.ner import BiluoPushDown
|
||||
|
||||
|
||||
class BaseDefaults(object):
|
||||
|
@ -80,7 +79,7 @@ class BaseDefaults(object):
|
|||
else:
|
||||
infix_finditer = None
|
||||
vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
|
||||
return Tokenizer(nlp.vocab, rules=rules,
|
||||
return Tokenizer(vocab, rules=rules,
|
||||
prefix_search=prefix_search, suffix_search=suffix_search,
|
||||
infix_finditer=infix_finditer)
|
||||
|
||||
|
@ -96,26 +95,27 @@ class BaseDefaults(object):
|
|||
return Tagger.load(nlp.path / 'pos', nlp.vocab)
|
||||
|
||||
@classmethod
|
||||
def create_parser(cls, nlp=None):
|
||||
def create_parser(cls, nlp=None, **cfg):
|
||||
if nlp is None:
|
||||
return DependencyParser(cls.create_vocab(), features=cls.parser_features)
|
||||
return DependencyParser(cls.create_vocab(), features=cls.parser_features,
|
||||
**cfg)
|
||||
elif nlp.path is False:
|
||||
return DependencyParser(nlp.vocab, features=cls.parser_features)
|
||||
return DependencyParser(nlp.vocab, features=cls.parser_features, **cfg)
|
||||
elif nlp.path is None or not (nlp.path / 'deps').exists():
|
||||
return None
|
||||
else:
|
||||
return DependencyParser.load(nlp.path / 'deps', nlp.vocab)
|
||||
return DependencyParser.load(nlp.path / 'deps', nlp.vocab, **cfg)
|
||||
|
||||
@classmethod
|
||||
def create_entity(cls, nlp=None):
|
||||
def create_entity(cls, nlp=None, **cfg):
|
||||
if nlp is None:
|
||||
return EntityRecognizer(cls.create_vocab(), features=cls.entity_features)
|
||||
return EntityRecognizer(cls.create_vocab(), features=cls.entity_features, **cfg)
|
||||
elif nlp.path is False:
|
||||
return EntityRecognizer(nlp.vocab, features=cls.entity_features)
|
||||
return EntityRecognizer(nlp.vocab, features=cls.entity_features, **cfg)
|
||||
elif nlp.path is None or not (nlp.path / 'ner').exists():
|
||||
return None
|
||||
else:
|
||||
return EntityRecognizer.load(nlp.path / 'ner', nlp.vocab)
|
||||
return EntityRecognizer.load(nlp.path / 'ner', nlp.vocab, **cfg)
|
||||
|
||||
@classmethod
|
||||
def create_matcher(cls, nlp=None):
|
||||
|
@ -216,14 +216,14 @@ class Language(object):
|
|||
# preprocess training data here before ArcEager.get_labels() is called
|
||||
gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)
|
||||
|
||||
parser_cfg['labels'] = ArcEager.get_labels(gold_tuples)
|
||||
entity_cfg['labels'] = BiluoPushDown.get_labels(gold_tuples)
|
||||
parser_cfg['actions'] = ArcEager.get_actions(gold_parses=gold_tuples)
|
||||
entity_cfg['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples)
|
||||
|
||||
with (dep_model_dir / 'config.json').open('wb') as file_:
|
||||
with (dep_model_dir / 'config.json').open('w') as file_:
|
||||
json.dump(parser_cfg, file_)
|
||||
with (ner_model_dir / 'config.json').open('wb') as file_:
|
||||
with (ner_model_dir / 'config.json').open('w') as file_:
|
||||
json.dump(entity_cfg, file_)
|
||||
with (pos_model_dir / 'config.json').open('wb') as file_:
|
||||
with (pos_model_dir / 'config.json').open('w') as file_:
|
||||
json.dump(tagger_cfg, file_)
|
||||
|
||||
self = cls(
|
||||
|
@ -238,15 +238,12 @@ class Language(object):
|
|||
vectors=False,
|
||||
pipeline=False)
|
||||
|
||||
self.defaults.parser_labels = parser_cfg['labels']
|
||||
self.defaults.entity_labels = entity_cfg['labels']
|
||||
|
||||
self.vocab = self.defaults.Vocab()
|
||||
self.tokenizer = self.defaults.Tokenizer(self.vocab)
|
||||
self.tagger = self.defaults.Tagger(self.vocab, **tagger_cfg)
|
||||
self.parser = self.defaults.Parser(self.vocab, **parser_cfg)
|
||||
self.entity = self.defaults.Entity(self.vocab, **entity_cfg)
|
||||
self.pipeline = self.defaults.Pipeline(self)
|
||||
self.vocab = self.Defaults.create_vocab(self)
|
||||
self.tokenizer = self.Defaults.create_tokenizer(self)
|
||||
self.tagger = self.Defaults.create_tagger(self)
|
||||
self.parser = self.Defaults.create_parser(self)
|
||||
self.entity = self.Defaults.create_entity(self)
|
||||
self.pipeline = self.Defaults.create_pipeline(self)
|
||||
yield Trainer(self, gold_tuples)
|
||||
self.end_training()
|
||||
|
||||
|
@ -267,7 +264,7 @@ class Language(object):
|
|||
add_vectors = self.Defaults.add_vectors(self) \
|
||||
if 'add_vectors' not in overrides \
|
||||
else overrides['add_vectors']
|
||||
if add_vectors:
|
||||
if self.vocab and add_vectors:
|
||||
add_vectors(self.vocab)
|
||||
self.tokenizer = self.Defaults.create_tokenizer(self) \
|
||||
if 'tokenizer' not in overrides \
|
||||
|
@ -387,7 +384,7 @@ class Language(object):
|
|||
else:
|
||||
entity_iob_freqs = []
|
||||
entity_type_freqs = []
|
||||
with (path / 'vocab' / 'serializer.json').open('wb') as file_:
|
||||
with (path / 'vocab' / 'serializer.json').open('w') as file_:
|
||||
file_.write(
|
||||
json.dumps([
|
||||
(TAG, tagger_freqs),
|
||||
|
|
|
@@ -87,7 +87,7 @@ class Scorer(object):
        gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot]))
        for id_, word, tag, head, dep, ner in gold.orig_annot:
            gold_tags.add((id_, tag))
            if dep.lower() not in punct_labels:
            if dep is not None and dep.lower() not in punct_labels:
                gold_deps.add((id_, head, dep.lower()))
        cand_deps = set()
        cand_tags = set()
@@ -439,7 +439,7 @@ cdef class ArcEager(TransitionSystem):
                if move_costs[move] == -1:
                    move_costs[move] = move_cost_funcs[move](stcls, &gold.c)
                costs[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label)
                n_gold += costs[i] == 0
                n_gold += costs[i] <= 0
            else:
                is_valid[i] = False
                costs[i] = 9000

@@ -456,8 +456,14 @@ cdef class ArcEager(TransitionSystem):
                    "before training and after parsing. Either pass make_projective=True "
                    "to the GoldParse class, or use PseudoProjectivity.preprocess_training_data")
            else:
                print(gold.words)
                print(gold.heads)
                print(gold.labels)
                raise ValueError(
                    "Could not find a gold-standard action to supervise the dependency "
                    "parser.\n"
                    "The GoldParse was projective.")
                    "The GoldParse was projective.\n"
                    "The transition system has %d actions.\n"
                    "State at failure:\n"
                    "%s" % (self.n_moves, stcls.print_state(gold.words)))
        assert n_gold >= 1
@@ -1,13 +1,16 @@
from spacy.parts_of_speech cimport NOUN, PROPN, PRON


def english_noun_chunks(doc):
def english_noun_chunks(obj):
    '''Detect base noun phrases from a dependency parse.
    Works on both Doc and Span.'''
    labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
              'attr', 'ROOT', 'root']
    doc = obj.doc  # Ensure works on both Doc and Span.
    np_deps = [doc.vocab.strings[label] for label in labels]
    conj = doc.vocab.strings['conj']
    np_label = doc.vocab.strings['NP']
    for i, word in enumerate(doc):
    for i, word in enumerate(obj):
        if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
            yield word.left_edge.i, word.i+1, np_label
        elif word.pos == NOUN and word.dep == conj:

@@ -25,14 +28,15 @@ def english_noun_chunks(doc):
# extended to the right of the NOUN
# example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
# just "eine Tasse", same for "das Thema Familie"
def german_noun_chunks(doc):
def german_noun_chunks(obj):
    labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'ROOT', 'root', 'cj', 'pd', 'og', 'app']
    doc = obj.doc  # Ensure works on both Doc and Span.
    np_label = doc.vocab.strings['NP']
    np_deps = set(doc.vocab.strings[label] for label in labels)
    close_app = doc.vocab.strings['nk']

    rbracket = 0
    for i, word in enumerate(doc):
    for i, word in enumerate(obj):
        if i < rbracket:
            continue
        if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
@@ -65,7 +65,7 @@ cdef class BiluoPushDown(TransitionSystem):
            for action in (BEGIN, IN, LAST, UNIT):
                actions[action][entity_type] = True
        moves = ('M', 'B', 'I', 'L', 'U')
        for raw_text, sents in kwargs.get('gold_tuples', []):
        for raw_text, sents in kwargs.get('gold_parses', []):
            for (ids, words, tags, heads, labels, biluo), _ in sents:
                for i, ner_tag in enumerate(biluo):
                    if ner_tag != 'O' and ner_tag != '-':
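For orientation, the gold_parses / gold_tuples value that get_actions() and the training code in this commit unpack appears to have roughly the following shape. This is a hedged sketch inferred from the tuple unpacking above and in read_conllx(), not an exact specification:

# Hypothetical illustration only: one document with one sentence.
# Each sentence is ((ids, words, tags, heads, dep_labels, biluo_ner), brackets).
gold_parses = [
    (None, [                               # raw_text, or None when pre-tokenized
        (([0, 1, 2],                       # token ids
          ['London', 'is', 'big'],         # words
          ['NNP', 'VBZ', 'JJ'],            # tags
          [1, 1, 1],                       # heads
          ['nsubj', 'ROOT', 'acomp'],      # dependency labels
          ['U-GPE', 'O', 'O']),            # BILUO NER tags
         []),                              # trailing list: brackets
    ]),
]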
|
@@ -1,3 +1,4 @@
from __future__ import unicode_literals
from copy import copy

from ..tokens.doc cimport Doc
@@ -76,7 +76,7 @@ cdef class ParserModel(AveragedPerceptron):

cdef class Parser:
    """Base class of the DependencyParser and EntityRecognizer."""
    @classmethod
    def load(cls, path, Vocab vocab, TransitionSystem=None, require=False):
    def load(cls, path, Vocab vocab, TransitionSystem=None, require=False, **cfg):
        """Load the statistical model from the supplied path.

        Arguments:

@@ -92,7 +92,7 @@ cdef class Parser:
            with (path / 'config.json').open() as file_:
                cfg = json.load(file_)
            # TODO: remove this shim when we don't have to support older data
            if 'labels' in cfg:
            if 'labels' in cfg and 'actions' not in cfg:
                cfg['actions'] = cfg.pop('labels')
        self = cls(vocab, TransitionSystem=TransitionSystem, model=None, **cfg)
        if (path / 'model').exists():

@@ -266,7 +266,7 @@ cdef class Parser:
            loss += eg.costs[eg.guess]
            eg.fill_scores(0, eg.nr_class)
            eg.fill_costs(0, eg.nr_class)
            eg.fill_is_valid(0, eg.nr_class)
            eg.fill_is_valid(1, eg.nr_class)
        return loss

    def step_through(self, Doc doc):
spacy/tests/unit/test_attrs.py (new file, 32 lines)

@@ -0,0 +1,32 @@
from ...attrs import *


def test_key_no_value():
    int_attrs = intify_attrs({"ORTH": "dog"})
    assert int_attrs == {ORTH: "dog"}


def test_lower_key():
    int_attrs = intify_attrs({"norm": "dog"})
    assert int_attrs == {NORM: "dog"}


def test_lower_key_value():
    vals = {'dog': 10}
    int_attrs = intify_attrs({"lemma": "dog"}, strings_map=vals)
    assert int_attrs == {LEMMA: 10}


def test_idempotence():
    vals = {'dog': 10}
    int_attrs = intify_attrs({"lemma": "dog", 'is_alpha': True}, strings_map=vals)
    int_attrs = intify_attrs(int_attrs)
    assert int_attrs == {LEMMA: 10, IS_ALPHA: True}


def test_do_deprecated():
    vals = {'dog': 10}
    int_attrs = intify_attrs({"F": "dog", 'is_alpha': True}, strings_map=vals,
                             _do_deprecated=True)
    assert int_attrs == {ORTH: 10, IS_ALPHA: True}
spacy/tests/unit/test_tokenizer.py (new file, 48 lines)

@@ -0,0 +1,48 @@
from __future__ import unicode_literals
import pytest
import re

from ...vocab import Vocab
from ...tokenizer import Tokenizer


@pytest.fixture
def vocab():
    return Vocab(tag_map={'NN': {'pos': 'NOUN'}})

@pytest.fixture
def rules():
    return {}

@pytest.fixture
def prefix_search():
    return None

@pytest.fixture
def suffix_search():
    return None

@pytest.fixture
def infix_finditer():
    return None


@pytest.fixture
def tokenizer(vocab, rules, prefix_search, suffix_search, infix_finditer):
    return Tokenizer(vocab, rules, prefix_search, suffix_search, infix_finditer)


def test_add_special_case(tokenizer):
    tokenizer.add_special_case('dog', [{'orth': 'd'}, {'orth': 'og'}])
    doc = tokenizer('dog')
    assert doc[0].text == 'd'
    assert doc[1].text == 'og'


def test_special_case_tag(tokenizer):
    tokenizer.add_special_case('dog', [{'orth': 'd', 'tag': 'NN'}, {'orth': 'og'}])
    doc = tokenizer('dog')
    assert doc[0].text == 'd'
    assert doc[0].tag_ == 'NN'
    assert doc[0].pos_ == 'NOUN'
    assert doc[1].text == 'og'
@@ -223,6 +223,10 @@ cdef class Doc:
    def __repr__(self):
        return self.__str__()

    @property
    def doc(self):
        return self

    def similarity(self, other):
        '''Make a semantic similarity estimate. The default estimate is cosine
        similarity using an average of word vectors.
@@ -190,6 +190,31 @@ cdef class Span:
        def __get__(self):
            return u''.join([t.text_with_ws for t in self])

    property noun_chunks:
        '''
        Yields base noun-phrase #[code Span] objects, if the document
        has been syntactically parsed. A base noun phrase, or
        'NP chunk', is a noun phrase that does not permit other NPs to
        be nested within it – so no NP-level coordination, no prepositional
        phrases, and no relative clauses. For example:
        '''
        def __get__(self):
            if not self.doc.is_parsed:
                raise ValueError(
                    "noun_chunks requires the dependency parse, which "
                    "requires data to be installed. If you haven't done so, run: "
                    "\npython -m spacy.%s.download all\n"
                    "to install the data" % self.vocab.lang)
            # Accumulate the result before beginning to iterate over it. This prevents
            # the tokenisation from being changed out from under us during the iteration.
            # The tricky thing here is that Span accepts its tokenisation changing,
            # so it's okay once we have the Span objects. See Issue #375
            spans = []
            for start, end, label in self.doc.noun_chunks_iterator(self):
                spans.append(Span(self, start, end, label=label))
            for span in spans:
                yield span

    property root:
        """The token within the span that's highest in the parse tree. If there's a tie, the earlist is prefered.
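A minimal usage sketch for the new Span-level noun_chunks property; it assumes English model data is installed and mirrors the existing Doc.noun_chunks behaviour rather than documenting new API:

# Hedged sketch: iterate noun chunks over a Span slice of a parsed Doc.
import spacy

nlp = spacy.load('en')
doc = nlp(u'The quick brown fox jumped over the lazy dog.')
span = doc[:5]                    # a Span; noun_chunks now works here as well as on Doc
for chunk in span.noun_chunks:
    print(chunk.text)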
|
@ -1,8 +1,11 @@
|
|||
from numpy cimport ndarray
|
||||
from ..vocab cimport Vocab
|
||||
from ..structs cimport TokenC
|
||||
from ..attrs cimport attr_id_t
|
||||
from ..attrs cimport *
|
||||
from ..typedefs cimport attr_t, flags_t
|
||||
from ..parts_of_speech cimport univ_pos_t
|
||||
from .doc cimport Doc
|
||||
from ..lexeme cimport Lexeme
|
||||
|
||||
|
||||
cdef class Token:
|
||||
|
@ -22,4 +25,51 @@ cdef class Token:
|
|||
doc._py_tokens[offset] = self
|
||||
return self
|
||||
|
||||
#cdef inline TokenC struct_from_attrs(Vocab vocab, attrs):
|
||||
# cdef TokenC token
|
||||
# attrs = normalize_attrs(attrs)
|
||||
|
||||
cpdef bint check_flag(self, attr_id_t flag_id) except -1
|
||||
|
||||
@staticmethod
|
||||
cdef inline attr_t get_struct_attr(const TokenC* token, attr_id_t feat_name) nogil:
|
||||
if feat_name < (sizeof(flags_t) * 8):
|
||||
return Lexeme.c_check_flag(token.lex, feat_name)
|
||||
elif feat_name == LEMMA:
|
||||
return token.lemma
|
||||
elif feat_name == POS:
|
||||
return token.pos
|
||||
elif feat_name == TAG:
|
||||
return token.tag
|
||||
elif feat_name == DEP:
|
||||
return token.dep
|
||||
elif feat_name == HEAD:
|
||||
return token.head
|
||||
elif feat_name == SPACY:
|
||||
return token.spacy
|
||||
elif feat_name == ENT_IOB:
|
||||
return token.ent_iob
|
||||
elif feat_name == ENT_TYPE:
|
||||
return token.ent_type
|
||||
else:
|
||||
return Lexeme.get_struct_attr(token.lex, feat_name)
|
||||
|
||||
@staticmethod
|
||||
cdef inline attr_t set_struct_attr(TokenC* token, attr_id_t feat_name,
|
||||
attr_t value) nogil:
|
||||
if feat_name == LEMMA:
|
||||
token.lemma = value
|
||||
elif feat_name == POS:
|
||||
token.pos = <univ_pos_t>value
|
||||
elif feat_name == TAG:
|
||||
token.tag = value
|
||||
elif feat_name == DEP:
|
||||
token.dep = value
|
||||
elif feat_name == HEAD:
|
||||
token.head = value
|
||||
elif feat_name == SPACY:
|
||||
token.spacy = value
|
||||
elif feat_name == ENT_IOB:
|
||||
token.ent_iob = value
|
||||
elif feat_name == ENT_TYPE:
|
||||
token.ent_type = value
|
||||
|
|
|
@@ -14,22 +14,31 @@ class Trainer(object):
        self.gold_tuples = gold_tuples

    def epochs(self, nr_epoch, augment_data=None, gold_preproc=False):
        def _epoch():
            for raw_text, paragraph_tuples in self.gold_tuples:
        cached_golds = {}
        def _epoch(indices):
            for i in indices:
                raw_text, paragraph_tuples = self.gold_tuples[i]
                if gold_preproc:
                    raw_text = None
                else:
                    paragraph_tuples = merge_sents(paragraph_tuples)
                if augment_data is not None:
                if augment_data is None:
                    docs = self.make_docs(raw_text, paragraph_tuples)
                    if i in cached_golds:
                        golds = cached_golds[i]
                    else:
                        golds = self.make_golds(docs, paragraph_tuples)
                else:
                    raw_text, paragraph_tuples = augment_data(raw_text, paragraph_tuples)
                    docs = self.make_docs(raw_text, paragraph_tuples)
                    golds = self.make_golds(docs, paragraph_tuples)
                docs = self.make_docs(raw_text, paragraph_tuples)
                golds = self.make_golds(docs, paragraph_tuples)
                for doc, gold in zip(docs, golds):
                    yield doc, gold

        indices = list(range(len(self.gold_tuples)))
        for itn in range(nr_epoch):
            random.shuffle(self.gold_tuples)
            yield _epoch()
            random.shuffle(indices)
            yield _epoch(indices)

    def update(self, doc, gold):
        for process in self.nlp.pipeline:

@@ -62,8 +71,8 @@ class Trainer(object):

    def make_golds(self, docs, paragraph_tuples):
        if len(docs) == 1:
            return [GoldParse(docs[0], sent_tuples[0])
            return [GoldParse.from_annot_tuples(docs[0], sent_tuples[0])
                    for sent_tuples in paragraph_tuples]
        else:
            return [GoldParse(doc, sent_tuples[0])
            return [GoldParse.from_annot_tuples(doc, sent_tuples[0])
                    for doc, sent_tuples in zip(docs, paragraph_tuples)]
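Based only on the methods visible in this diff, the Trainer appears intended to be driven as in the sketch below: epochs() yields one generator of (doc, gold) pairs per epoch, shuffling indices rather than the data itself. This is an assumption-laden sketch, not an official example; nlp and gold_tuples stand in for a Language instance and training data in the (raw_text, paragraph_tuples) format used elsewhere in this commit.

# Hedged sketch of consuming Trainer.epochs(); the names nlp/gold_tuples are assumed.
trainer = Trainer(nlp, gold_tuples)
for epoch in trainer.epochs(nr_epoch=10, gold_preproc=False):
    for doc, gold in epoch:
        trainer.update(doc, gold)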
@@ -20,6 +20,8 @@ from .orth cimport word_shape
from .typedefs cimport attr_t
from .cfile cimport CFile
from .lemmatizer import Lemmatizer
from .attrs import intify_attrs
from .tokens.token cimport Token

from . import attrs
from . import symbols

@@ -336,16 +338,14 @@ cdef class Vocab:
        cdef int i
        tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
        for i, props in enumerate(substrings):
            props = intify_attrs(props, strings_map=self.strings, _do_deprecated=True)
            token = &tokens[i]
            # Set the special tokens up to have morphology and lemmas if
            # specified, otherwise use the part-of-speech tag (if specified)
            token.lex = <LexemeC*>self.get(self.mem, props['F'])
            if 'pos' in props:
                self.morphology.assign_tag(token, props['pos'])
            if 'L' in props:
                tokens[i].lemma = self.strings[props['L']]
            for feature, value in props.get('morph', {}).items():
                self.morphology.assign_feature(&token.morph, feature, value)
            # Set the special tokens up to have arbitrary attributes
            token.lex = <LexemeC*>self.get_by_orth(self.mem, props[attrs.ORTH])
            if attrs.TAG in props:
                self.morphology.assign_tag(token, props[attrs.TAG])
            for attr_id, value in props.items():
                Token.set_struct_attr(token, attr_id, value)
        return tokens

    def dump(self, loc):
@@ -21,7 +21,8 @@
    "SOCIAL": {
        "twitter": "spacy_io",
        "github": "explosion",
        "reddit": "spacynlp"
        "reddit": "spacynlp",
        "codepen": "explosion"
    },

    "NAVIGATION": {
@@ -90,6 +90,19 @@ mixin code(label, language)
        block


//- CodePen embed
    slug        - [string] ID of CodePen demo (taken from URL)
    height      - [integer] height of demo embed iframe
    default_tab - [string] code tab(s) visible on load (default: "result")

mixin codepen(slug, height, default_tab)
    figure.o-block(style="min-height: #{height}px")&attributes(attributes)
        .codepen(data-height=height data-theme-id="26467" data-slug-hash=slug data-default-tab=(default_tab || "result") data-embed-version="2" data-user=SOCIAL.codepen)
            +a("https://codepen.io/" + SOCIAL.codepen + "/" + slug) View on CodePen

        script(async src="https://assets.codepen.io/assets/embed/ei.js")


//- Images / figures
    url    - [string] url or path to image
    width  - [integer] image width in px, for better rendering (default: 500)
@@ -71,7 +71,7 @@ p

+h(2, "named-entities") Named Entity Recognition

+table(["Entity Type", "Description"])
+table([ "Type", "Description" ])
    +row
        +cell #[code PERSON]
        +cell People, including fictional.

@@ -81,8 +81,8 @@ p
        +cell Nationalities or religious or political groups.

    +row
        +cell #[code FAC]
        +cell Facilities, such as buildings, airports, highways, bridges, etc.
        +cell #[code FACILITY]
        +cell Buildings, airports, highways, bridges, etc.

    +row
        +cell #[code ORG]

@@ -98,7 +98,7 @@ p

    +row
        +cell #[code PRODUCT]
        +cell Vehicles, weapons, foods, etc. (Not services)
        +cell Objects, vehicles, foods, etc. (Not services.)

    +row
        +cell #[code EVENT]

@@ -108,41 +108,37 @@ p
        +cell #[code WORK_OF_ART]
        +cell Titles of books, songs, etc.

    +row
        +cell #[code LAW]
        +cell Named documents made into laws

    +row
        +cell #[code LANGUAGE]
        +cell Any named language
        +cell Any named language.

p The following values are also annotated in a style similar to names:

+table(["Entity Type", "Description"])
+table([ "Type", "Description" ])
    +row
        +cell #[code DATE]
        +cell Absolute or relative dates or periods
        +cell Absolute or relative dates or periods.

    +row
        +cell #[code TIME]
        +cell Times smaller than a day
        +cell Times smaller than a day.

    +row
        +cell #[code PERCENT]
        +cell Percentage (including “%”)
        +cell Percentage, including "%".

    +row
        +cell #[code MONEY]
        +cell Monetary values, including unit
        +cell Monetary values, including unit.

    +row
        +cell #[code QUANTITY]
        +cell Measurements, as of weight or distance
        +cell Measurements, as of weight or distance.

    +row
        +cell #[code ORDINAL]
        +cell "first", "second"
        +cell "first", "second", etc.

    +row
        +cell #[code CARDINAL]
        +cell Numerals that do not fall under another type
        +cell Numerals that do not fall under another type.
@@ -9,6 +9,7 @@
        "Processing text": "processing-text",
        "spaCy's data model": "data-model",
        "Using the parse": "dependency-parse",
        "Entity recognition": "entity-recognition",
        "Custom pipelines": "customizing-pipeline",
        "Rule-based matching": "rule-based-matching",
        "Word vectors": "word-vectors-similarities",

@@ -51,7 +52,13 @@
    },

    "dependency-parse": {
        "title": "Using the dependency parse"
        "title": "Using the dependency parse",
        "next": "entity-recognition"
    },

    "entity-recognition": {
        "title": "Entity recognition",
        "next": "rule-based-matching"
    },

    "rule-based-matching": {

@@ -232,6 +239,12 @@
    },

    "deep_dives": {
        "Modern NLP in Python – What you can learn about food by analyzing a million Yelp reviews": {
            "url": "http://nbviewer.jupyter.org/github/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb",
            "author": "Patrick Harrison (S&P Global)",
            "tags": [ "jupyter", "gensim" ]
        },

        "Deep Learning with custom pipelines and Keras": {
            "url": "https://explosion.ai/blog/spacy-deep-learning-keras",
            "author": "Matthew Honnibal",
website/docs/usage/entity-recognition.jade (new file, 290 lines)
@ -0,0 +1,290 @@
|
|||
//- 💫 DOCS > USAGE > NAMED ENTITY RECOGNITION
|
||||
|
||||
include ../../_includes/_mixins
|
||||
|
||||
p
|
||||
| spaCy features an extremely fast statistical entity recognition system,
|
||||
| that assigns labels to contiguous spans of tokens. The default model
|
||||
| identifies a variety of named and numeric entities, including companies,
|
||||
| locations, organizations and products. You can add arbitrary classes to
|
||||
| the entity recognition system, and update the model with new examples.
|
||||
|
||||
+aside-code("Example").
|
||||
import spacy
|
||||
nlp = spacy.load('en')
|
||||
doc = nlp(u'London is a big city in the United Kingdom.')
|
||||
for ent in doc.ents:
|
||||
print(ent.label_, ent.text)
|
||||
# GPE London
|
||||
# GPE United Kingdom
|
||||
|
||||
p
|
||||
| The standard way to access entity annotations is the
|
||||
| #[+api("doc#ents") #[code doc.ents]] property, which produces a sequence
|
||||
| of #[+api("span") #[code Span]] objects. The entity type is accessible
|
||||
| either as an integer ID or as a string, using the attributes
|
||||
| #[code ent.label] and #[code ent.label_]. The #[code Span] object acts
|
||||
| as a sequence of tokens, so you can iterate over the entity or index into
|
||||
| it. You can also get the text form of the whole entity, as though it were
|
||||
| a single token. See the #[+api("span") API reference] for more details.
|
||||
|
||||
p
|
||||
| You can access token entity annotations using the #[code token.ent_iob]
|
||||
| and #[code token.ent_type] attributes. The #[code token.ent_iob]
|
||||
| attribute indicates whether an entity starts, continues or ends on the
|
||||
| tag (In, Begin, Out).
|
||||
|
||||
+code("Example").
|
||||
doc = nlp(u'London is a big city in the United Kingdom.')
|
||||
print(doc[0].text, doc[0].ent_iob, doc[0].ent_type_)
|
||||
# (u'London', 2, u'GPE')
|
||||
print(doc[1].text, doc[1].ent_iob, doc[1].ent_type_)
|
||||
# (u'is', 3, u'')
|
||||
|
||||
+h(2, "setting") Setting entity annotations
|
||||
|
||||
p
|
||||
| To ensure that the sequence of token annotations remains consistent, you
|
||||
| have to set entity annotations at the document level — you can't write
|
||||
| directly to the #[code token.ent_iob] or #[code token.ent_type]
|
||||
| attributes. The easiest way to set entities is to assign to the
|
||||
| #[code doc.ents] attribute.
|
||||
|
||||
+code("Example").
|
||||
doc = nlp(u'London is a big city in the United Kingdom.')
|
||||
doc.ents = []
|
||||
assert doc[0].ent_type_ == ''
|
||||
doc.ents = [Span(0, 1, label='GPE')]
|
||||
assert doc[0].ent_type_ == 'GPE'
|
||||
doc.ents = []
|
||||
doc.ents = [(u'LondonCity', 0, 1, u'GPE')]
|
||||
|
||||
p
|
||||
| The value you assign should be a sequence, the values of which
|
||||
| can either be #[code Span] objects, or #[code (ent_id, ent_type, start, end)]
|
||||
| tuples, where #[code start] and #[code end] are token offsets that
|
||||
| describe the slice of the document that should be annotated.
|
||||
|
||||
p
|
||||
| You can also assign entity annotations using the #[code doc.from_array()]
|
||||
| method. To do this, you should include both the #[code ENT_TYPE] and the
|
||||
| #[code ENT_IOB] attributes in the array you're importing from.
|
||||
|
||||
+code("Example").
|
||||
from spacy.attrs import ENT_IOB, ENT_TYPE
|
||||
import numpy
|
||||
|
||||
doc = nlp.make_doc(u'London is a big city in the United Kingdom.')
|
||||
assert list(doc.ents) == []
|
||||
header = [ENT_IOB, ENT_TYPE]
|
||||
attr_array = numpy.zeros((len(doc), len(header)))
|
||||
attr_array[0, 0] = 2 # B
|
||||
attr_array[0, 1] = doc.vocab.strings[u'GPE']
|
||||
doc.from_array(header, attr_array)
|
||||
assert list(doc.ents)[0].text == u'London'
|
||||
|
||||
p
|
||||
| Finally, you can always write to the underlying struct, if you compile
|
||||
| a Cython function. This is easy to do, and allows you to write efficient
|
||||
| native code.
|
||||
|
||||
+code("Example").
|
||||
# cython: infer_types=True
|
||||
from spacy.tokens.doc cimport Doc
|
||||
|
||||
cpdef set_entity(Doc doc, int start, int end, int ent_type):
|
||||
for i in range(start, end):
|
||||
doc.c[i].ent_type = ent_type
|
||||
doc.c[start].ent_iob = 3
|
||||
for i in range(start+1, end):
|
||||
doc.c[i].ent_iob = 2
|
||||
|
||||
p
|
||||
| Obviously, if you write directly to the array of #[code TokenC*] structs,
|
||||
| you'll have responsibility for ensuring that the data is left in a
|
||||
| consistent state.
|
||||
|
||||
|
||||
+h(2, "displacy") The displaCy #[sup ENT] visualizer
|
||||
|
||||
p
|
||||
| The #[+a(DEMOS_URL + "/displacy-ent/") displaCy #[sup ENT] visualizer]
|
||||
| lets you explore an entity recognition model's behaviour interactively.
|
||||
| If you're training a model, it's very useful to run the visualization
|
||||
| server yourself. To help you do that, we've open-sourced both the
|
||||
| #[+a(gh("spacy-services")) back-end service] and the
|
||||
| #[+a(gh("displacy-ent")) front-end client].
|
||||
|
||||
+codepen("ALxpQO", 450)
|
||||
|
||||
+h(2, "entity-types") Built-in entity types
|
||||
|
||||
+h(3, "entity-types-named") Named types
|
||||
|
||||
+table([ "Type", "Description" ])
|
||||
+row
|
||||
+cell #[code PERSON]
|
||||
+cell People, including fictional.
|
||||
|
||||
+row
|
||||
+cell #[code NORP]
|
||||
+cell Nationalities or religious or political groups.
|
||||
|
||||
+row
|
||||
+cell #[code FACILITY]
|
||||
+cell Buildings, airports, highways, bridges, etc.
|
||||
|
||||
+row
|
||||
+cell #[code ORG]
|
||||
+cell Companies, agencies, institutions, etc.
|
||||
|
||||
+row
|
||||
+cell #[code GPE]
|
||||
+cell Countries, cities, states.
|
||||
|
||||
+row
|
||||
+cell #[code LOC]
|
||||
+cell Non-GPE locations, mountain ranges, bodies of water.
|
||||
|
||||
+row
|
||||
+cell #[code PRODUCT]
|
||||
+cell Objects, vehicles, foods, etc. (Not services.)
|
||||
|
||||
+row
|
||||
+cell #[code EVENT]
|
||||
+cell Named hurricanes, battles, wars, sports events, etc.
|
||||
|
||||
+row
|
||||
+cell #[code WORK_OF_ART]
|
||||
+cell Titles of books, songs, etc.
|
||||
|
||||
+row
|
||||
+cell #[code LANGUAGE]
|
||||
+cell Any named language
|
||||
|
||||
+h(3, "entity-types-numeric") Numeric types
|
||||
|
||||
+table([ "Type", "Description" ])
|
||||
+row
|
||||
+cell #[code DATE]
|
||||
+cell Absolute or relative dates or periods.
|
||||
|
||||
+row
|
||||
+cell #[code TIME]
|
||||
+cell Times smaller than a day.
|
||||
|
||||
+row
|
||||
+cell #[code PERCENT]
|
||||
+cell Percentage, including "%".
|
||||
|
||||
+row
|
||||
+cell #[code MONEY]
|
||||
+cell Monetary values, including unit.
|
||||
|
||||
+row
|
||||
+cell #[code QUANTITY]
|
||||
+cell Measurements, as of weight or distance.
|
||||
|
||||
+row
|
||||
+cell #[code ORDINAL]
|
||||
+cell "first", "second", etc.
|
||||
|
||||
+row
|
||||
+cell #[code CARDINAL]
|
||||
+cell Numerals that do not fall under another type.
|
||||
|
||||
+aside("Install")
|
||||
| The #[+api("load") spacy.load()] function configures a pipeline that
|
||||
| includes all of the available annotators for the given ID. In the example
|
||||
| above, the #[code 'en'] ID tells spaCy to load the default English
|
||||
| pipeline. If you have installed the data with
|
||||
| #[code python -m spacy.en.download] this will include the entity
|
||||
| recognition model.
|
||||
|
||||
+h(2, "updating") Training and updating
|
||||
|
||||
p
|
||||
| To provide training examples to the entity recogniser, you'll first need
|
||||
| to create an instance of the #[code GoldParse] class. You can specify
|
||||
| your annotations in a stand-off format or as token tags.
|
||||
|
||||
+code.
|
||||
import spacy
|
||||
from spacy.gold import GoldParse
|
||||
|
||||
train_data = [
|
||||
('Who is Chaka Khan?', [(7, 17, 'PERSON')]),
|
||||
('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])
|
||||
]
|
||||
|
||||
nlp = spacy.load(entity=False, parser=False)
|
||||
ner = EntityRecognizer(nlp.vocab, entity_types=['PERSON', 'LOC'])
|
||||
|
||||
for itn in range(5):
|
||||
random.shuffle(train_data)
|
||||
for raw_text, entity_offsets in train_data:
|
||||
doc = nlp.make_doc(raw_text)
|
||||
gold = GoldParse(doc, entities=entity_offsets)
|
||||
|
||||
nlp.tagger(doc)
|
||||
ner.update(doc, gold)
|
||||
ner.model.end_training()
|
||||
|
||||
p
|
||||
| If a character offset in your entity annotations don't fall on a token
|
||||
| boundary, the #[code GoldParse] class will treat that annotation as a
|
||||
| missing value. This allows for more realistic training, because the
|
||||
| entity recogniser is allowed to learn from examples that may feature
|
||||
| tokenizer errors.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = Doc(nlp.vocab, [u'rats', u'make', u'good', u'pets'])
|
||||
gold = GoldParse(doc, [u'U-ANIMAL', u'O', u'O', u'O'])
|
||||
ner = EntityRecognizer(nlp.vocab, entity_types=['ANIMAL'])
|
||||
ner.update(doc, gold)
|
||||
|
||||
p
|
||||
| You can also provide token-level entity annotation, using the
|
||||
| following tagging scheme to describe the entity boundaries:
|
||||
|
||||
+table([ "Tag", "Description" ])
|
||||
+row
|
||||
+cell #[code #[span.u-color-theme B] EGIN]
|
||||
+cell The first token of a multi-token entity.
|
||||
|
||||
+row
|
||||
+cell #[code #[span.u-color-theme I] N]
|
||||
+cell An inner token of a multi-token entity.
|
||||
|
||||
+row
|
||||
+cell #[code #[span.u-color-theme L] AST]
|
||||
+cell The final token of a multi-token entity.
|
||||
|
||||
+row
|
||||
+cell #[code #[span.u-color-theme U] NIT]
|
||||
+cell A single-token entity.
|
||||
|
||||
+row
|
||||
+cell #[code #[span.u-color-theme O] UT]
|
||||
+cell A non-entity token.
|
||||
|
||||
+aside("Why BILUO, not IOB?")
|
||||
| There are several coding schemes for encoding entity annotations as
|
||||
| token tags. These coding schemes are equally expressive, but not
|
||||
| necessarily equally learnable.
|
||||
| #[+a("http://www.aclweb.org/anthology/W09-1119") Ratinov and Roth]
|
||||
| showed that the minimal #[strong Begin], #[strong In], #[strong Out]
|
||||
| scheme was more difficult to learn than the #[strong BILUO] scheme that
|
||||
| we use, which explicitly marks boundary tokens.
|
||||
|
||||
p
|
||||
| spaCy translates the character offsets into this scheme, in order to
|
||||
| decide the cost of each action given the current state of the entity
|
||||
| recogniser. The costs are then used to calculate the gradient of the
|
||||
| loss, to train the model. The exact algorithm is a pastiche of
|
||||
| well-known methods, and is not currently described in any single
|
||||
| publication. The model is a greedy transition-based parser guided by a
|
||||
| linear model whose weights are learned using the averaged perceptron
|
||||
| loss, via the #[+a("http://www.aclweb.org/anthology/C12-1059") dynamic oracle]
|
||||
| imitation learning strategy. The transition system is equivalent to the
|
||||
| BILOU tagging scheme.
|