spaCy/website/docs/tutorials/byo-annotations.jade
2016-10-19 00:19:42 +02:00

118 lines
4.3 KiB
Plaintext

include ../../_includes/_mixins
p.u-text-large spaCy assumes by default that your data is raw text. However, sometimes your data is partially annotated, e.g. with pre-existing tokenization, part-of-speech tags, etc. This tutorial explains how to use these annotations in spaCy.
+h(2, "quick-reference") Quick Reference
+table(['Description', 'Usage'], 'code')
+row
+cell Use pre-existing tokenization
+cell #[code.lang-python doc = Doc(nlp.vocab, [('A', True), ('token', False), ('!', False)])]
+row
+cell Use pre-existing tokenization (deprecated)
+cell #[code.lang-python doc = nlp.tokenizer.tokens_from_list([u'A', u'token', u'!'])]
+row
+cell Assign pre-existing tags
+cell #[code.lang-python nlp.tagger.tag_from_strings(doc, ['DT', 'NN'])]
+row
+cell Assign named entity annotations from an array
+cell #[code.lang-python doc.from_array([ENT_TYPE, ENT_IOB], values)]
+row
+cell Assign dependency parse annotations from an array
+cell #[code.lang-python doc.from_array([HEAD, DEP], values)]
+h(2, "examples") Examples
+code('python', 'Tokenization').
import spacy
nlp = spacy.load('en')
words = [u'A', u'list', u'of', u'strings', u'.']
# Build the Doc from the already-tokenized strings.
doc = nlp.tokenizer.tokens_from_list(words)
# One token per input string.
assert len(words) == len(doc)
# This method takes no spacing information, so the reconstructed text
# simply joins the tokens with single spaces.
assert doc.text == u'A list of strings .'
+code('python', 'Tokenization').
import spacy
from spacy.tokens import Doc
nlp = spacy.load('en')
tokens = [u'A', u'list', u'of', u'strings', u'.']
# One flag per token: True if the token is followed by a space.
has_space = [True, True, True, False, False]
# Doc accepts (string, has_space) pairs via the `orths_and_spaces` argument.
doc = Doc(nlp.vocab, orths_and_spaces=zip(tokens, has_space))
assert len(doc) == len(tokens)
# Spacing is correct, given by boolean values above.
assert doc.text == u'A list of strings.'
# Here's how it would look with different boolean values.
tokens = [u'A', u'list', u'of', u'strings', u'.']
has_space = [False, True, True, True, False]
doc = Doc(nlp.vocab, orths_and_spaces=zip(tokens, has_space))
assert doc.text == u'Alist of strings .'
+code('python', 'POS Tags').
import spacy
nlp = spacy.load('en')
# Tokenize a string into a Doc, but don't apply the whole pipeline ---
# that is, don't predict the part-of-speech tags, syntactic parse, named
# entities, etc.
doc = nlp.tokenizer(u'A unicode string, untokenized.')
# Assign one pre-existing tag per token. The Doc to annotate is the first
# argument, followed by the list of tag strings.
nlp.tagger.tag_from_strings(doc, [u'DT', u'JJ', u'NN', u',', u'VBN', u'.'])
# Now predict dependency parse and named entities. Note that if you assign
# tags in a way that's very unlike the behaviour of the POS tagger model,
# the subsequent models may perform worse. These models use the POS tags
# as features, so if you give them unexpected tags, you may be giving them
# run-time conditions that don't resemble the training data.
nlp.parser(doc)
nlp.entity(doc)
+code('python', 'Dependency Parse').
import spacy
from spacy.attrs import HEAD, DEP
from spacy.symbols import det, nmod, root, punct
from numpy import ndarray
nlp = spacy.load('en')
# Get the Doc object, and apply the pipeline except the dependency parser
doc = nlp(u'A unicode string.', parse=False)
columns = [HEAD, DEP]
# from_array expects one row per token and one column per attribute.
# ndarray() leaves the memory uninitialized, so every cell is filled below.
values = ndarray(shape=(len(doc), len(columns)), dtype='int32')
# Syntactic parse specified as head offsets
heads = [2, 1, 0, -1]
# Integer IDs for the dependency labels. See the parse in the displaCy
# demo at spacy.io/demos/displacy
labels = [det, nmod, root, punct]
# Column order must match the order of `columns`.
values[:, 0] = heads
values[:, 1] = labels
doc.from_array(columns, values)
+code('python', 'Named Entities').
import spacy
from spacy.attrs import ENT_TYPE, ENT_IOB
from spacy.symbols import PERSON, ORG
from numpy import ndarray
nlp = spacy.load('en')
# Get the Doc object, and apply the pipeline except the entity recognizer
doc = nlp(u'My name is Matt.', entity=False)
columns = [ENT_TYPE, ENT_IOB]
# from_array expects one row per token and one column per attribute.
# ndarray() leaves the memory uninitialized, so every cell is filled below.
values = ndarray(shape=(len(doc), len(columns)), dtype='int32')
# Column order must match the order of `columns`: entity types first,
# then IOB codes. Only "Matt" is an entity; 0 means no entity type.
values[:, 0] = [0, 0, 0, PERSON, 0]
# IOB values are 0=missing, 1=I, 2=O, 3=B
values[:, 1] = [2, 2, 2, 3, 2]
doc.from_array(columns, values)