spaCy/website/docs/tutorials/byo-annotations.jade

include ../../_includes/_mixins

+lead By default, spaCy assumes that your data is raw text. Sometimes, however, your data is already partially annotated, e.g. with pre-existing tokenization or part-of-speech tags. This tutorial explains how to use these annotations in spaCy.

+h2 Quick Reference

+table(['Description', 'Usage'], 'code')
    +row
        +cell Use pre-existing tokenization
        +cell #[code.lang-python doc = Doc(nlp.vocab, [('A', True), ('token', False), ('!', False)])]
    +row
        +cell Use pre-existing tokenization (deprecated)
        +cell #[code.lang-python doc = nlp.tokenizer.tokens_from_list([u'A', u'token', u'!'])]

    +row
        +cell Assign pre-existing tags
        +cell #[code.lang-python nlp.tagger.tag_from_strings(doc, ['DT', 'NN'])]
    +row
        +cell Assign named entity annotations from an array
        +cell #[code.lang-python doc.from_array([ENT_TYPE, ENT_IOB], values)]
    +row
        +cell Assign dependency parse annotations from an array
        +cell #[code.lang-python doc.from_array([HEAD, DEP], values)]

+h2 Examples

+code('python', 'Tokenization').
    import spacy

    nlp = spacy.load('en')

    words = [u'A', u'list', u'of', u'strings', u'.']
    doc = nlp.tokenizer.tokens_from_list(words)

    # The Doc holds exactly one token per input string.
    assert len(words) == len(doc)
    # This method offers no way to say how the original string was spaced,
    # so the tokens come back separated by single spaces.
    assert doc.text == u'A list of strings .'

+code('python', 'Tokenization').
    import spacy
    from spacy.tokens import Doc

    nlp = spacy.load('en')

    tokens = [u'A', u'list', u'of', u'strings', u'.']
    # One boolean per token: True if the token is followed by a space.
    has_space = [True, True, True, False, False]

    # Pass the (string, has_space) pairs positionally, matching the Quick
    # Reference form `Doc(nlp.vocab, [(u'A', True), ...])`.
    doc = Doc(nlp.vocab, zip(tokens, has_space))

    assert len(doc) == len(tokens)
    # Spacing is correct, given by boolean values above.
    assert doc.text == u'A list of strings.'
    # Here's how it would look with different boolean values.
    tokens = [u'A', u'list', u'of', u'strings', u'.']
    has_space = [False, True, True, True, False]
    doc = Doc(nlp.vocab, zip(tokens, has_space))
    assert doc.text == u'Alist of strings .'

+code('python', 'POS Tags').
    import spacy

    nlp = spacy.load('en')

    # Tokenize a string into a Doc, but don't apply the whole pipeline ---
    # that is, don't predict the part-of-speech tags, syntactic parse, named
    # entities, etc.
    doc = nlp.tokenizer(u'A unicode string, untokenized.')
    # Assign the pre-existing tags, one per token. The Doc to annotate is
    # passed as the first argument.
    tags = [u'DT', u'JJ', u'NN', u',', u'VBN', u'.']
    nlp.tagger.tag_from_strings(doc, tags)
    # Now predict dependency parse and named entities. Note that if you assign
    # tags in a way that's very unlike the behaviour of the POS tagger model,
    # the subsequent models may perform worse. These models use the POS tags
    # as features, so if you give them unexpected tags, you may be giving them
    # run-time conditions that don't resemble the training data.
    nlp.parser(doc)
    nlp.entity(doc)

+code('python', 'Dependency Parse').
    import spacy
    from spacy.attrs import HEAD, DEP
    from spacy.symbols import det, nmod, root, punct
    from numpy import ndarray

    nlp = spacy.load('en')

    # Get the Doc object, and apply the pipeline except the dependency parser
    doc = nlp(u'A unicode string.', parse=False)

    columns = [HEAD, DEP]
    # from_array() reads one row per token and one column per attribute,
    # with the columns in the same order as `columns`.
    values = ndarray(shape=(len(doc), len(columns)), dtype='int32')
    # Syntactic parse specified as head offsets, relative to each token
    # (0 marks the root).
    heads = [2, 1, 0, -1]
    # Integer IDs for the dependency labels. See the parse in the displaCy
    # demo at spacy.io/demos/displacy
    labels = [det, nmod, root, punct]
    # Fill the HEAD column, then the DEP column.
    values[:, 0] = heads
    values[:, 1] = labels
    doc.from_array(columns, values)

+code('python', 'Named Entities').
    import spacy
    from spacy.attrs import ENT_TYPE, ENT_IOB
    from spacy.symbols import PERSON, ORG
    from numpy import ndarray

    nlp = spacy.load('en')

    # Get the Doc object, and apply the pipeline except the entity recognizer
    doc = nlp(u'My name is Matt.', entity=False)

    columns = [ENT_TYPE, ENT_IOB]
    # from_array() reads one row per token and one column per attribute,
    # with the columns in the same order as `columns`.
    values = ndarray(shape=(len(doc), len(columns)), dtype='int32')
    # ENT_TYPE column: only 'Matt' carries an entity label; the other tokens
    # are left at 0.
    values[:, 0] = [0, 0, 0, PERSON, 0]
    # ENT_IOB column. IOB values are 0=missing, 1=I, 2=O, 3=B, so 'Matt'
    # begins an entity (3) and every other token is outside one (2).
    values[:, 1] = [2, 2, 2, 3, 2]
    doc.from_array(columns, values)