include ../../_includes/_mixins

+lead spaCy assumes by default that your data is raw text. However, sometimes your data is partially annotated, e.g. with pre-existing tokenization, part-of-speech tags, etc. This tutorial explains how to use these annotations in spaCy.

+h2 Quick Reference

+table(['Description', 'Usage'], 'code')
    +row
        +cell Use pre-existing tokenization
        +cell #[code.lang-python doc = Doc(nlp.vocab, [('A', True), ('token', False), ('!', False)])]

    +row
        +cell Use pre-existing tokenization (deprecated)
        +cell #[code.lang-python doc = nlp.tokenizer.tokens_from_list([u'A', u'token', u'!'])]

    +row
        +cell Assign pre-existing tags
        +cell #[code.lang-python nlp.tagger.tag_from_strings(doc, ['DT', 'NN'])]

    +row
        +cell Assign named entity annotations from an array
        +cell #[code.lang-python doc.from_array([ENT_TYPE, ENT_IOB], values)]

    +row
        +cell Assign dependency parse annotations from an array
        +cell #[code.lang-python doc.from_array([HEAD, DEP], values)]

+h2 Examples

+code('python', 'Tokenization').
    import spacy

    nlp = spacy.load('en')

    tokens = [u'A', u'list', u'of', u'strings', u'.']

    doc = nlp.tokenizer.tokens_from_list(tokens)

    assert len(doc) == len(tokens)
    # With this method, we don't get to specify how the corresponding string
    # would be spaced, so a space is assumed between every pair of tokens.
    assert doc.text == u'A list of strings .'

+code('python', 'Tokenization').
    import spacy
    from spacy.tokens import Doc

    nlp = spacy.load('en')

    tokens = [u'A', u'list', u'of', u'strings', u'.']
    has_space = [True, True, True, False, False]

    doc = Doc(nlp.vocab, orth_and_spaces=zip(tokens, has_space))

    assert len(doc) == len(tokens)
    # Spacing is correct, as given by the boolean values above.
    assert doc.text == u'A list of strings.'
    # Here's how it would look with different boolean values.
    tokens = [u'A', u'list', u'of', u'strings', u'.']
    has_space = [False, True, True, True, False]
    doc = Doc(nlp.vocab, orth_and_spaces=zip(tokens, has_space))
    assert doc.text == u'Alist of strings .'
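
p If your pre-existing tokenization is stored as character offsets into the original string, you can recover the token texts and spacing booleans by checking whether each token runs right up against the next one. Here's a minimal sketch of that conversion; the offsets are made-up example data, not something spaCy produces.

+code('python', 'Tokenization (from character offsets)').
    import spacy
    from spacy.tokens import Doc

    nlp = spacy.load('en')

    text = u'A list of strings.'
    # Hypothetical pre-tokenized input: (start, end) character offsets
    offsets = [(0, 1), (2, 6), (7, 9), (10, 17), (17, 18)]

    orth_and_spaces = []
    for i, (start, end) in enumerate(offsets):
        if i + 1 < len(offsets):
            # A space follows if the next token doesn't start immediately
            has_space = offsets[i + 1][0] > end
        else:
            has_space = end < len(text) and text[end] == u' '
        orth_and_spaces.append((text[start:end], has_space))

    doc = Doc(nlp.vocab, orth_and_spaces=orth_and_spaces)
    assert doc.text == text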

+code('python', 'POS Tags').
    import spacy

    nlp = spacy.load('en')

    # Tokenize a string into a Doc, but don't apply the whole pipeline ---
    # that is, don't predict the part-of-speech tags, syntactic parse, named
    # entities, etc.
    doc = nlp.tokenizer(u'A unicode string, untokenized.')
    nlp.tagger.tag_from_strings(doc, [u'DT', u'JJ', u'NN', u',', u'VBN', u'.'])
    # Now predict the dependency parse and named entities. Note that if you
    # assign tags in a way that's very unlike the behaviour of the POS tagger
    # model, the subsequent models may perform worse. These models use the POS
    # tags as features, so if you give them unexpected tags, you may be giving
    # them run-time conditions that don't resemble the training data.
    nlp.parser(doc)
    nlp.entity(doc)
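
p The tags above are hard-coded, but they'd normally come from an external annotation file. Here's a minimal sketch, assuming a hypothetical list of tab-separated word/tag lines; the input data and its format are illustrative, not part of spaCy.

+code('python', 'POS Tags (from word/tag pairs)').
    import spacy
    from spacy.tokens import Doc

    nlp = spacy.load('en')

    # Hypothetical input: one token per line, word and tag separated by a tab
    lines = [u'A\tDT', u'unicode\tJJ', u'string\tNN', u',\t,',
             u'untokenized\tVBN', u'.\t.']
    words = [line.split(u'\t')[0] for line in lines]
    tags = [line.split(u'\t')[1] for line in lines]

    # Build the Doc from the pre-existing tokenization, then assign the tags
    doc = Doc(nlp.vocab, orth_and_spaces=[(word, True) for word in words])
    assert len(tags) == len(doc)
    nlp.tagger.tag_from_strings(doc, tags)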

+code('python', 'Dependency Parse').
    import spacy
    from spacy.attrs import HEAD, DEP
    from spacy.symbols import det, nmod, root, punct
    from numpy import ndarray

    nlp = spacy.load('en')

    # Get the Doc object, and apply the pipeline except the dependency parser
    doc = nlp(u'A unicode string.', parse=False)

    columns = [HEAD, DEP]
    values = ndarray(shape=(len(columns), len(doc)), dtype='int32')
    # Syntactic parse specified as head offsets
    heads = [2, 1, 0, -1]
    # Integer IDs for the dependency labels. See the parse in the displaCy
    # demo at spacy.io/demos/displacy
    labels = [det, nmod, root, punct]
    values[0] = heads
    values[1] = labels
    doc.from_array(columns, values)
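
p Treebank data usually stores heads as absolute token indices (often 1-based, with 0 marking the root), while spaCy wants the offset from each token to its head. Here's a minimal sketch of that conversion, assuming hypothetical CoNLL-style input lists; it also looks the label IDs up by string instead of importing symbols.

+code('python', 'Dependency Parse (from absolute heads)').
    import spacy
    from spacy.attrs import HEAD, DEP
    from numpy import ndarray

    nlp = spacy.load('en')

    doc = nlp(u'A unicode string.', parse=False)

    # Hypothetical CoNLL-style annotations: 1-based head indices, with 0
    # marking the root, and dependency labels as strings
    conll_heads = [3, 3, 0, 3]
    conll_labels = [u'det', u'nmod', u'root', u'punct']

    values = ndarray(shape=(2, len(doc)), dtype='int32')
    for i, head in enumerate(conll_heads):
        # The root token points to itself, i.e. an offset of 0
        values[0][i] = 0 if head == 0 else (head - 1) - i
    # Look up the integer ID for each label string
    values[1] = [nlp.vocab.strings[label] for label in conll_labels]
    doc.from_array([HEAD, DEP], values)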

+code('python', 'Named Entities').
    import spacy
    from spacy.attrs import ENT_TYPE, ENT_IOB
    from spacy.symbols import PERSON
    from numpy import ndarray

    nlp = spacy.load('en')

    # Get the Doc object, and apply the pipeline except the entity recognizer
    doc = nlp(u'My name is Matt.', entity=False)

    columns = [ENT_TYPE, ENT_IOB]
    values = ndarray(shape=(len(columns), len(doc)), dtype='int32')
    # Entity types by integer ID, with 0 for tokens outside any entity
    values[0] = [0, 0, 0, PERSON, 0]
    # IOB values are 0=missing, 1=I, 2=O, 3=B
    values[1] = [2, 2, 2, 3, 2]
    doc.from_array(columns, values)
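
p If your entity annotations are stored as character spans with labels, you can fill these arrays by matching the spans against the token boundaries. A minimal sketch, assuming hypothetical (start_char, end_char, label) spans that align exactly with spaCy's tokenization.

+code('python', 'Named Entities (from character spans)').
    import spacy
    from spacy.attrs import ENT_TYPE, ENT_IOB
    from numpy import ndarray

    nlp = spacy.load('en')

    doc = nlp(u'My name is Matt.', entity=False)

    # Hypothetical annotations: (start_char, end_char, label) spans
    spans = [(11, 15, u'PERSON')]

    values = ndarray(shape=(2, len(doc)), dtype='int32')
    # Default: no entity type, and IOB 2 (outside any entity)
    values[0] = [0] * len(doc)
    values[1] = [2] * len(doc)
    for start_char, end_char, label in spans:
        for i, token in enumerate(doc):
            if start_char <= token.idx < end_char:
                values[0][i] = nlp.vocab.strings[label]
                # IOB 3 (B) on the first token of the span, 1 (I) after that
                values[1][i] = 3 if token.idx == start_char else 1
    doc.from_array([ENT_TYPE, ENT_IOB], values)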