Mirror of https://github.com/explosion/spaCy.git, synced 2025-11-04 01:48:04 +03:00
Rewrite custom tokenizer docs

parent 0f48fb1f97
commit d122bbc908
@@ -11,16 +11,10 @@ p
     |  #[code spaces] booleans, which allow you to maintain alignment of the
     |  tokens into the original string.
 
-+aside("spaCy's data model")
-    |  The main point to keep in mind is that spaCy's #[code Doc] doesn't
-    |  copy or refer to the original string. The string is reconstructed from
-    |  the tokens when required.
-
 +h(2, "101") Tokenizer 101
 
 include _spacy-101/_tokenization
 
-
 +h(3, "101-data") Tokenizer data
 
 p
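A minimal sketch of the words/spaces alignment described in the hunk above, assuming spaCy v2.x with the 'en' model installed; the example strings are illustrative:

    # Build a Doc from words plus per-token trailing-space booleans; the
    # original string is then reconstructed from the tokens on demand.
    import spacy
    from spacy.tokens import Doc

    nlp = spacy.load('en')
    words = ['Hello', ',', 'world', '!']
    spaces = [False, True, False, False]  # no space after 'Hello', one after ','
    doc = Doc(nlp.vocab, words=words, spaces=spaces)
    assert doc.text == 'Hello, world!'    # alignment into the original string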
@@ -221,27 +215,68 @@ p
 +h(2, "custom-tokenizer") Hooking an arbitrary tokenizer into the pipeline
 
 p
-    |  You can pass a custom tokenizer using the #[code make_doc] keyword, when
-    |  you're creating the pipeline:
+    |  The tokenizer is the first component of the processing pipeline and the
+    |  only one that can't be replaced by writing to #[code nlp.pipeline]. This
+    |  is because it has a different signature from all the other components:
+    |  it takes a text and returns a #[code Doc], whereas all other components
+    |  expect to already receive a tokenized #[code Doc].
 
++image
+    include ../../assets/img/docs/pipeline.svg
+    .u-text-right
+        +button("/assets/img/docs/pipeline.svg", false, "secondary").u-text-tag View large graphic
+
-+code.
-    nlp = spacy.load('en', make_doc=my_tokenizer)
 
 p
-    |  However, this approach often leaves us with a chicken-and-egg problem.
-    |  To construct the tokenizer, we usually want attributes of the #[code nlp]
-    |  pipeline. Specifically, we want the tokenizer to hold a reference to the
-    |  pipeline's vocabulary object. Let's say we have the following class as
-    |  our tokenizer:
+    |  To overwrite the existing tokenizer, you need to replace
+    |  #[code nlp.tokenizer] with a custom function that takes a text, and
+    |  returns a #[code Doc].
+
++code.
+    nlp = spacy.load('en')
+    nlp.tokenizer = my_tokenizer
+
++table(["Argument", "Type", "Description"])
+    +row
+        +cell #[code text]
+        +cell unicode
+        +cell The raw text to tokenize.
+
+    +footrow
+        +cell returns
+        +cell #[code Doc]
+        +cell The tokenized document.
+
++infobox("Important note: using a custom tokenizer")
+    .o-block
+        |  In spaCy v1.x, you had to add a custom tokenizer by passing it to the
+        |  #[code make_doc] keyword argument, or by passing a tokenizer "factory"
+        |  to #[code create_make_doc]. This was unnecessarily complicated. Since
+        |  spaCy v2.0, you can simply write to #[code nlp.tokenizer]. If your
+        |  tokenizer needs the vocab, you can write a function and use
+        |  #[code nlp.vocab].
+
+    +code-new.
+        nlp.tokenizer = my_tokenizer
+        nlp.tokenizer = my_tokenizer_factory(nlp.vocab)
+    +code-old.
+        nlp = spacy.load('en', make_doc=my_tokenizer)
+        nlp = spacy.load('en', create_make_doc=my_tokenizer_factory)
+
++h(3, "custom-tokenizer-example") Example: A custom whitespace tokenizer
+
+p
+    |  To construct the tokenizer, we usually want attributes of the #[code nlp]
+    |  pipeline. Specifically, we want the tokenizer to hold a reference to the
+    |  vocabulary object. Let's say we have the following class as
+    |  our tokenizer:
 
 +code.
-    import spacy
     from spacy.tokens import Doc
 
     class WhitespaceTokenizer(object):
-        def __init__(self, nlp):
-            self.vocab = nlp.vocab
+        def __init__(self, vocab):
+            self.vocab = vocab
 
         def __call__(self, text):
             words = text.split(' ')
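To make the text-in, Doc-out signature concrete: a sketch that fills in the my_tokenizer placeholder from the hunk above with a naive whitespace split (the split itself is just an illustration, not the documented tokenizer):

    # nlp.tokenizer can be any callable that takes a unicode text and
    # returns a Doc built with the pipeline's vocab.
    import spacy
    from spacy.tokens import Doc

    nlp = spacy.load('en')

    def my_tokenizer(text):
        words = text.split(' ')              # naive whitespace split
        return Doc(nlp.vocab, words=words)   # spaces default to True per token

    nlp.tokenizer = my_tokenizer
    doc = nlp(u'Hooking in a custom tokenizer')
    print([token.text for token in doc])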
@@ -250,28 +285,12 @@ p
             return Doc(self.vocab, words=words, spaces=spaces)
 
 p
-    |  As you can see, we need a #[code vocab] instance to construct this — but
-    |  we won't get the #[code vocab] instance until we get back the #[code nlp]
-    |  object from #[code spacy.load()]. The simplest solution is to build the
-    |  object in two steps:
+    |  As you can see, we need a #[code Vocab] instance to construct this — but
+    |  we won't have it until we get back the loaded #[code nlp] object. The
+    |  simplest solution is to build the tokenizer in two steps. This also means
+    |  that you can reuse the "tokenizer factory" and initialise it with
+    |  different instances of #[code Vocab].
 
 +code.
     nlp = spacy.load('en')
-    nlp.make_doc = WhitespaceTokenizer(nlp)
-
-p
-    |  You can instead pass the class to the #[code create_make_doc] keyword,
-    |  which is invoked as callback once the #[code nlp] object is ready:
-
-+code.
-    nlp = spacy.load('en', create_make_doc=WhitespaceTokenizer)
-
-p
-    |  Finally, you can of course create your own subclasses, and create a bound
-    |  #[code make_doc] method. The disadvantage of this approach is that spaCy
-    |  uses inheritance to give each language-specific pipeline its own class.
-    |  If you're working with multiple languages, a naive solution will
-    |  therefore require one custom class per language you're working with.
-    |  This might be at least annoying. You may be able to do something more
-    |  generic by doing some clever magic with metaclasses or mixins, if that's
-    |  the sort of thing you're into.
+    nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
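The reuse point in the rewritten paragraph can be demonstrated end to end. A sketch assuming a whitespace tokenizer like the one in the hunk above, and that both the 'en' and 'de' models are installed:

    # Because construction happens in two steps, the same "tokenizer
    # factory" can be initialised with different Vocab instances.
    import spacy
    from spacy.tokens import Doc

    class WhitespaceTokenizer(object):
        def __init__(self, vocab):
            self.vocab = vocab

        def __call__(self, text):
            words = text.split(' ')
            spaces = [True] * len(words)  # assume one trailing space per token
            return Doc(self.vocab, words=words, spaces=spaces)

    nlp_en = spacy.load('en')
    nlp_de = spacy.load('de')
    nlp_en.tokenizer = WhitespaceTokenizer(nlp_en.vocab)  # same factory,
    nlp_de.tokenizer = WhitespaceTokenizer(nlp_de.vocab)  # different vocabs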