mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-26 05:31:15 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			368 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			368 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| //- 💫 DOCS > USAGE > LINGUISTIC FEATURES > TOKENIZATION
 | ||
| 
 | ||
| p
 | ||
|     |  Tokenization is the task of splitting a text into meaningful segments,
 | ||
|     |  called #[em tokens].  The input to the tokenizer is a unicode text, and
 | ||
|     |  the output is a #[+api("doc") #[code Doc]] object. To construct a
 | ||
|     |  #[code Doc] object, you need a #[+api("vocab") #[code Vocab]] instance,
 | ||
|     |  a sequence of #[code word] strings, and optionally a sequence of
 | ||
|     |  #[code spaces] booleans, which allow you to maintain alignment of the
 | ||
|     |  tokens into the original string.
 | ||
| 
 | ||
| include ../_spacy-101/_tokenization
 | ||
| 
 | ||
| +h(4, "101-data") Tokenizer data
 | ||
| 
 | ||
| p
 | ||
|     |  #[strong Global] and #[strong language-specific] tokenizer data is
 | ||
|     |  supplied via the language data in
 | ||
|     |  #[+src(gh("spaCy", "spacy/lang")) #[code spacy/lang]].
 | ||
|     |  The tokenizer exceptions define special cases like "don't" in English,
 | ||
|     |  which needs to be split into two tokens: #[code {ORTH: "do"}] and
 | ||
|     |  #[code {ORTH: "n't", LEMMA: "not"}]. The prefixes, suffixes and infixes
 | ||
|     |  mostly define punctuation rules – for example, when to split off periods
 | ||
|     |  (at the end of a sentence), and when to leave tokens containing periods
 | ||
|     |  intact (abbreviations like "U.S.").
 | ||
| 
 | ||
| +graphic("/assets/img/language_data.svg")
 | ||
|     include ../../assets/img/language_data.svg
 | ||
| 
 | ||
| +infobox
 | ||
|     |  For more details on the language-specific data, see the
 | ||
|     |  usage guide on #[+a("/usage/adding-languages") adding languages].
 | ||
| 
 | ||
| +h(3, "special-cases") Adding special case tokenization rules
 | ||
| 
 | ||
| p
 | ||
|     |  Most domains have at least some idiosyncrasies that require custom
 | ||
|     |  tokenization rules. This could be very specific expressions, or
 | ||
|     |  abbreviations only used in this specific field.
 | ||
| 
 | ||
| +aside("Language data vs. custom tokenization")
 | ||
|     |  Tokenization rules that are specific to one language, but can be
 | ||
|     |  #[strong generalised across that language] should ideally live in the
 | ||
|     |  language data in #[+src(gh("spaCy", "spacy/lang")) #[code spacy/lang]] – we
 | ||
|     |  always appreciate pull requests! Anything that's specific to a domain or
 | ||
|     |  text type – like financial trading abbreviations, or Bavarian youth slang
 | ||
|     |  – should be added as a special case rule to your tokenizer instance. If
 | ||
|     |  you're dealing with a lot of customisations, it might make sense to create
 | ||
|     |  an entirely custom subclass.
 | ||
| 
 | ||
| p
 | ||
|     |  Here's how to add a special case rule to an existing
 | ||
|     |  #[+api("tokenizer") #[code Tokenizer]] instance:
 | ||
| 
 | ||
| +code-exec.
 | ||
|     import spacy
 | ||
|     from spacy.symbols import ORTH, LEMMA, POS, TAG
 | ||
| 
 | ||
|     nlp = spacy.load('en_core_web_sm')
 | ||
|     doc = nlp(u'gimme that')  # phrase to tokenize
 | ||
|     print([w.text for w in doc])  # ['gimme', 'that']
 | ||
| 
 | ||
|     # add special case rule
 | ||
|     special_case = [{ORTH: u'gim', LEMMA: u'give', POS: u'VERB'}, {ORTH: u'me'}]
 | ||
|     nlp.tokenizer.add_special_case(u'gimme', special_case)
 | ||
| 
 | ||
|     # check new tokenization
 | ||
|     print([w.text for w in nlp(u'gimme that')])  # ['gim', 'me', 'that']
 | ||
| 
 | ||
|     # Pronoun lemma is returned as -PRON-!
 | ||
|     print([w.lemma_ for w in nlp(u'gimme that')])  # ['give', '-PRON-', 'that']
 | ||
| 
 | ||
| p
 | ||
|     |  For details on spaCy's custom pronoun lemma #[code -PRON-],
 | ||
|     |  #[+a("/usage/#pron-lemma") see here].
 | ||
|     |  The special case doesn't have to match an entire whitespace-delimited
 | ||
|     |  substring. The tokenizer will incrementally split off punctuation, and
 | ||
|     |  keep looking up the remaining substring:
 | ||
| 
 | ||
| +code.
 | ||
|     assert 'gimme' not in [w.text for w in nlp(u'gimme!')]
 | ||
|     assert 'gimme' not in [w.text for w in nlp(u'("...gimme...?")')]
 | ||
| 
 | ||
| p
 | ||
|     |  The special case rules have precedence over the punctuation splitting:
 | ||
| 
 | ||
| +code.
 | ||
|     special_case = [{ORTH: u'...gimme...?', LEMMA: u'give', TAG: u'VB'}]
 | ||
|     nlp.tokenizer.add_special_case(u'...gimme...?', special_case)
 | ||
|     assert len(nlp(u'...gimme...?')) == 1
 | ||
| 
 | ||
| p
 | ||
|     |  Because the special-case rules allow you to set arbitrary token
 | ||
|     |  attributes, such as the part-of-speech, lemma, etc., they make a good
 | ||
|     |  mechanism for arbitrary fix-up rules. Having this logic live in the
 | ||
|     |  tokenizer isn't very satisfying from a design perspective, however, so
 | ||
|     |  the API may eventually be exposed on the
 | ||
|     |  #[+api("language") #[code Language]] class itself.
 | ||
| 
 | ||
| 
 | ||
| +h(3, "how-tokenizer-works") How spaCy's tokenizer works
 | ||
| 
 | ||
| p
 | ||
|     |  spaCy introduces a novel tokenization algorithm that gives a better
 | ||
|     |  balance between performance, ease of definition, and ease of alignment
 | ||
|     |  into the original string.
 | ||
| 
 | ||
| p
 | ||
|     |  After consuming a prefix or suffix, we consult the special cases again.
 | ||
|     |  We want the special cases to handle things like "don't" in English, and
 | ||
|     |  we want the same rule to work for "(don't)!". We do this by splitting
 | ||
|     |  off the open bracket, then the exclamation, then the close bracket, and
 | ||
|     |  finally matching the special-case. Here's an implementation of the
 | ||
|     |  algorithm in Python, optimized for readability rather than performance:
 | ||
| 
 | ||
| +code.
 | ||
|     def tokenizer_pseudo_code(text, special_cases,
 | ||
|                               find_prefix, find_suffix, find_infixes):
 | ||
|         tokens = []
 | ||
|         for substring in text.split(' '):
 | ||
|             suffixes = []
 | ||
|             while substring:
 | ||
|                 if substring in special_cases:
 | ||
|                     tokens.extend(special_cases[substring])
 | ||
|                     substring = ''
 | ||
|                 elif find_prefix(substring) is not None:
 | ||
|                     split = find_prefix(substring)
 | ||
|                     tokens.append(substring[:split])
 | ||
|                     substring = substring[split:]
 | ||
|                 elif find_suffix(substring) is not None:
 | ||
|                     split = find_suffix(substring)
 | ||
|                     suffixes.append(substring[-split:])
 | ||
|                     substring = substring[:-split]
 | ||
|                 elif find_infixes(substring):
 | ||
|                     infixes = find_infixes(substring)
 | ||
|                     offset = 0
 | ||
|                     for match in infixes:
 | ||
|                         tokens.append(substring[offset : match.start()])
 | ||
|                         tokens.append(substring[match.start() : match.end()])
 | ||
|                         offset = match.end()
 | ||
|                     substring = substring[offset:]
 | ||
|                 else:
 | ||
|                     tokens.append(substring)
 | ||
|                     substring = ''
 | ||
|             tokens.extend(reversed(suffixes))
 | ||
|         return tokens
 | ||
| 
 | ||
| p
 | ||
|     |  The algorithm can be summarized as follows:
 | ||
| 
 | ||
| +list("numbers")
 | ||
|     +item Iterate over space-separated substrings
 | ||
|     +item
 | ||
|         |  Check whether we have an explicitly defined rule for this substring.
 | ||
|         |  If we do, use it.
 | ||
|     +item Otherwise, try to consume a prefix.
 | ||
|     +item
 | ||
|         |  If we consumed a prefix, go back to the beginning of the loop, so
 | ||
|         |  that special-cases always get priority.
 | ||
|     +item If we didn't consume a prefix, try to consume a suffix.
 | ||
|     +item
 | ||
|         |  If we can't consume a prefix or suffix, look for "infixes" — stuff
 | ||
|         |  like hyphens etc.
 | ||
|     +item Once we can't consume any more of the string, handle it as a single token.
 | ||
| 
 | ||
| +h(3, "native-tokenizers") Customizing spaCy's Tokenizer class
 | ||
| 
 | ||
| p
 | ||
|     |  Let's imagine you wanted to create a tokenizer for a new language or
 | ||
|     |  specific domain. There are five things you would need to define:
 | ||
| 
 | ||
| +list("numbers")
 | ||
|     +item
 | ||
|         |  A dictionary of #[strong special cases]. This handles things like
 | ||
|         |  contractions, units of measurement, emoticons, certain
 | ||
|         |  abbreviations, etc.
 | ||
| 
 | ||
|     +item
 | ||
|         |  A function #[code prefix_search], to handle
 | ||
|         |  #[strong preceding punctuation], such as open quotes, open brackets,
 | ||
|         |  etc.
 | ||
| 
 | ||
|     +item
 | ||
|         |  A function #[code suffix_search], to handle
 | ||
|         |  #[strong succeeding punctuation], such as commas, periods, close
 | ||
|         |  quotes, etc.
 | ||
| 
 | ||
|     +item
 | ||
|         |  A function #[code infix_finditer], to handle non-whitespace
 | ||
|         |  separators, such as hyphens etc.
 | ||
| 
 | ||
|     +item
 | ||
|         |  An optional boolean function #[code token_match] matching strings
 | ||
|         |  that should never be split, overriding the previous rules.
 | ||
|         |  Useful for things like URLs or numbers.
 | ||
| 
 | ||
| p
 | ||
|     |  You shouldn't usually need to create a #[code Tokenizer] subclass.
 | ||
|     |  Standard usage is to use #[code re.compile()] to build a regular
 | ||
|     |  expression object, and pass its #[code .search()] and
 | ||
|     |  #[code .finditer()] methods:
 | ||
| 
 | ||
| +code-exec.
 | ||
|     import re
 | ||
|     import spacy
 | ||
|     from spacy.tokenizer import Tokenizer
 | ||
| 
 | ||
|     prefix_re = re.compile(r'''^[\[\("']''')
 | ||
|     suffix_re = re.compile(r'''[\]\)"']$''')
 | ||
|     infix_re = re.compile(r'''[-~]''')
 | ||
|     simple_url_re = re.compile(r'''^https?://''')
 | ||
| 
 | ||
|     def custom_tokenizer(nlp):
 | ||
|         return Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
 | ||
|                                     suffix_search=suffix_re.search,
 | ||
|                                     infix_finditer=infix_re.finditer,
 | ||
|                                     token_match=simple_url_re.match)
 | ||
| 
 | ||
|     nlp = spacy.load('en_core_web_sm')
 | ||
|     nlp.tokenizer = custom_tokenizer(nlp)
 | ||
|     doc = nlp(u"hello-world.")
 | ||
|     print([t.text for t in doc])
 | ||
| 
 | ||
| p
 | ||
|     |  If you need to subclass the tokenizer instead, the relevant methods to
 | ||
|     |  specialize are #[code find_prefix], #[code find_suffix] and
 | ||
|     |  #[code find_infix].
 | ||
| 
 | ||
| +infobox("Important note", "⚠️")
 | ||
|     |  When customising the prefix, suffix and infix handling, remember that
 | ||
|     |  you're passing in #[strong functions] for spaCy to execute, e.g.
 | ||
|     |  #[code prefix_re.search] – not just the regular expressions. This means
 | ||
|     |  that your functions also need to define how the rules should be applied.
 | ||
|     |  For example, if you're adding your own prefix rules, you need
 | ||
|     |  to make sure they're only applied to characters at the
 | ||
|     |  #[strong beginning of a token], e.g. by adding #[code ^]. Similarly,
 | ||
|     |  suffix rules should only be applied at the #[strong end of a token],
 | ||
|     |  so your expression should end with a #[code $].
 | ||
| 
 | ||
| +h(3, "custom-tokenizer") Hooking an arbitrary tokenizer into the pipeline
 | ||
| 
 | ||
| p
 | ||
|     |  The tokenizer is the first component of the processing pipeline and the
 | ||
|     |  only one that can't be replaced by writing to #[code nlp.pipeline]. This
 | ||
|     |  is because it has a different signature from all the other components:
 | ||
|     |  it takes a text and returns a #[code Doc], whereas all other components
 | ||
|     |  expect to already receive a tokenized #[code Doc].
 | ||
| 
 | ||
| +graphic("/assets/img/pipeline.svg")
 | ||
|     include ../../assets/img/pipeline.svg
 | ||
| 
 | ||
| p
 | ||
|     |  To overwrite the existing tokenizer, you need to replace
 | ||
|     |  #[code nlp.tokenizer] with a custom function that takes a text, and
 | ||
|     |  returns a #[code Doc].
 | ||
| 
 | ||
| +code.
 | ||
|     nlp = spacy.load('en')
 | ||
|     nlp.tokenizer = my_tokenizer
 | ||
| 
 | ||
| +table(["Argument", "Type", "Description"])
 | ||
|     +row
 | ||
|         +cell #[code text]
 | ||
|         +cell unicode
 | ||
|         +cell The raw text to tokenize.
 | ||
| 
 | ||
|     +row("foot")
 | ||
|         +cell returns
 | ||
|         +cell #[code Doc]
 | ||
|         +cell The tokenized document.
 | ||
| 
 | ||
| +infobox("Important note: using a custom tokenizer")
 | ||
|     .o-block
 | ||
|         |  In spaCy v1.x, you had to add a custom tokenizer by passing it to the
 | ||
|         |  #[code make_doc] keyword argument, or by passing a tokenizer "factory"
 | ||
|         |  to #[code create_make_doc]. This was unnecessarily complicated. Since
 | ||
|         |  spaCy v2.0, you can write to #[code nlp.tokenizer] instead. If your
 | ||
|         |  tokenizer needs the vocab, you can write a function and use
 | ||
|         |  #[code nlp.vocab].
 | ||
| 
 | ||
|     +code-new.
 | ||
|         nlp.tokenizer = my_tokenizer
 | ||
|         nlp.tokenizer = my_tokenizer_factory(nlp.vocab)
 | ||
|     +code-old.
 | ||
|         nlp = spacy.load('en', make_doc=my_tokenizer)
 | ||
|         nlp = spacy.load('en', create_make_doc=my_tokenizer_factory)
 | ||
| 
 | ||
| +h(3, "custom-tokenizer-example") Example: A custom whitespace tokenizer
 | ||
| 
 | ||
| p
 | ||
|     |  To construct the tokenizer, we usually want attributes of the #[code nlp]
 | ||
|     |  pipeline. Specifically, we want the tokenizer to hold a reference to the
 | ||
|     |  vocabulary object. Let's say we have the following class as
 | ||
|     |  our tokenizer:
 | ||
| 
 | ||
| +code-exec.
 | ||
|     import spacy
 | ||
|     from spacy.tokens import Doc
 | ||
| 
 | ||
|     class WhitespaceTokenizer(object):
 | ||
|         def __init__(self, vocab):
 | ||
|             self.vocab = vocab
 | ||
| 
 | ||
|         def __call__(self, text):
 | ||
|             words = text.split(' ')
 | ||
|             # All tokens 'own' a subsequent space character in this tokenizer
 | ||
|             spaces = [True] * len(words)
 | ||
|             return Doc(self.vocab, words=words, spaces=spaces)
 | ||
| 
 | ||
|     nlp = spacy.load('en_core_web_sm')
 | ||
|     nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
 | ||
|     doc = nlp(u"What's happened to me? he thought. It wasn't a dream.")
 | ||
|     print([t.text for t in doc])
 | ||
| 
 | ||
| p
 | ||
|     |  As you can see, we need a #[code Vocab] instance to construct this — but
 | ||
|     |  we won't have it until we get back the loaded #[code nlp] object. The
 | ||
|     |  simplest solution is to build the tokenizer in two steps. This also means
 | ||
|     |  that you can reuse the "tokenizer factory" and initialise it with
 | ||
|     |  different instances of #[code Vocab].
 | ||
| 
 | ||
| +h(3, "own-annotations") Bringing your own annotations
 | ||
| 
 | ||
| p
 | ||
|     |  spaCy generally assumes by default that your data is raw text. However,
 | ||
|     |  sometimes your data is partially annotated, e.g. with pre-existing
 | ||
|     |  tokenization, part-of-speech tags, etc. The most common situation is
 | ||
|     |  that you have pre-defined tokenization. If you have a list of strings,
 | ||
|     |  you can create a #[code Doc] object directly. Optionally, you can also
 | ||
|     |  specify a list of boolean values, indicating whether each word has a
 | ||
|     |  subsequent space.
 | ||
| 
 | ||
| +code-exec.
 | ||
|     import spacy
 | ||
|     from spacy.tokens import Doc
 | ||
|     from spacy.lang.en import English
 | ||
| 
 | ||
|     nlp = English()
 | ||
|     doc = Doc(nlp.vocab, words=[u'Hello', u',', u'world', u'!'],
 | ||
|               spaces=[False, True, False, False])
 | ||
|     print([(t.text, t.text_with_ws, t.whitespace_) for t in doc])
 | ||
| 
 | ||
| p
 | ||
|     |  If provided, the spaces list must be the same length as the words list.
 | ||
|     |  The spaces list affects the #[code doc.text], #[code span.text],
 | ||
|     |  #[code token.idx], #[code span.start_char] and #[code span.end_char]
 | ||
|     |  attributes. If you don't provide a #[code spaces] sequence, spaCy will
 | ||
|     |  assume that all words are whitespace delimited.
 | ||
| 
 | ||
| +code-exec.
 | ||
|     import spacy
 | ||
|     from spacy.tokens import Doc
 | ||
|     from spacy.lang.en import English
 | ||
| 
 | ||
|     nlp = English()
 | ||
|     bad_spaces = Doc(nlp.vocab, words=[u'Hello', u',', u'world', u'!'])
 | ||
|     good_spaces = Doc(nlp.vocab, words=[u'Hello', u',', u'world', u'!'],
 | ||
|                       spaces=[False, True, False, False])
 | ||
| 
 | ||
|     print(bad_spaces.text)   # 'Hello , world !'
 | ||
|     print(good_spaces.text)  # 'Hello, world!'
 | ||
| 
 | ||
| p
 | ||
|     |  Once you have a #[+api("doc") #[code Doc]] object, you can write to its
 | ||
|     |  attributes to set the part-of-speech tags, syntactic dependencies, named
 | ||
|     |  entities and other attributes. For details, see the respective usage
 | ||
|     |  pages.
 |