mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-26 05:31:15 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			665 lines
		
	
	
		
			28 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			665 lines
		
	
	
		
			28 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| //- 💫 DOCS > USAGE > ADDING LANGUAGES
 | ||
| 
 | ||
| include ../../_includes/_mixins
 | ||
| 
 | ||
| p
 | ||
|     |  Adding full support for a language touches many different parts of the
 | ||
|     |  spaCy library. This guide explains how to fit everything together, and
 | ||
|     |  points you to the specific workflows for each component. Obviously,
 | ||
|     |  there are lots of ways you can organise your code when you implement
 | ||
|     |  your own #[+api("language") #[code Language]] class. This guide will
 | ||
|     |  focus on how it's done within spaCy. For full language support, we'll
 | ||
|     |  need to:
 | ||
| 
 | ||
| +list("numbers")
 | ||
|     +item
 | ||
|         |  Create a #[strong #[code Language] subclass].
 | ||
|     +item
 | ||
|         |  Define custom #[strong language data], like a stop list and tokenizer
 | ||
|         |  exceptions.
 | ||
|     +item
 | ||
|         |  #[strong Test] the new language tokenizer.
 | ||
|     +item
 | ||
|         |  #[strong Build the vocabulary], including word frequencies, Brown
 | ||
|         |  clusters and word vectors.
 | ||
|     +item
 | ||
|         |  Set up a #[strong model directory] and #[strong train] the tagger and
 | ||
|         |  parser.
 | ||
| 
 | ||
| p
 | ||
|     |  For some languages, you may also want to develop a solution for
 | ||
|     |  lemmatization and morphological analysis.
 | ||
| 
 | ||
| +h(2, "language-subclass") Creating a #[code Language] subclass
 | ||
| 
 | ||
| p
 | ||
|     |  Language-specific code and resources should be organised into a
 | ||
|     |  subpackage of spaCy, named according to the language's
 | ||
|     |  #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code].
 | ||
|     |  For instance, code and resources specific to Spanish are placed into a
 | ||
|     |  directory #[code spacy/lang/es], which can be imported as
 | ||
|     |  #[code spacy.lang.es].
 | ||
| 
 | ||
| p
 | ||
|     |  To get started, you can use our
 | ||
|     |  #[+src(gh("spacy-dev-resources", "templates/new_language")) templates]
 | ||
|     |  for the most important files. Here's what the class template looks like:
 | ||
| 
 | ||
| +code("__init__.py (excerpt)").
 | ||
|     # import language-specific data
 | ||
|     from .stop_words import STOP_WORDS
 | ||
|     from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 | ||
|     from .lex_attrs import LEX_ATTRS
 | ||
| 
 | ||
|     from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | ||
|     from ...language import Language
 | ||
|     from ...attrs import LANG
 | ||
|     from ...util import update_exc
 | ||
| 
 | ||
|     # create Defaults class in the module scope (necessary for pickling!)
 | ||
|     class XxxxxDefaults(Language.Defaults):
 | ||
|         lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | ||
|         lex_attr_getters[LANG] = lambda text: 'xx' # language ISO code
 | ||
| 
 | ||
|         # optional: replace flags with custom functions, e.g. like_num()
 | ||
|         lex_attr_getters.update(LEX_ATTRS)
 | ||
| 
 | ||
|         # merge base exceptions and custom tokenizer exceptions
 | ||
|         tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
 | ||
|         stop_words = set(STOP_WORDS)
 | ||
| 
 | ||
|     # create actual Language class
 | ||
|     class Xxxxx(Language):
 | ||
|         lang = 'xx' # language ISO code
 | ||
|         Defaults = XxxxxDefaults # override defaults
 | ||
| 
 | ||
|     # set default export – this allows the language class to be lazy-loaded
 | ||
|     __all__ = ['Xxxxx']
 | ||
| 
 | ||
| +aside("Why lazy-loading?")
 | ||
|     |  Some languages contain large volumes of custom data, like lemmatizer
 | ||
|     |  lookup tables, or complex regular expressions that are expensive to
 | ||
|     |  compute. As of spaCy v2.0, #[code Language] classes are not imported on
 | ||
|     |  initialisation and are only loaded when you import them directly, or load
 | ||
|     |  a model that requires a language to be loaded. To lazy-load languages in
 | ||
|     |  your application, you can use the #[code util.get_lang_class()] helper
 | ||
|     |  function with the two-letter language code as its argument.
 | ||
| 
 | ||
| +h(2, "language-data") Adding language data
 | ||
| 
 | ||
| p
 | ||
|     |  Every language is full of exceptions and special cases, especially
 | ||
|     |  amongst the most common words. Some of these exceptions are shared
 | ||
|     |  between multiple languages, while others are entirely idiosyncratic.
 | ||
|     |  spaCy makes it easy to deal with these exceptions on a case-by-case
 | ||
|     |  basis, by defining simple rules and exceptions. The exceptions data is
 | ||
|     |  defined in Python in the
 | ||
|     |  #[+src(gh("spacy-dev-resources", "templates/new_language")) language data],
 | ||
|     |  so that Python functions can be used to help you generalise and combine
 | ||
|     |  the data as you require.
 | ||
| 
 | ||
| p
 | ||
|     |  Here's an overview of the individual components that can be included
 | ||
|     |  in the language data. For more details on them, see the sections below.
 | ||
| 
 | ||
| +image
 | ||
|     include ../../assets/img/docs/language_data.svg
 | ||
|     .u-text-right
 | ||
|         +button("/assets/img/docs/language_data.svg", false, "secondary").u-text-tag View large graphic
 | ||
| 
 | ||
| +table(["File name", "Variables", "Description"])
 | ||
|     +row
 | ||
|         +cell #[+src(gh("spacy-dev-resources", "templates/new_language/stop_words.py")) stop_words.py]
 | ||
|         +cell #[code STOP_WORDS] (set)
 | ||
|         +cell
 | ||
|             |  List of most common words. Matching tokens will return #[code True]
 | ||
|             |  for #[code is_stop].
 | ||
| 
 | ||
|     +row
 | ||
|         +cell #[+src(gh("spacy-dev-resources", "templates/new_language/tokenizer_exceptions.py")) tokenizer_exceptions.py]
 | ||
|         +cell #[code TOKENIZER_EXCEPTIONS] (dict), #[code TOKEN_MATCH] (regex)
 | ||
|         +cell
 | ||
|             |  Special-case rules for the tokenizer, for example, contractions
 | ||
|             |  and abbreviations containing punctuation.
 | ||
| 
 | ||
|     +row
 | ||
|         +cell #[+src(gh("spaCy", "spacy/lang/punctuation.py")) punctuation.py]
 | ||
|         +cell
 | ||
|             |  #[code TOKENIZER_PREFIXES], #[code TOKENIZER_SUFFIXES],
 | ||
|             |  #[code TOKENIZER_INFIXES] (dicts)
 | ||
|         +cell Regular expressions for splitting tokens, e.g. on punctuation.
 | ||
| 
 | ||
|     +row
 | ||
|         +cell #[+src(gh("spacy-dev-resources", "templates/new_language/lex_attrs.py")) lex_attrs.py]
 | ||
|         +cell #[code LEX_ATTRS] (dict)
 | ||
|         +cell
 | ||
|             |  Functions for setting lexical attributes on tokens, e.g.
 | ||
|             |  #[code is_punct] or #[code like_num].
 | ||
| 
 | ||
|     +row
 | ||
|         +cell #[+src(gh("spacy-dev-resources", "templates/new_language/lemmatizer.py")) lemmatizer.py]
 | ||
|         +cell #[code LOOKUP] (dict)
 | ||
|         +cell
 | ||
|             |  Lookup-based lemmatization table. If more lemmatizer data is
 | ||
|             |  available, it should live in #[code /lemmatizer/lookup.py].
 | ||
| 
 | ||
|     +row
 | ||
|         +cell /lemmatizer
 | ||
|         +cell #[code LEMMA_RULES], #[code LEMMA_INDEX], #[code LEMMA_EXC] (dicts)
 | ||
|         +cell Lemmatization rules, keyed by part of speech.
 | ||
| 
 | ||
|     +row
 | ||
|         +cell #[+src(gh("spacy-dev-resources", "templates/new_language/tag_map.py")) tag_map.py]
 | ||
|         +cell #[code TAG_MAP] (dict)
 | ||
|         +cell
 | ||
|             |  Dictionary mapping strings in your tag set to
 | ||
|             |  #[+a("http://universaldependencies.org/u/pos/all.html") Universal Dependencies]
 | ||
|             |  tags.
 | ||
| 
 | ||
|     +row
 | ||
|         +cell #[+src(gh()) morph_rules.py]
 | ||
|         +cell #[code MORPH_RULES] (dict)
 | ||
|         +cell Exception rules for morphological analysis of irregular words.
 | ||
| 
 | ||
| +aside("Should I ever update the global data?")
 | ||
|     |  Reusable language data is collected as atomic pieces in the root of the
 | ||
|     |  #[+src(gh("spaCy", "spacy/lang")) spacy.lang] package. Often, when a new
 | ||
|     |  language is added, you'll find a pattern or symbol that's missing. Even
 | ||
|     |  if it isn't common in other languages, it might be best to add it to the
 | ||
|     |  shared language data, unless it has some conflicting interpretation. For
 | ||
|     |  instance, we don't expect to see guillemet quotation symbols
 | ||
|     |  (#[code »] and #[code «]) in English text. But if we do see
 | ||
|     |  them, we'd probably prefer the tokenizer to split them off.
 | ||
| 
 | ||
| +infobox("For languages with non-latin characters")
 | ||
|     |  In order for the tokenizer to split suffixes, prefixes and infixes, spaCy
 | ||
|     |  needs to know the language's character set. If the language you're adding
 | ||
|     |  uses non-latin characters, you might need to add the required character
 | ||
|     |  classes to the global
 | ||
|     |  #[+src(gh("spaCy", "spacy/lang/char_classes.py")) char_classes.py].
 | ||
|     |  spaCy uses the #[+a("https://pypi.python.org/pypi/regex/") #[code regex] library]
 | ||
|     |  to keep this simple and readable. If the language requires very specific
 | ||
|     |  punctuation rules, you should consider overwriting the default regular
 | ||
|     |  expressions with your own in the language's #[code Defaults].
 | ||
| 
 | ||
| +h(3, "stop-words") Stop words
 | ||
| 
 | ||
| p
 | ||
|     |  A #[+a("https://en.wikipedia.org/wiki/Stop_words") "stop list"] is a
 | ||
|     |  classic trick from the early days of information retrieval when search
 | ||
|     |  was largely about keyword presence and absence. It is still sometimes
 | ||
|     |  useful today to filter out common words from a bag-of-words model. To
 | ||
|     |  improve readability, #[code STOP_WORDS] are separated by spaces and
 | ||
|     |  newlines, and added as a multiline string.
 | ||
| 
 | ||
| +aside("What does spaCy consider a stop word?")
 | ||
|     |  There's no particularly principled logic behind what words should be
 | ||
|     |  added to the stop list. Make a list that you think might be useful
 | ||
|     |  to people and is likely to be unsurprising. As a rule of thumb, words
 | ||
|     |  that are very rare are unlikely to be useful stop words.
 | ||
| 
 | ||
| +code("Example").
 | ||
|     STOP_WORDS = set("""
 | ||
|     a about above across after afterwards again against all almost alone along
 | ||
|     already also although always am among amongst amount an and another any anyhow
 | ||
|     anyone anything anyway anywhere are around as at
 | ||
| 
 | ||
|     back be became because become becomes becoming been before beforehand behind
 | ||
|     being below beside besides between beyond both bottom but by
 | ||
|     """.split())
 | ||
| 
 | ||
| +infobox("Important note")
 | ||
|     |  When adding stop words from an online source, always #[strong include the link]
 | ||
|     |  in a comment. Make sure to #[strong proofread] and double-check the words
 | ||
|     |  carefully. A lot of the lists available online have been passed around
 | ||
|     |  for years and often contain mistakes, like unicode errors or random words
 | ||
|     |  that have once been added for a specific use case, but don't actually
 | ||
|     |  qualify.
 | ||
| 
 | ||
| +h(3, "tokenizer-exceptions") Tokenizer exceptions
 | ||
| 
 | ||
| p
 | ||
|     |  spaCy's #[+a("/docs/usage/customizing-tokenizer#how-tokenizer-works") tokenization algorithm]
 | ||
|     |  lets you deal with whitespace-delimited chunks separately. This makes it
 | ||
|     |  easy to define special-case rules, without worrying about how they
 | ||
|     |  interact with the rest of the tokenizer. Whenever the key string is
 | ||
|     |  matched, the special-case rule is applied, giving the defined sequence of
 | ||
|     |  tokens. You can also attach attributes to the subtokens, covered by your
 | ||
|     |  special case, such as the subtokens' #[code LEMMA] or #[code TAG].
 | ||
| 
 | ||
| p
 | ||
|     |  Tokenizer exceptions can be added in the following format:
 | ||
| 
 | ||
| +code("tokenizer_exceptions.py (excerpt)").
 | ||
|     TOKENIZER_EXCEPTIONS = {
 | ||
|         "don't": [
 | ||
|             {ORTH: "do", LEMMA: "do"},
 | ||
|             {ORTH: "n't", LEMMA: "not", TAG: "RB"}]
 | ||
|     }
 | ||
| 
 | ||
| +infobox("Important note")
 | ||
|     |  If an exception consists of more than one token, the #[code ORTH] values
 | ||
|     |  combined always need to #[strong match the original string]. The way the
 | ||
|     |  original string is split up can be pretty arbitrary sometimes – for
 | ||
|     |  example "gonna" is split into "gon" (lemma "go") and "na" (lemma "to").
 | ||
|     |  Because of how the tokenizer works, it's currently not possible to split
 | ||
|     |  single-letter strings into multiple tokens.
 | ||
| 
 | ||
| p
 | ||
|     |  Unambiguous abbreviations, like month names or locations in English,
 | ||
|     |  should be added to exceptions with a lemma assigned, for example
 | ||
|     |  #[code {ORTH: "Jan.", LEMMA: "January"}]. Since the exceptions are
 | ||
|     |  added in Python, you can use custom logic to generate them more
 | ||
|     |  efficiently and make your data less verbose. How you do this ultimately
 | ||
|     |  depends on the language. Here's an example of how exceptions for time
 | ||
|     |  formats like "1a.m." and "1am" are generated in the English
 | ||
|     |  #[+src(gh("spaCy", "spacy/lang/en/tokenizer_exceptions.py")) tokenizer_exceptions.py]:
 | ||
| 
 | ||
| +code("tokenizer_exceptions.py (excerpt)").
 | ||
|     # use short, internal variable for readability
 | ||
|     _exc = {}
 | ||
| 
 | ||
|     for h in range(1, 12 + 1):
 | ||
|         for period in ["a.m.", "am"]:
 | ||
|             # always keep an eye on string interpolation!
 | ||
|             _exc["%d%s" % (h, period)] = [
 | ||
|                 {ORTH: "%d" % h},
 | ||
|                 {ORTH: period, LEMMA: "a.m."}]
 | ||
|         for period in ["p.m.", "pm"]:
 | ||
|             _exc["%d%s" % (h, period)] = [
 | ||
|                 {ORTH: "%d" % h},
 | ||
|                 {ORTH: period, LEMMA: "p.m."}]
 | ||
| 
 | ||
|     # only declare this at the bottom
 | ||
|     TOKENIZER_EXCEPTIONS = dict(_exc)
 | ||
| 
 | ||
| +aside("Generating tokenizer exceptions")
 | ||
|     |  Keep in mind that generating exceptions only makes sense if there's a
 | ||
|     |  clearly defined and #[strong finite number] of them, like common
 | ||
|     |  contractions in English. This is not always the case – in Spanish for
 | ||
|     |  instance, infinitive or imperative reflexive verbs and pronouns are one
 | ||
|     |  token (e.g. "vestirme"). In cases like this, spaCy shouldn't be
 | ||
|     |  generating exceptions for #[em all verbs]. Instead, this will be handled
 | ||
|     |  at a later stage during lemmatization.
 | ||
| 
 | ||
| p
 | ||
|     |  When adding the tokenizer exceptions to the #[code Defaults], you can use
 | ||
|     |  the #[code update_exc()] helper function to merge them with the global
 | ||
|     |  base exceptions (including one-letter abbreviations and emoticons).
 | ||
|     |  The function performs a basic check to make sure exceptions are
 | ||
|     |  provided in the correct format. It can take any number of exceptions
 | ||
|     |  dicts as its arguments, and will update and overwrite the exception in
 | ||
|     |  this order. For example, if your language's tokenizer exceptions include
 | ||
|     |  a custom tokenization pattern for "a.", it will overwrite the base
 | ||
|     |  exceptions with the language's custom one.
 | ||
| 
 | ||
| +code("Example").
 | ||
|     from ...util import update_exc
 | ||
| 
 | ||
|     BASE_EXCEPTIONS =  {"a.": [{ORTH: "a."}], ":)": [{ORTH: ":)"}]}
 | ||
|     TOKENIZER_EXCEPTIONS = {"a.": [{ORTH: "a.", LEMMA: "all"}]}
 | ||
| 
 | ||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
 | ||
|     # {"a.": [{ORTH: "a.", LEMMA: "all"}], ":)": [{ORTH: ":)"}]}
 | ||
| 
 | ||
| //-+aside("About spaCy's custom pronoun lemma")
 | ||
|     |  Unlike verbs and common nouns, there's no clear base form of a personal
 | ||
|     |  pronoun. Should the lemma of "me" be "I", or should we normalize person
 | ||
|     |  as well, giving "it" — or maybe "he"? spaCy's solution is to introduce a
 | ||
|     |  novel symbol, #[code.u-nowrap -PRON-], which is used as the lemma for
 | ||
|     |  all personal pronouns.
 | ||
| 
 | ||
| +h(3, "lex-attrs") Lexical attributes
 | ||
| 
 | ||
| p
 | ||
|     |  spaCy provides a range of #[+api("token#attributes") #[code Token] attributes]
 | ||
|     |  that return useful information on that token – for example, whether it's
 | ||
|     |  uppercase or lowercase, a left or right punctuation mark, or whether it
 | ||
|     |  resembles a number or email address. Most of these functions, like
 | ||
|     |  #[code is_lower] or #[code like_url] should be language-independent.
 | ||
|     |  Others, like #[code like_num] (which includes both digits and number
 | ||
|     |  words), require some customisation.
 | ||
| 
 | ||
| +aside("Best practices")
 | ||
|     |  Keep in mind that those functions are only intended to be an approximation.
 | ||
|     |  It's always better to prioritise simplicity and performance over covering
 | ||
|     |  very specific edge cases.#[br]#[br]
 | ||
|     |  English number words are pretty simple, because even large numbers
 | ||
|     |  consist of individual tokens, and we can get away with splitting and
 | ||
|     |  matching strings against a list. In other languages, like German, "two
 | ||
|     |  hundred and thirty-four" is one word, and thus one token. Here, it's best
 | ||
|     |  to match a string against a list of number word fragments (instead of a
 | ||
|     |  technically almost infinite list of possible number words).
 | ||
| 
 | ||
| p
 | ||
|     |  Here's an example from the English
 | ||
|     |  #[+src(gh("spaCy", "spacy/lang/en/lex_attrs.py")) lex_attrs.py]:
 | ||
| 
 | ||
| +code("lex_attrs.py").
 | ||
|     _num_words = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
 | ||
|                   'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen',
 | ||
|                   'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty',
 | ||
|                   'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety',
 | ||
|                   'hundred', 'thousand', 'million', 'billion', 'trillion', 'quadrillion',
 | ||
|                   'gajillion', 'bazillion']
 | ||
| 
 | ||
|     def like_num(text):
 | ||
|         text = text.replace(',', '').replace('.', '')
 | ||
|         if text.isdigit():
 | ||
|             return True
 | ||
|         if text.count('/') == 1:
 | ||
|             num, denom = text.split('/')
 | ||
|             if num.isdigit() and denom.isdigit():
 | ||
|                 return True
 | ||
|         if text in _num_words:
 | ||
|             return True
 | ||
|         return False
 | ||
| 
 | ||
|     LEX_ATTRS = {
 | ||
|         LIKE_NUM: like_num
 | ||
|     }
 | ||
| 
 | ||
| p
 | ||
|     |  By updating the default lexical attributes with a custom #[code LEX_ATTRS]
 | ||
|     |  dictionary in the language's defaults via
 | ||
|     |  #[code lex_attr_getters.update(LEX_ATTRS)], only the new custom functions
 | ||
|     |  are overwritten.
 | ||
| 
 | ||
| +h(3, "lemmatizer") Lemmatizer
 | ||
| 
 | ||
| p
 | ||
|     |  As of v2.0, spaCy supports simple lookup-based lemmatization. This is
 | ||
|     |  usually the quickest and easiest way to get started. The data is stored
 | ||
|     |  in a dictionary mapping a string to its lemma. To determine a token's
 | ||
|     |  lemma, spaCy simply looks it up in the table. Here's an example from
 | ||
|     |  the Spanish language data:
 | ||
| 
 | ||
| +code("lang/es/lemmatizer.py (excerpt)").
 | ||
|     LOOKUP = {
 | ||
|         "aba": "abar",
 | ||
|         "ababa": "abar",
 | ||
|         "ababais": "abar",
 | ||
|         "ababan": "abar",
 | ||
|         "ababanes": "ababán",
 | ||
|         "ababas": "abar",
 | ||
|         "ababoles": "ababol",
 | ||
|         "ababábites": "ababábite"
 | ||
|     }
 | ||
| 
 | ||
| p
 | ||
|     |  To add a lookup lemmatizer to your language, import the #[code LOOKUP]
 | ||
|     |  table and #[code Lemmatizer], and create a new classmethod:
 | ||
| 
 | ||
| 
 | ||
| +code("__init__.py (excerpt)").
 | ||
|     # other imports here, plus lookup table and lookup lemmatizer
 | ||
|     from .lemmatizer import LOOKUP
 | ||
|     from ...lemmatizerlookup import Lemmatizer
 | ||
| 
 | ||
|     class Xxxxx(Language):
 | ||
|         lang = 'xx'
 | ||
| 
 | ||
|         class Defaults(Language.Defaults):
 | ||
|             # other language defaults here
 | ||
| 
 | ||
|             @classmethod
 | ||
|             def create_lemmatizer(cls, nlp=None):
 | ||
|                 return Lemmatizer(LOOKUP)
 | ||
| 
 | ||
| +h(3, "tag-map") Tag map
 | ||
| 
 | ||
| p
 | ||
|     |  Most treebanks define a custom part-of-speech tag scheme, striking a
 | ||
|     |  balance between level of detail and ease of prediction.  While it's
 | ||
|     |  useful to have custom tagging schemes, it's also useful to have a common
 | ||
|     |  scheme, to which the more specific tags can be related. The tagger can
 | ||
|     |  learn a tag scheme with any arbitrary symbols. However, you need to
 | ||
|     |  define how those symbols map down to the
 | ||
|     |  #[+a("http://universaldependencies.org/u/pos/all.html") Universal Dependencies tag set].
 | ||
|     |  This is done by providing a tag map.
 | ||
| 
 | ||
| p
 | ||
|     |  The keys of the tag map should be #[strong strings in your tag set]. The
 | ||
|     |  values should be a dictionary. The dictionary must have an entry POS
 | ||
|     |  whose value is one of the
 | ||
|     |  #[+a("http://universaldependencies.org/u/pos/all.html") Universal Dependencies]
 | ||
|     |  tags. Optionally, you can also include morphological features or other
 | ||
|     |  token attributes in the tag map as well. This allows you to do simple
 | ||
|     |  #[+a("/docs/usage/pos-tagging#rule-based-morphology") rule-based morphological analysis].
 | ||
| 
 | ||
| +code("Example").
 | ||
|     from ..symbols import POS, NOUN, VERB, DET
 | ||
| 
 | ||
|     TAG_MAP = {
 | ||
|         "NNS":  {POS: NOUN, "Number": "plur"},
 | ||
|         "VBG":  {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"},
 | ||
|         "DT":   {POS: DET}
 | ||
|     }
 | ||
| 
 | ||
| +h(3, "morph-rules") Morph rules
 | ||
| 
 | ||
| +under-construction
 | ||
| 
 | ||
| +h(2, "testing") Testing the new language tokenizer
 | ||
| 
 | ||
| p
 | ||
|     |  Before using the new language or submitting a
 | ||
|     |  #[+a(gh("spaCy") + "/pulls") pull request] to spaCy, you should make sure
 | ||
|     |  it works as expected. This is especially important if you've added custom
 | ||
|     |  regular expressions for token matching or punctuation – you don't want to
 | ||
|     |  be causing regressions.
 | ||
| 
 | ||
| +aside("spaCy's test suite")
 | ||
|     |  spaCy uses the #[+a("https://docs.pytest.org/en/latest/") pytest framework]
 | ||
|     |  for testing. For more details on how the tests are structured and best
 | ||
|     |  practices for writing your own tests, see our
 | ||
|     |  #[+a(gh("spaCy", "spacy/tests")) tests documentation].
 | ||
| 
 | ||
| +h(3, "testing-tokenizer") Testing the basic tokenizer
 | ||
| 
 | ||
| p
 | ||
|     |  The easiest way to test your new tokenizer is to run the
 | ||
|     |  language-independent "tokenizer sanity" tests located in
 | ||
|     |  #[+src(gh("spaCy", "spacy/tests/tokenizer")) tests/tokenizer]. This will
 | ||
|     |  test for basic behaviours like punctuation splitting, URL matching and
 | ||
|     |  correct handling of whitespace. In the
 | ||
|     |  #[+src(gh("spaCy", "spacy/tests/conftest.py")) conftest.py], add the new
 | ||
|     |  language ID to the list of #[code _languages]:
 | ||
| 
 | ||
| +code.
 | ||
|     _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'it', 'nb',
 | ||
|                   'nl', 'pl', 'pt', 'sv', 'xx'] # new language here
 | ||
| 
 | ||
| +aside-code("Global tokenizer test example").
 | ||
|     # use fixture by adding it as an argument
 | ||
|     def test_with_all_languages(tokenizer):
 | ||
|         # will be performed on ALL language tokenizers
 | ||
|         tokens = tokenizer(u'Some text here.')
 | ||
| 
 | ||
| p
 | ||
|     |  The language will now be included in the #[code tokenizer] test fixture,
 | ||
|     |  which is used by the basic tokenizer tests. If you want to add your own
 | ||
|     |  tests that should be run over all languages, you can use this fixture as
 | ||
|     |  an argument of your test function.
 | ||
| 
 | ||
| +h(3, "testing-custom") Writing language-specific tests
 | ||
| 
 | ||
| p
 | ||
|     |  It's recommended to always add at least some tests with examples specific
 | ||
|     |  to the language. Language tests should be located in
 | ||
|     |  #[+src(gh("spaCy", "spacy/tests/lang")) tests/lang] in a directory named
 | ||
|     |  after the language ID. You'll also need to create a fixture for your
 | ||
|     |  tokenizer in the #[+src(gh("spaCy", "spacy/tests/conftest.py")) conftest.py].
 | ||
|     |  Always use the #[code get_lang_class()] helper function within the fixture,
 | ||
|     |  instead of importing the class at the top of the file. This will load the
 | ||
|     |  language data only when it's needed. (Otherwise, #[em all data] would be
 | ||
|     |  loaded every time you run a test.)
 | ||
| 
 | ||
| +code.
 | ||
|     @pytest.fixture
 | ||
|     def en_tokenizer():
 | ||
|         return util.get_lang_class('en').Defaults.create_tokenizer()
 | ||
| 
 | ||
| p
 | ||
|     |  When adding test cases, always
 | ||
|     |  #[+a(gh("spaCy", "spacy/tests#parameters")) #[code parametrize]] them –
 | ||
|     |  this will make it easier for others to add more test cases without having
 | ||
|     |  to modify the test itself. You can also add parameter tuples, for example,
 | ||
|     |  a test sentence and its expected length, or a list of expected tokens.
 | ||
|     |  Here's an example of an English tokenizer test for combinations of
 | ||
|     |  punctuation and abbreviations:
 | ||
| 
 | ||
| +code("Example test").
 | ||
|     @pytest.mark.parametrize('text,length', [
 | ||
|         ("The U.S. Army likes Shock and Awe.", 8),
 | ||
|         ("U.N. regulations are not a part of their concern.", 10),
 | ||
|         ("“Isn't it?”", 6)])
 | ||
|     def test_en_tokenizer_handles_punct_abbrev(en_tokenizer, text, length):
 | ||
|         tokens = en_tokenizer(text)
 | ||
|         assert len(tokens) == length
 | ||
| 
 | ||
| +h(2, "vocabulary") Building the vocabulary
 | ||
| 
 | ||
| p
 | ||
|     |  spaCy expects that common words will be cached in a
 | ||
|     |  #[+api("vocab") #[code Vocab]] instance. The vocabulary caches lexical
 | ||
|     |  features, and makes it easy to use information from unlabelled text
 | ||
|     |  samples in your models. Specifically, you'll usually want to collect
 | ||
|     |  word frequencies, and train two types of distributional similarity model:
 | ||
|     |  Brown clusters, and word vectors. The Brown clusters are used as features
 | ||
|     |  by linear models, while the word vectors are useful for lexical
 | ||
|     |  similarity models and deep learning.
 | ||
| 
 | ||
| +h(3, "word-frequencies") Word frequencies
 | ||
| 
 | ||
| p
 | ||
|     |  To generate the word frequencies from a large, raw corpus, you can use the
 | ||
|     |  #[+src(gh("spacy-dev-resources", "training/word_freqs.py")) word_freqs.py]
 | ||
|     |  script from the spaCy developer resources. Note that your corpus should
 | ||
|     |  not be preprocessed (i.e. you need punctuation for example). The
 | ||
|     |  #[+api("cli#model") #[code model]] command expects a tab-separated word
 | ||
|     |  frequencies file with three columns:
 | ||
| 
 | ||
| +list("numbers")
 | ||
|     +item The number of times the word occurred in your language sample.
 | ||
|     +item The number of distinct documents the word occurred in.
 | ||
|     +item The word itself.
 | ||
| 
 | ||
| p
 | ||
|     |  An example word frequencies file could look like this:
 | ||
| 
 | ||
| +code("es_word_freqs.txt", "text").
 | ||
|     6361109	111	Aunque
 | ||
|     23598543	111	aunque
 | ||
|     10097056	111	claro
 | ||
|     193454	111	aro
 | ||
|     7711123	111	viene
 | ||
|     12812323	111	mal
 | ||
|     23414636	111	momento
 | ||
|     2014580	111	felicidad
 | ||
|     233865	111	repleto
 | ||
|     15527	111	eto
 | ||
|     235565	111	deliciosos
 | ||
|     17259079	111	buena
 | ||
|     71155	111	Anímate
 | ||
|     37705	111	anímate
 | ||
|     33155	111	cuéntanos
 | ||
|     2389171	111	cuál
 | ||
|     961576	111	típico
 | ||
| 
 | ||
| p
 | ||
|     |  You should make sure you use the spaCy tokenizer for your
 | ||
|     |  language to segment the text for your word frequencies. This will ensure
 | ||
|     |  that the frequencies refer to the same segmentation standards you'll be
 | ||
|     |  using at run-time. For instance, spaCy's English tokenizer segments
 | ||
|     |  "can't" into two tokens. If we segmented the text by whitespace to
 | ||
|     |  produce the frequency counts, we'd have incorrect frequency counts for
 | ||
|     |  the tokens "ca" and "n't".
 | ||
| 
 | ||
| +h(3, "brown-clusters") Training the Brown clusters
 | ||
| 
 | ||
| p
 | ||
|     |  spaCy's tagger, parser and entity recognizer are designed to use
 | ||
|     |  distributional similarity features provided by the
 | ||
|     |  #[+a("https://github.com/percyliang/brown-cluster") Brown clustering algorithm].
 | ||
|     |  You should train a model with between 500 and 1000 clusters. A minimum
 | ||
|     |  frequency threshold of 10 usually works well.
 | ||
| 
 | ||
| p
 | ||
|     |  An example clusters file could look like this:
 | ||
| 
 | ||
| +code("es_clusters.data", "text").
 | ||
|     0000	Vestigial	1
 | ||
|     0000	Vesturland	1
 | ||
|     0000	Veyreau	1
 | ||
|     0000	Veynes	1
 | ||
|     0000	Vexilografía	1
 | ||
|     0000	Vetrigne	1
 | ||
|     0000	Vetónica	1
 | ||
|     0000	Asunden	1
 | ||
|     0000	Villalambrús	1
 | ||
|     0000	Vichuquén	1
 | ||
|     0000	Vichtis	1
 | ||
|     0000	Vichigasta	1
 | ||
|     0000	VAAH	1
 | ||
|     0000	Viciebsk	1
 | ||
|     0000	Vicovaro	1
 | ||
|     0000	Villardeveyo	1
 | ||
|     0000	Vidala	1
 | ||
|     0000	Videoguard	1
 | ||
|     0000	Vedás	1
 | ||
|     0000	Videocomunicado	1
 | ||
|     0000	VideoCrypt	1
 | ||
| 
 | ||
| +h(3, "word-vectors") Training the word vectors
 | ||
| 
 | ||
| p
 | ||
|     |  #[+a("https://en.wikipedia.org/wiki/Word2vec") Word2vec] and related
 | ||
|     |  algorithms let you train useful word similarity models from unlabelled
 | ||
|     |  text. This is a key part of using
 | ||
|     |  #[+a("/docs/usage/deep-learning") deep learning] for NLP with limited
 | ||
|     |  labelled data. The vectors are also useful by themselves – they power
 | ||
|     |  the #[code .similarity()] methods in spaCy. For best results, you should
 | ||
|     |  pre-process the text with spaCy before training the Word2vec model. This
 | ||
|     |  ensures your tokenization will match.
 | ||
| 
 | ||
| p
 | ||
|     | You can use our
 | ||
|     |  #[+src(gh("spacy-dev-resources", "training/word_vectors.py")) word vectors training script],
 | ||
|     |  which pre-processes the text with your language-specific tokenizer and
 | ||
|     |  trains the model using #[+a("https://radimrehurek.com/gensim/") Gensim].
 | ||
|     |  The #[code vectors.bin] file should consist of one word and vector per line.
 | ||
| 
 | ||
| //-+aside-code("your_data_directory", "yaml").
 | ||
|     ├── vocab/
 | ||
|     |   ├── lexemes.bin
 | ||
|     |   ├── strings.json
 | ||
|     |   └── oov_prob
 | ||
|     ├── pos/
 | ||
|     |   ├── model
 | ||
|     |   └── config.json
 | ||
|     ├── deps/
 | ||
|     |   ├── model
 | ||
|     |   └── config.json
 | ||
|     └── ner/
 | ||
|         ├── model
 | ||
|         └── config.json
 | ||
| 
 | ||
| +h(2, "train-tagger-parser") Training the tagger and parser
 | ||
| 
 | ||
| p
 | ||
|     |  You can now train the model using a corpus for your language annotated
 | ||
|     |  with #[+a("http://universaldependencies.org/") Universal Dependencies].
 | ||
|     |  If your corpus uses the
 | ||
|     |  #[+a("http://universaldependencies.org/docs/format.html") CoNLL-U] format,
 | ||
|     |  i.e. files with the extension #[code .conllu], you can use the
 | ||
|     |  #[+api("cli#convert") #[code convert]] command to convert it to spaCy's
 | ||
|     |  #[+a("/docs/api/annotation#json-input") JSON format] for training.
 | ||
| 
 | ||
| p
 | ||
|     |  Once you have your UD corpus transformed into JSON, you can train your
 | ||
|     |  model using spaCy's #[+api("cli#train") #[code train]] command:
 | ||
| 
 | ||
| +code(false, "bash").
 | ||
|     python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities]
 |