diff --git a/website/docs/usage/_spacy-101/_language-data.jade b/website/docs/usage/_spacy-101/_language-data.jade
new file mode 100644
index 000000000..977a9e2f8
--- /dev/null
+++ b/website/docs/usage/_spacy-101/_language-data.jade
@@ -0,0 +1,101 @@
+//- 💫 DOCS > USAGE > SPACY 101 > LANGUAGE DATA
+
+p
+    | Every language is different – and usually full of
+    | #[strong exceptions and special cases], especially amongst the most
+    | common words. Some of these exceptions are shared across languages, while
+    | others are #[strong entirely specific] – usually so specific that they need
+    | to be hard-coded. The #[+src(gh("spaCy", "spacy/lang")) /lang] module
+    | contains all language-specific data, organised in simple Python files.
+    | This makes the data easy to update and extend.
+
+p
+    | The #[strong shared language data] in the directory root includes rules
+    | that can be generalised across languages – for example, rules for basic
+    | punctuation, emoji, emoticons, single-letter abbreviations and norms for
+    | equivalent tokens with different spellings, like #[code "] and
+    | #[code ”]. This helps the models make more accurate predictions.
+    | The #[strong individual language data] in a submodule contains
+    | rules that are only relevant to a particular language. It also takes
+    | care of putting together all components and creating the #[code Language]
+    | subclass – for example, #[code English] or #[code German].
+
++aside-code.
+    from spacy.lang.en import English
+    from spacy.lang.de import German
+
+    nlp_en = English() # includes English data
+    nlp_de = German() # includes German data
+
++image
+    include ../../../assets/img/docs/language_data.svg
+    .u-text-right
+        +button("/assets/img/docs/language_data.svg", false, "secondary").u-text-tag View large graphic
+
++table(["Name", "Description"])
+    +row
+        +cell #[strong Stop words]#[br]
+            | #[+src(gh("spacy-dev-resources", "templates/new_language/stop_words.py")) stop_words.py]
+        +cell
+            | List of most common words of a language that are often useful to
+            | filter out, for example "and" or "I". Matching tokens will
+            | return #[code True] for #[code is_stop].
+
+    +row
+        +cell #[strong Tokenizer exceptions]#[br]
+            | #[+src(gh("spacy-dev-resources", "templates/new_language/tokenizer_exceptions.py")) tokenizer_exceptions.py]
+        +cell
+            | Special-case rules for the tokenizer, for example, contractions
+            | like "can't" and abbreviations with punctuation, like "U.K.".
+
+    +row
+        +cell #[strong Norm exceptions]#[br]
+            | #[+src(gh("spaCy", "spacy/lang/norm_exceptions.py")) norm_exceptions.py]
+        +cell
+            | Special-case rules for normalising tokens to improve the model's
+            | predictions, for example on American vs. British spelling.
+
+    +row
+        +cell #[strong Punctuation rules]#[br]
+            | #[+src(gh("spaCy", "spacy/lang/punctuation.py")) punctuation.py]
+        +cell
+            | Regular expressions for splitting tokens, e.g. on punctuation or
+            | special characters like emoji. Includes rules for prefixes,
+            | suffixes and infixes.
+
+    +row
+        +cell #[strong Character classes]#[br]
+            | #[+src(gh("spaCy", "spacy/lang/char_classes.py")) char_classes.py]
+        +cell
+            | Character classes to be used in regular expressions, for example,
+            | Latin characters, quotes, hyphens or icons.
+
+    +row
+        +cell #[strong Lexical attributes]#[br]
+            | #[+src(gh("spacy-dev-resources", "templates/new_language/lex_attrs.py")) lex_attrs.py]
+        +cell
+            | Custom functions for setting lexical attributes on tokens, e.g.
+            | #[code like_num], which includes language-specific words like "ten"
+            | or "hundred".
+
+    +row
+        +cell #[strong Lemmatizer]#[br]
+            | #[+src(gh("spacy-dev-resources", "templates/new_language/lemmatizer.py")) lemmatizer.py]
+        +cell
+            | Lemmatization rules or a lookup-based lemmatization table to
+            | assign base forms, for example "be" for "was".
+
+    +row
+        +cell #[strong Tag map]#[br]
+            | #[+src(gh("spacy-dev-resources", "templates/new_language/tag_map.py")) tag_map.py]
+        +cell
+            | Dictionary mapping strings in your tag set to
+            | #[+a("http://universaldependencies.org/u/pos/all.html") Universal Dependencies]
+            | tags.
+
+    +row
+        +cell #[strong Morph rules]#[br]
+            | #[+src(gh("spaCy", "spacy/lang/en/morph_rules.py")) morph_rules.py]
+        +cell
+            | Exception rules for morphological analysis of irregular words like
+            | personal pronouns.
diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade
index c900734d4..90d5668d2 100644
--- a/website/docs/usage/adding-languages.jade
+++ b/website/docs/usage/adding-languages.jade
@@ -26,9 +26,9 @@ p
     | lemmatization and morphological analysis.
 
 +table-of-contents
+    +item #[+a("#101") Language data 101]
     +item #[+a("#language-subclass") The Language subclass]
-    +item #[+a("#language-data") Adding language data]
-    +item #[+a("#stop-workds") Stop words]
+    +item #[+a("#stop-words") Stop words]
     +item #[+a("#tokenizer-exceptions") Tokenizer exceptions]
     +item #[+a("#norm-exceptions") Norm exceptions]
     +item #[+a("#lex-attrs") Lexical attributes]
@@ -49,6 +49,106 @@ p
     | rebuild anything in between – you can simply make edits and reload spaCy
     | to test them.
 
++h(2, "101") Language data 101
+
+include _spacy-101/_language-data
+
+p
+    | The individual components #[strong expose variables] that can be imported
+    | within a language module, and added to the language's #[code Defaults].
+    | Some components, like the punctuation rules, usually don't need much
+    | customisation and can simply be imported from the global rules. Others,
+    | like the tokenizer and norm exceptions, are very specific and will make
+    | a big difference to spaCy's performance on the particular language and
+    | to training a language model.
+
++table(["Variable", "Type", "Description"])
+    +row
+        +cell #[code STOP_WORDS]
+        +cell set
+        +cell Individual words.
+
+    +row
+        +cell #[code TOKENIZER_EXCEPTIONS]
+        +cell dict
+        +cell Keyed by strings, mapped to a list of one dict per token with its attributes.
+
+    +row
+        +cell #[code TOKEN_MATCH]
+        +cell regex
+        +cell Regexes to match complex tokens, e.g. URLs.
+
+    +row
+        +cell #[code NORM_EXCEPTIONS]
+        +cell dict
+        +cell Keyed by strings, mapped to their norms.
+
+    +row
+        +cell #[code TOKENIZER_PREFIXES]
+        +cell list
+        +cell Strings or regexes, usually not customised.
+
+    +row
+        +cell #[code TOKENIZER_SUFFIXES]
+        +cell list
+        +cell Strings or regexes, usually not customised.
+
+    +row
+        +cell #[code TOKENIZER_INFIXES]
+        +cell list
+        +cell Strings or regexes, usually not customised.
+
+    +row
+        +cell #[code LEX_ATTRS]
+        +cell dict
+        +cell Attribute IDs mapped to functions.
+
+    +row
+        +cell #[code LOOKUP]
+        +cell dict
+        +cell Keyed by strings, mapped to their lemma.
+
+    +row
+        +cell #[code LEMMA_RULES], #[code LEMMA_INDEX], #[code LEMMA_EXC]
+        +cell dict
+        +cell Lemmatization rules, keyed by part of speech.
+
+    +row
+        +cell #[code TAG_MAP]
+        +cell dict
+        +cell
+            | Keyed by strings, mapped to
+            | #[+a("http://universaldependencies.org/u/pos/all.html") Universal Dependencies]
+            | tags.
+
+    +row
+        +cell #[code MORPH_RULES]
+        +cell dict
+        +cell Keyed by strings, mapped to a dict of their morphological features.
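+
+p
+    | To make this more concrete, here is a deliberately simplified and
+    | #[strong hypothetical] sketch of what a few of these variables can look
+    | like in a language's data files. Real files are much longer and the
+    | exact entries differ for each language.
+
++code.
+    from spacy.attrs import LIKE_NUM
+    from spacy.symbols import ORTH, LEMMA, NORM
+
+    # stop_words.py – the language's most common words, as a plain set
+    STOP_WORDS = set("a about all and are as at be but by".split())
+
+    # tokenizer_exceptions.py – each string maps to one dict per token
+    TOKENIZER_EXCEPTIONS = {
+        "don't": [
+            {ORTH: "do"},
+            {ORTH: "n't", LEMMA: "not", NORM: "not"}]
+    }
+
+    # lex_attrs.py – attribute IDs mapped to getter functions
+    def like_num(text):
+        return text.isdigit() or text.lower() in ("ten", "hundred")
+
+    LEX_ATTRS = {LIKE_NUM: like_num}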
+
++aside("Should I ever update the global data?")
+    | Reusable language data is collected as atomic pieces in the root of the
+    | #[+src(gh("spaCy", "lang")) spacy.lang] package. Often, when a new
+    | language is added, you'll find a pattern or symbol that's missing. Even
+    | if it isn't common in other languages, it might be best to add it to the
+    | shared language data, unless it has some conflicting interpretation. For
+    | instance, we don't expect to see guillemet quotation symbols
+    | (#[code »] and #[code «]) in English text. But if we do see
+    | them, we'd probably prefer the tokenizer to split them off.
+
++infobox("For languages with non-Latin characters")
+    | In order for the tokenizer to split suffixes, prefixes and infixes, spaCy
+    | needs to know the language's character set. If the language you're adding
+    | uses non-Latin characters, you might need to add the required character
+    | classes to the global
+    | #[+src(gh("spacy", "spacy/lang/char_classes.py")) char_classes.py].
+    | spaCy uses the #[+a("https://pypi.python.org/pypi/regex/") #[code regex] library]
+    | to keep this simple and readable. If the language requires very specific
+    | punctuation rules, you should consider overwriting the default regular
+    | expressions with your own in the language's #[code Defaults].
+
 +h(2, "language-subclass") Creating a #[code Language] subclass
 
 p
@@ -95,7 +195,7 @@ p
     # set default export – this allows the language class to be lazy-loaded
     __all__ = ['Xxxxx']
 
-+aside("Why lazy-loading?")
++infobox("Why lazy-loading?")
     | Some languages contain large volumes of custom data, like lemmatizer
     | loopup tables, or complex regular expression that are expensive to
     | compute. As of spaCy v2.0, #[code Language] classes are not imported on
@@ -105,111 +205,6 @@ p
     | #[+api("util#get_lang_class") #[code util.get_lang_class()]] helper
     | function with the two-letter language code as its argument.
 
-+h(2, "language-data") Adding language data
-
-p
-    | Every language is full of exceptions and special cases, especially
-    | amongst the most common words. Some of these exceptions are shared
-    | between multiple languages, while others are entirely idiosyncratic.
-    | spaCy makes it easy to deal with these exceptions on a case-by-case
-    | basis, by defining simple rules and exceptions. The exceptions data is
-    | defined in Python the
-    | #[+src(gh("spacy-dev-resources", "templates/new_language")) language data],
-    | so that Python functions can be used to help you generalise and combine
-    | the data as you require.
-
-p
-    | Here's an overview of the individual components that can be included
-    | in the language data. For more details on them, see the sections below.
-
-+image
-    include ../../assets/img/docs/language_data.svg
-    .u-text-right
-        +button("/assets/img/docs/language_data.svg", false, "secondary").u-text-tag View large graphic
-
-+table(["File name", "Variables", "Description"])
-    +row
-        +cell #[+src(gh("spacy-dev-resources", "templates/new_language/stop_words.py")) stop_words.py]
-        +cell #[code STOP_WORDS] (set)
-        +cell
-            | List of most common words. Matching tokens will return #[code True]
-            | for #[code is_stop].
-
-    +row
-        +cell #[+src(gh("spacy-dev-resources", "templates/new_language/tokenizer_exceptions.py")) tokenizer_exceptions.py]
-        +cell #[code TOKENIZER_EXCEPTIONS] (dict), #[code TOKEN_MATCH] (regex)
-        +cell
-            | Special-case rules for the tokenizer, for example, contractions
-            | and abbreviations containing punctuation.
-
-    +row
-        +cell #[+src(gh("spaCy", "spacy/lang/norm_exceptions.py")) norm_exceptions.py]
-        +cell
-            | #[code NORM_EXCEPTIONS] (dict)
-        +cell
-            | Special-case rules for normalising tokens and assigning norms,
-            | for example American vs. British spelling.
-
-    +row
-        +cell #[+src(gh("spaCy", "spacy/lang/punctuation.py")) punctuation.py]
-        +cell
-            | #[code TOKENIZER_PREFIXES], #[code TOKENIZER_SUFFIXES],
-            | #[code TOKENIZER_INFIXES] (dicts)
-        +cell Regular expressions for splitting tokens, e.g. on punctuation.
-
-    +row
-        +cell #[+src(gh("spacy-dev-resources", "templates/new_language/lex_attrs.py")) lex_attrs.py]
-        +cell #[code LEX_ATTRS] (dict)
-        +cell
-            | Functions for setting lexical attributes on tokens, e.g.
-            | #[code is_punct] or #[code like_num].
-
-    +row
-        +cell #[+src(gh("spacy-dev-resources", "templates/new_language/lemmatizer.py")) lemmatizer.py]
-        +cell #[code LOOKUP] (dict)
-        +cell
-            | Lookup-based lemmatization table. If more lemmatizer data is
-            | available, it should live in #[code /lemmatizer/lookup.py].
-
-    +row
-        +cell /lemmatizer
-        +cell #[code LEMMA_RULES], #[code LEMMA_INDEX], #[code LEMMA_EXC] (dicts)
-        +cell Lemmatization rules, keyed by part of speech.
-
-    +row
-        +cell #[+src(gh("spacy-dev-resources", "templates/new_language/tag_map.py")) tag_map.py]
-        +cell #[code TAG_MAP] (dict)
-        +cell
-            | Dictionary mapping strings in your tag set to
-            | #[+a("http://universaldependencies.org/u/pos/all.html") Universal Dependencies]
-            | tags.
-
-    +row
-        +cell #[+src(gh()) morph_rules.py]
-        +cell #[code MORPH_RULES] (dict)
-        +cell Exception rules for morphological analysis of irregular words.
-
-+aside("Should I ever update the global data?")
-    | Reuseable language data is collected as atomic pieces in the root of the
-    | #[+src(gh("spaCy", "lang")) spacy.lang] package. Often, when a new
-    | language is added, you'll find a pattern or symbol that's missing. Even
-    | if it isn't common in other languages, it might be best to add it to the
-    | shared language data, unless it has some conflicting interpretation. For
-    | instance, we don't expect to see guillemot quotation symbols
-    | (#[code »] and #[code «]) in English text. But if we do see
-    | them, we'd probably prefer the tokenizer to split them off.
-
-+infobox("For languages with non-latin characters")
-    | In order for the tokenizer to split suffixes, prefixes and infixes, spaCy
-    | needs to know the language's character set. If the language you're adding
-    | uses non-latin characters, you might need to add the required character
-    | classes to the global
-    | #[+src(gh("spacy", "spacy/lang/char_classes.py")) char_classes.py].
-    | spaCy uses the #[+a("https://pypi.python.org/pypi/regex/") #[code regex] library]
-    | to keep this simple and readable. If the language requires very specific
-    | punctuation rules, you should consider overwriting the default regular
-    | expressions with your own in the language's #[code Defaults].
-
 +h(3, "stop-words") Stop words
 
 p
diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade
index 03897600d..4f2642af0 100644
--- a/website/docs/usage/spacy-101.jade
+++ b/website/docs/usage/spacy-101.jade
@@ -44,11 +44,12 @@ p
     +item #[+a("#annotations-token") Tokenization]
     +item #[+a("#annotations-pos-deps") POS tags and dependencies]
     +item #[+a("#annotations-ner") Named entities]
-    +item #[+a("#vectors-similarity") Word vectos and similarity]
+    +item #[+a("#vectors-similarity") Word vectors and similarity]
     +item #[+a("#pipelines") Pipelines]
     +item #[+a("#vocab") Vocab, hashes and lexemes]
     +item #[+a("#serialization") Serialization]
     +item #[+a("#training") Training]
+    +item #[+a("#language-data") Language data]
     +item #[+a("#architecture") Architecture]
     +item #[+a("#community") Community & FAQ]
 
@@ -255,6 +256,16 @@ p
 include _spacy-101/_training
     | see the usage guides on #[+a("/docs/usage/training") training] and
     | #[+a("/docs/usage/training-ner") training the named entity recognizer].
 
++h(2, "language-data") Language data
+
+include _spacy-101/_language-data
+
++infobox
+    | To learn more about the individual components of the language data and
+    | how to #[strong add a new language] to spaCy in preparation for training
+    | a language model, see the usage guide on
+    | #[+a("/docs/usage/adding-languages") adding languages].
+
 +h(2, "architecture") Architecture
 
 +under-construction