Resolve lex_attr_getters conflict

Matthew Honnibal 2017-06-03 16:12:01 -05:00
commit 7ca215bc26
17 changed files with 194 additions and 74 deletions

@@ -13,7 +13,7 @@ from thinc import describe
 from thinc.describe import Dimension, Synapses, Biases, Gradient
 from thinc.neural._classes.affine import _set_dimensions_if_needed
-from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP
+from .attrs import ID, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP
 from .tokens.doc import Doc
 import numpy

@@ -131,14 +131,14 @@ class PrecomputableMaxouts(Model):
         return Yfp, backward

 def Tok2Vec(width, embed_size, preprocess=None):
-    cols = [ID, LOWER, PREFIX, SUFFIX, SHAPE]
+    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE]
     with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
-        lower = get_col(cols.index(LOWER)) >> HashEmbed(width, embed_size, name='embed_lower')
+        norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower')
         prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2, name='embed_prefix')
         suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix')
         shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2, name='embed_shape')
-        embed = (lower | prefix | suffix | shape )
+        embed = (norm | prefix | suffix | shape )
         tok2vec = (
             with_flatten(
                 asarray(Model.ops, dtype='uint64')

@@ -148,7 +148,7 @@ def Tok2Vec(width, embed_size, preprocess=None):
                 >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
                 >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
                 >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)),
-                pad=4, ndim=5)
+                pad=4)
         )
         if preprocess not in (False, None):
             tok2vec = preprocess >> tok2vec

@@ -243,7 +243,7 @@ def zero_init(model):
 def doc2feats(cols=None):
-    cols = [ID, LOWER, PREFIX, SUFFIX, SHAPE]
+    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE]
     def forward(docs, drop=0.):
         feats = []
         for doc in docs:
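The hunks above swap the LOWER column for NORM in both Tok2Vec and doc2feats, so the model now embeds each token's norm rather than its lowercase form. A rough sketch of what those feature columns contain, using the public Doc API (the pipeline name and example text are placeholders, and doc2feats' actual body is truncated above):

    # Illustrative only: extract the same attribute columns the model embeds.
    import spacy
    from spacy.attrs import ID, NORM, PREFIX, SUFFIX, SHAPE

    nlp = spacy.load('en')                 # placeholder pipeline
    doc = nlp(u"They're realising it")
    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE]
    feats = doc.to_array(cols)             # one row per token, one column per attribute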

@@ -5,14 +5,16 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...attrs import LANG, NORM
+from ...util import update_exc, add_lookups


 class DanishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'da'
+    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = set(STOP_WORDS)
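Every language file in this commit repeats the same pattern: BASE_NORMS is wired into the NORM getter via add_lookups. A minimal sketch of that helper's behaviour, based on how the documentation changes further down describe it (the real implementation lives in spacy.util and may differ in detail):

    # Sketch: check each lookup table in order, fall back to the default getter.
    def add_lookups(default_func, *lookups):
        def get_attr(string):
            for lookup in lookups:
                if string in lookup:
                    return lookup[string]
            return default_func(string)
        return get_attr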

@@ -20,7 +20,7 @@ class GermanDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'de'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
-                                         BASE_NORMS, NORM_EXCEPTIONS)
+                                         NORM_EXCEPTIONS, BASE_NORMS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     tag_map = dict(TAG_MAP)

@@ -7,15 +7,17 @@ from .stop_words import STOP_WORDS
 from .lemmatizer import LOOKUP

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...lemmatizerlookup import Lemmatizer
-from ...attrs import LANG
-from ...util import update_exc
+from ...attrs import LANG, NORM
+from ...util import update_exc, add_lookups


 class SpanishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'es'
+    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     tag_map = dict(TAG_MAP)

@@ -5,14 +5,16 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...attrs import LANG, NORM
+from ...util import update_exc, add_lookups


 class FinnishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'fi'
+    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = set(STOP_WORDS)

@@ -7,15 +7,17 @@ from .stop_words import STOP_WORDS
 from .lemmatizer import LOOKUP

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...lemmatizerlookup import Lemmatizer
-from ...attrs import LANG
-from ...util import update_exc
+from ...attrs import LANG, NORM
+from ...util import update_exc, add_lookups


 class FrenchDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'fr'
+    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = set(STOP_WORDS)

@@ -7,15 +7,17 @@ from .stop_words import STOP_WORDS
 from .lemmatizer import LOOKUP

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...lemmatizerlookup import Lemmatizer
-from ...attrs import LANG
-from ...util import update_exc
+from ...attrs import LANG, NORM
+from ...util import update_exc, add_lookups


 class HungarianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'hu'
+    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = set(STOP_WORDS)

@@ -5,15 +5,17 @@ from .stop_words import STOP_WORDS
 from .lemmatizer import LOOKUP

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...lemmatizerlookup import Lemmatizer
-from ...attrs import LANG
-from ...util import update_exc
+from ...attrs import LANG, NORM
+from ...util import update_exc, add_lookups


 class ItalianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'it'
+    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
     stop_words = set(STOP_WORDS)

@@ -6,14 +6,16 @@ from .stop_words import STOP_WORDS
 from .morph_rules import MORPH_RULES

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...attrs import LANG, NORM
+from ...util import update_exc, add_lookups


 class NorwegianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'nb'
+    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = set(STOP_WORDS)

@@ -4,14 +4,16 @@ from __future__ import unicode_literals
 from .stop_words import STOP_WORDS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...attrs import LANG, NORM
+from ...util import update_exc, add_lookups


 class DutchDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'nl'
+    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
     stop_words = set(STOP_WORDS)

@@ -4,14 +4,16 @@ from __future__ import unicode_literals
 from .stop_words import STOP_WORDS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...attrs import LANG, NORM
+from ...util import update_exc, add_lookups


 class PolishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'pl'
+    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
     stop_words = set(STOP_WORDS)

@@ -7,15 +7,17 @@ from .lex_attrs import LEX_ATTRS
 from .lemmatizer import LOOKUP

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...lemmatizerlookup import Lemmatizer
-from ...attrs import LANG
-from ...util import update_exc
+from ...attrs import LANG, NORM
+from ...util import update_exc, add_lookups


 class PortugueseDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'pt'
+    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     lex_attr_getters.update(LEX_ATTRS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)

@@ -7,15 +7,17 @@ from .morph_rules import MORPH_RULES
 from .lemmatizer import LEMMA_RULES, LOOKUP

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...lemmatizerlookup import Lemmatizer
-from ...attrs import LANG
-from ...util import update_exc
+from ...attrs import LANG, NORM
+from ...util import update_exc, add_lookups


 class SwedishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'sv'
+    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = set(STOP_WORDS)

@@ -3,14 +3,16 @@ from __future__ import unicode_literals
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...attrs import LANG, NORM
+from ...util import update_exc, add_lookups


 class MultiLanguageDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'xx'
+    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)

@@ -383,3 +383,14 @@ mixin annotation-row(annots, style)
         else
             +cell=cell
     block

+
+//- Table of contents, to be used with +item mixins for links
+    col - [string] width of column (see +grid-col)
+
+mixin table-of-contents(col)
+    +grid-col(col || "half")
+        +infobox
+            +label.o-block-small Table of contents
+            +list("numbers").u-text-small.o-no-block
+                block

@@ -3,32 +3,51 @@
 include ../../_includes/_mixins

 p
     | Adding full support for a language touches many different parts of the
     | spaCy library. This guide explains how to fit everything together, and
-    | points you to the specific workflows for each component. Obviously,
-    | there are lots of ways you can organise your code when you implement
-    | your own #[+api("language") #[code Language]] class. This guide will
-    | focus on how it's done within spaCy. For full language support, we'll
-    | need to:
+    | points you to the specific workflows for each component.

-+list("numbers")
-    +item
-        | Create a #[strong #[code Language] subclass].
-    +item
-        | Define custom #[strong language data], like a stop list and tokenizer
-        | exceptions.
-    +item
-        | #[strong Test] the new language tokenizer.
-    +item
-        | #[strong Build the vocabulary], including word frequencies, Brown
-        | clusters and word vectors.
-    +item
-        | Set up a #[strong model direcory] and #[strong train] the tagger and
-        | parser.
-
-p
-    | For some languages, you may also want to develop a solution for
-    | lemmatization and morphological analysis.
++grid.o-no-block
+    +grid-col("half")
+        p
+            | Obviously, there are lots of ways you can organise your code when
+            | you implement your own language data. This guide will focus on
+            | how it's done within spaCy. For full language support, you'll
+            | need to create a #[code Language] subclass, define custom
+            | #[strong language data], like a stop list and tokenizer
+            | exceptions, and test the new tokenizer. Once the language is set
+            | up, you can #[strong build the vocabulary], including word
+            | frequencies, Brown clusters and word vectors. Finally, you can
+            | #[strong train the tagger and parser], and save the model to a
+            | directory.
+
+        p
+            | For some languages, you may also want to develop a solution for
+            | lemmatization and morphological analysis.
+
+    +table-of-contents
+        +item #[+a("#language-subclass") The Language subclass]
+        +item #[+a("#language-data") Adding language data]
+        +item #[+a("#stop-words") Stop words]
+        +item #[+a("#tokenizer-exceptions") Tokenizer exceptions]
+        +item #[+a("#norm-exceptions") Norm exceptions]
+        +item #[+a("#lex-attrs") Lexical attributes]
+        +item #[+a("#lemmatizer") Lemmatizer]
+        +item #[+a("#tag-map") Tag map]
+        +item #[+a("#morph-rules") Morph rules]
+        +item #[+a("#testing") Testing the tokenizer]
+        +item #[+a("#vocabulary") Building the vocabulary]
+        +item #[+a("#training") Training]
+
++aside("Working on spaCy's source")
+    | To add a new language to spaCy, you'll need to
+    | #[strong modify the library's code]. The easiest way to do this is to
+    | clone the #[+src(gh("spaCy")) repository] and #[strong build spaCy from source].
+    | For more information on this, see the #[+a("/docs/usage") installation guide].
+    | Unlike spaCy's core, which is mostly written in Cython, all language
+    | data is stored in regular Python files. This means that you won't have to
+    | rebuild anything in between; you can simply make edits and reload spaCy
+    | to test them.

 +h(2, "language-subclass") Creating a #[code Language] subclass
@@ -123,6 +142,14 @@ p
             | Special-case rules for the tokenizer, for example, contractions
             | and abbreviations containing punctuation.

+    +row
+        +cell #[+src(gh("spaCy", "spacy/lang/norm_exceptions.py")) norm_exceptions.py]
+        +cell
+            | #[code NORM_EXCEPTIONS] (dict)
+        +cell
+            | Special-case rules for normalising tokens and assigning norms,
+            | for example, American vs. British spelling.
+
     +row
         +cell #[+src(gh("spaCy", "spacy/lang/punctuation.py")) punctuation.py]
         +cell
@@ -235,7 +262,7 @@ p
     TOKENIZER_EXCEPTIONS = {
         "don't": [
             {ORTH: "do", LEMMA: "do"},
-            {ORTH: "n't", LEMMA: "not", TAG: "RB"}]
+            {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}]
     }

 +infobox("Important note")
@@ -286,7 +313,7 @@ p
 p
     | When adding the tokenizer exceptions to the #[code Defaults], you can use
     | the #[+api("util#update_exc") #[code update_exc()]] helper function to merge
     | them with the global base exceptions (including one-letter abbreviations
     | and emoticons). The function performs a basic check to make sure
     | exceptions are provided in the correct format. It can take any number of
     | exceptions dicts as its arguments, and will update and overwrite the
@@ -303,13 +330,74 @@ p
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     # {"a.": [{ORTH: "a.", LEMMA: "all"}], ":)": [{ORTH: ":)"}]}

-//-+aside("About spaCy's custom pronoun lemma")
++infobox("About spaCy's custom pronoun lemma")
     | Unlike verbs and common nouns, there's no clear base form of a personal
     | pronoun. Should the lemma of "me" be "I", or should we normalize person
     | as well, giving "it" — or maybe "he"? spaCy's solution is to introduce a
     | novel symbol, #[code.u-nowrap -PRON-], which is used as the lemma for
     | all personal pronouns.

++h(3, "norm-exceptions") Norm exceptions
+
+p
+    | In addition to #[code ORTH] or #[code LEMMA], tokenizer exceptions can
+    | also set a #[code NORM] attribute. This is useful to specify a normalised
+    | version of the token; for example, the norm of "n't" is "not". By default,
+    | a token's norm equals its lowercase text. If the lowercase spelling of a
+    | word exists, norms should always be in lowercase.
+
++aside-code("Accessing norms").
+    doc = nlp(u"I can't")
+    assert [t.norm_ for t in doc] == ['i', 'can', 'not']
+
+p
+    | spaCy usually tries to normalise words with different spellings to a single,
+    | common spelling. This has no effect on any other token attributes, or
+    | tokenization in general, but it ensures that
+    | #[strong equivalent tokens receive similar representations]. This can
+    | improve the model's predictions on words that weren't common in the
+    | training data, but are equivalent to other words; for example, "realize"
+    | and "realise", or "thx" and "thanks".
+
+p
+    | Similarly, spaCy also includes
+    | #[+src(gh("spaCy", "spacy/lang/norm_exceptions.py")) global base norms]
+    | for normalising different styles of quotation marks and currency
+    | symbols. Even though #[code $] and #[code €] are very different, spaCy
+    | normalises them both to #[code $]. This way, they'll always be seen as
+    | similar, no matter how common they were in the training data.
+
+p
+    | Norm exceptions can be provided as a simple dictionary. For more examples,
+    | see the English
+    | #[+src(gh("spaCy", "spacy/lang/en/norm_exceptions.py")) norm_exceptions.py].
+
++code("Example").
+    NORM_EXCEPTIONS = {
+        "cos": "because",
+        "fav": "favorite",
+        "accessorise": "accessorize",
+        "accessorised": "accessorized"
+    }
+
+p
+    | To add the custom norm exceptions lookup table, you can use the
+    | #[code add_lookups()] helper function. It takes the default attribute
+    | getter function as its first argument, plus a variable list of
+    | dictionaries. If a string's norm is found in one of the dictionaries,
+    | that value is used; otherwise, the default function is called and the
+    | token is assigned its default norm.
+
++code.
+    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
+                                         NORM_EXCEPTIONS, BASE_NORMS)
+
+p
+    | The order of the dictionaries is also the lookup order, so if your
+    | language's norm exceptions overwrite any of the global exceptions, they
+    | should be added first. Also note that the tokenizer exceptions will
+    | always have priority over the attribute getters.
+h(3, "lex-attrs") Lexical attributes +h(3, "lex-attrs") Lexical attributes
p p
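The lookup order described in the hunk above can be shown with a toy example; the conflicting BASE_NORMS entry below is invented purely to illustrate which table wins:

    # Toy illustration of lookup priority: tables passed first are checked first.
    from spacy.util import add_lookups

    def default_norm(string):
        return string.lower()              # the default norm is the lowercase text

    NORM_EXCEPTIONS = {'cos': 'because'}   # language-specific exceptions
    BASE_NORMS = {'cos': 'cuz'}            # hypothetical conflicting global entry

    get_norm = add_lookups(default_norm, NORM_EXCEPTIONS, BASE_NORMS)
    assert get_norm('cos') == 'because'       # the language exceptions win
    assert get_norm('Realise') == 'realise'   # falls back to the default getter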

@@ -38,22 +38,19 @@ p
     | #[strong natural language understanding] systems, or to
     | pre-process text for #[strong deep learning].

-    +grid-col("half")
-        +infobox
-            +label.o-block-small Table of contents
-            +list("numbers").u-text-small.o-no-block
-                +item #[+a("#features") Features]
-                +item #[+a("#annotations") Linguistic annotations]
-                +item #[+a("#annotations-token") Tokenization]
-                +item #[+a("#annotations-pos-deps") POS tags and dependencies]
-                +item #[+a("#annotations-ner") Named entities]
-                +item #[+a("#vectors-similarity") Word vectos and similarity]
-                +item #[+a("#pipelines") Pipelines]
-                +item #[+a("#vocab") Vocab, hashes and lexemes]
-                +item #[+a("#serialization") Serialization]
-                +item #[+a("#training") Training]
-                +item #[+a("#architecture") Architecture]
-                +item #[+a("#community") Community & FAQ]
+    +table-of-contents
+        +item #[+a("#features") Features]
+        +item #[+a("#annotations") Linguistic annotations]
+        +item #[+a("#annotations-token") Tokenization]
+        +item #[+a("#annotations-pos-deps") POS tags and dependencies]
+        +item #[+a("#annotations-ner") Named entities]
+        +item #[+a("#vectors-similarity") Word vectors and similarity]
+        +item #[+a("#pipelines") Pipelines]
+        +item #[+a("#vocab") Vocab, hashes and lexemes]
+        +item #[+a("#serialization") Serialization]
+        +item #[+a("#training") Training]
+        +item #[+a("#architecture") Architecture]
+        +item #[+a("#community") Community & FAQ]

 +h(3, "what-spacy-isnt") What spaCy isn't