From fa7e576c579198072266e43681207c26fcabc954 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 3 Jun 2017 21:52:06 +0200 Subject: [PATCH 1/5] Change order of exception dicts --- spacy/lang/de/__init__.py | 2 +- spacy/lang/en/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py index 0a161e80e..b8a7580a0 100644 --- a/spacy/lang/de/__init__.py +++ b/spacy/lang/de/__init__.py @@ -20,7 +20,7 @@ class GermanDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'de' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], - BASE_NORMS, NORM_EXCEPTIONS) + NORM_EXCEPTIONS, BASE_NORMS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tag_map = dict(TAG_MAP) diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index 3f422b834..a6c216b43 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -21,7 +21,7 @@ class EnglishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'en' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], - BASE_NORMS, NORM_EXCEPTIONS) + NORM_EXCEPTIONS, BASE_NORMS) lex_attr_getters.update(LEX_ATTRS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) From ec6d2bc81df0f3532ad558fdc2ac99b361ef4ac3 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 3 Jun 2017 22:16:26 +0200 Subject: [PATCH 2/5] Add table of contents mixin --- website/_includes/_mixins.jade | 11 +++++++++++ website/docs/usage/spacy-101.jade | 29 +++++++++++++---------------- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index 9de43b092..16514bcda 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -383,3 +383,14 @@ mixin annotation-row(annots, style) else +cell=cell block + + +//- Table of contents, to be used with +item mixins for links + col - [string] width of column (see +grid-col) + +mixin table-of-contents(col) + +grid-col(col || "half") + +infobox + +label.o-block-small Table of contents + +list("numbers").u-text-small.o-no-block + block diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index 55e7a030a..03897600d 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -38,22 +38,19 @@ p | #[strong natural language understanding] systems, or to | pre-process text for #[strong deep learning]. 
- +grid-col("half") - +infobox - +label.o-block-small Table of contents - +list("numbers").u-text-small.o-no-block - +item #[+a("#features") Features] - +item #[+a("#annotations") Linguistic annotations] - +item #[+a("#annotations-token") Tokenization] - +item #[+a("#annotations-pos-deps") POS tags and dependencies] - +item #[+a("#annotations-ner") Named entities] - +item #[+a("#vectors-similarity") Word vectos and similarity] - +item #[+a("#pipelines") Pipelines] - +item #[+a("#vocab") Vocab, hashes and lexemes] - +item #[+a("#serialization") Serialization] - +item #[+a("#training") Training] - +item #[+a("#architecture") Architecture] - +item #[+a("#community") Community & FAQ] + +table-of-contents + +item #[+a("#features") Features] + +item #[+a("#annotations") Linguistic annotations] + +item #[+a("#annotations-token") Tokenization] + +item #[+a("#annotations-pos-deps") POS tags and dependencies] + +item #[+a("#annotations-ner") Named entities] + +item #[+a("#vectors-similarity") Word vectos and similarity] + +item #[+a("#pipelines") Pipelines] + +item #[+a("#vocab") Vocab, hashes and lexemes] + +item #[+a("#serialization") Serialization] + +item #[+a("#training") Training] + +item #[+a("#architecture") Architecture] + +item #[+a("#community") Community & FAQ] +h(3, "what-spacy-isnt") What spaCy isn't From a3715a81d5a1b9a5309920dd987fd8c167dea689 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 3 Jun 2017 22:16:38 +0200 Subject: [PATCH 3/5] Update adding languages guide --- website/docs/usage/adding-languages.jade | 142 ++++++++++++++++++----- 1 file changed, 115 insertions(+), 27 deletions(-) diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade index 005c4e750..c900734d4 100644 --- a/website/docs/usage/adding-languages.jade +++ b/website/docs/usage/adding-languages.jade @@ -3,32 +3,51 @@ include ../../_includes/_mixins p - | Adding full support for a language touches many different parts of the - | spaCy library. This guide explains how to fit everything together, and - | points you to the specific workflows for each component. Obviously, - | there are lots of ways you can organise your code when you implement - | your own #[+api("language") #[code Language]] class. This guide will - | focus on how it's done within spaCy. For full language support, we'll - | need to: + | Adding full support for a language touches many different parts of the + | spaCy library. This guide explains how to fit everything together, and + | points you to the specific workflows for each component. -+list("numbers") - +item - | Create a #[strong #[code Language] subclass]. - +item - | Define custom #[strong language data], like a stop list and tokenizer - | exceptions. - +item - | #[strong Test] the new language tokenizer. - +item - | #[strong Build the vocabulary], including word frequencies, Brown - | clusters and word vectors. - +item - | Set up a #[strong model direcory] and #[strong train] the tagger and - | parser. ++grid.o-no-block + +grid-col("half") + p + | Obviously, there are lots of ways you can organise your code when + | you implement your own language data. This guide will focus on + | how it's done within spaCy. For full language support, you'll + | need to create a #[code Language] subclass, define custom + | #[strong language data], like a stop list and tokenizer + | exceptions and test the new tokenizer. Once the language is set + | up, you can #[strong build the vocabulary], including word + | frequencies, Brown clusters and word vectors. 
Finally, you can + | #[strong train the tagger and parser], and save the model to a + | directory. -p - | For some languages, you may also want to develop a solution for - | lemmatization and morphological analysis. + p + | For some languages, you may also want to develop a solution for + | lemmatization and morphological analysis. + + +table-of-contents + +item #[+a("#language-subclass") The Language subclass] + +item #[+a("#language-data") Adding language data] + +item #[+a("#stop-workds") Stop words] + +item #[+a("#tokenizer-exceptions") Tokenizer exceptions] + +item #[+a("#norm-exceptions") Norm exceptions] + +item #[+a("#lex-attrs") Lexical attributes] + +item #[+a("#lemmatizer") Lemmatizer] + +item #[+a("#tag-map") Tag map] + +item #[+a("#morph-rules") Morph rules] + +item #[+a("#testing") Testing the tokenizer] + +item #[+a("#vocabulary") Building the vocabulary] + +item #[+a("#training") Training] + ++aside("Working on spaCy's source") + | To add a new language to spaCy, you'll need to + | #[strong modify the library's code]. The easiest way to do this is to + | clone the #[+src(gh("spaCy")) repository] and #[strong build spaCy from source]. + | For more information on this, see the #[+a("/docs/usage") installation guide]. + | Unlike spaCy's core, which is mostly written in Cython, all language + | data is stored in regular Python files. This means that you won't have to + | rebuild anything in between – you can simply make edits and reload spaCy + | to test them. +h(2, "language-subclass") Creating a #[code Language] subclass @@ -123,6 +142,14 @@ p | Special-case rules for the tokenizer, for example, contractions | and abbreviations containing punctuation. + +row + +cell #[+src(gh("spaCy", "spacy/lang/norm_exceptions.py")) norm_exceptions.py] + +cell + | #[code NORM_EXCEPTIONS] (dict) + +cell + | Special-case rules for normalising tokens and assigning norms, + | for example American vs. British spelling. + +row +cell #[+src(gh("spaCy", "spacy/lang/punctuation.py")) punctuation.py] +cell @@ -235,7 +262,7 @@ p TOKENIZER_EXCEPTIONS = { "don't": [ {ORTH: "do", LEMMA: "do"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}] + {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}] } +infobox("Important note") @@ -286,7 +313,7 @@ p p | When adding the tokenizer exceptions to the #[code Defaults], you can use | the #[+api("util#update_exc") #[code update_exc()]] helper function to merge - | them with the global base exceptions (including one-letter abbreviations + | them with the global base exceptions (including one-letter abbreviations | and emoticons). The function performs a basic check to make sure | exceptions are provided in the correct format. It can take any number of | exceptions dicts as its arguments, and will update and overwrite the @@ -303,13 +330,74 @@ p tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) # {"a.": [{ORTH: "a.", LEMMA: "all"}], ":)": [{ORTH: ":)"}]} -//-+aside("About spaCy's custom pronoun lemma") ++infobox("About spaCy's custom pronoun lemma") | Unlike verbs and common nouns, there's no clear base form of a personal | pronoun. Should the lemma of "me" be "I", or should we normalize person | as well, giving "it" — or maybe "he"? spaCy's solution is to introduce a | novel symbol, #[code.u-nowrap -PRON-], which is used as the lemma for | all personal pronouns. ++h(3, "norm-exceptions") Norm exceptions + +p + | In addition to #[code ORTH] or #[code LEMMA], tokenizer exceptions can + | also set a #[code NORM] attribute. 
This is useful to specify a normalised
+    | version of the token – for example, the norm of "n't" is "not". By default,
+    | a token's norm equals its lowercase text. If the lowercase spelling of a
+    | word exists, norms should always be in lowercase.
+
++aside-code("Accessing norms").
+    doc = nlp(u"I can't")
+    assert [t.norm_ for t in doc] == ['i', 'can', 'not']
+
+p
+    | spaCy usually tries to normalise words with different spellings to a single,
+    | common spelling. This has no effect on any other token attributes, or
+    | tokenization in general, but it ensures that
+    | #[strong equivalent tokens receive similar representations]. This can
+    | improve the model's predictions on words that weren't common in the
+    | training data, but are equivalent to other words – for example, "realize"
+    | and "realise", or "thx" and "thanks".
+
+p
+    | Similarly, spaCy also includes
+    | #[+src(gh("spaCy", "spacy/lang/norm_exceptions.py")) global base norms]
+    | for normalising different styles of quotation marks and currency
+    | symbols. Even though #[code $] and #[code €] are very different, spaCy
+    | normalises them both to #[code $]. This way, they'll always be seen as
+    | similar, no matter how common they were in the training data.
+
+p
+    | Norm exceptions can be provided as a simple dictionary. For more examples,
+    | see the English
+    | #[+src(gh("spaCy", "spacy/lang/en/norm_exceptions.py")) norm_exceptions.py].
+
++code("Example").
+    NORM_EXCEPTIONS = {
+        "cos": "because",
+        "fav": "favorite",
+        "accessorise": "accessorize",
+        "accessorised": "accessorized"
+    }
+
+p
+    | To add the custom norm exceptions lookup table, you can use the
+    | #[code add_lookups()] helper function. It takes the default attribute
+    | getter function as its first argument, plus a variable list of
+    | dictionaries. If a string's norm is found in one of the dictionaries,
+    | that value is used – otherwise, the default function is called and the
+    | token is assigned its default norm.
+
++code.
+    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
+                                         NORM_EXCEPTIONS, BASE_NORMS)
+
+p
+    | The order of the dictionaries is also the lookup order – so if your
+    | language's norm exceptions overwrite any of the global exceptions, they
+    | should be added first. Also note that the tokenizer exceptions will
+    | always have priority over the attribute getters.
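+
+p
+    | To see how the chain behaves, here is a minimal, self-contained sketch
+    | of a lookup helper with the semantics described above – not spaCy's
+    | actual implementation, but the same idea: check each table in order and
+    | fall back to the default getter if none of them contains the string.
+
++code("Lookup chain – illustrative sketch").
+    def chain_lookups(default_func, *lookups):
+        def get_attr(string):
+            for lookup in lookups:
+                if string in lookup:          # first table containing the string wins
+                    return lookup[string]
+            return default_func(string)       # otherwise use the default getter
+        return get_attr
+
+    NORM_EXCEPTIONS = {"accessorised": "accessorized"}   # language-specific norms
+    BASE_NORMS = {"€": "$"}                               # stand-in for the global base norms
+    get_norm = chain_lookups(lambda string: string.lower(),
+                             NORM_EXCEPTIONS, BASE_NORMS)
+
+    assert get_norm("accessorised") == "accessorized"     # from NORM_EXCEPTIONS
+    assert get_norm("€") == "$"                           # from BASE_NORMS
+    assert get_norm("Hello") == "hello"                   # default: lowercase text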
+ +h(3, "lex-attrs") Lexical attributes p From 4c643d74c5a1a873e0a345f158f587b8f322f85c Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 3 Jun 2017 22:29:21 +0200 Subject: [PATCH 4/5] Add norm exceptions to other Language classes --- spacy/lang/da/__init__.py | 6 ++++-- spacy/lang/es/__init__.py | 6 ++++-- spacy/lang/fi/__init__.py | 6 ++++-- spacy/lang/fr/__init__.py | 6 ++++-- spacy/lang/hu/__init__.py | 6 ++++-- spacy/lang/it/__init__.py | 6 ++++-- spacy/lang/nb/__init__.py | 6 ++++-- spacy/lang/nl/__init__.py | 6 ++++-- spacy/lang/pl/__init__.py | 6 ++++-- spacy/lang/pt/__init__.py | 6 ++++-- spacy/lang/sv/__init__.py | 6 ++++-- spacy/lang/xx/__init__.py | 6 ++++-- 12 files changed, 48 insertions(+), 24 deletions(-) diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py index b9e90dc0d..99babdc2c 100644 --- a/spacy/lang/da/__init__.py +++ b/spacy/lang/da/__init__.py @@ -5,14 +5,16 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...attrs import LANG -from ...util import update_exc +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups class DanishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'da' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = set(STOP_WORDS) diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py index 8291b2dd0..e20338b39 100644 --- a/spacy/lang/es/__init__.py +++ b/spacy/lang/es/__init__.py @@ -7,15 +7,17 @@ from .stop_words import STOP_WORDS from .lemmatizer import LOOKUP from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS from ...language import Language from ...lemmatizerlookup import Lemmatizer -from ...attrs import LANG -from ...util import update_exc +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups class SpanishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'es' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tag_map = dict(TAG_MAP) diff --git a/spacy/lang/fi/__init__.py b/spacy/lang/fi/__init__.py index 7010acd48..931ad5341 100644 --- a/spacy/lang/fi/__init__.py +++ b/spacy/lang/fi/__init__.py @@ -5,14 +5,16 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...attrs import LANG -from ...util import update_exc +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups class FinnishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'fi' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = set(STOP_WORDS) diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py index f9a01f223..e8c13777f 100644 --- a/spacy/lang/fr/__init__.py +++ b/spacy/lang/fr/__init__.py @@ -7,15 +7,17 @@ 
from .stop_words import STOP_WORDS from .lemmatizer import LOOKUP from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS from ...language import Language from ...lemmatizerlookup import Lemmatizer -from ...attrs import LANG -from ...util import update_exc +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups class FrenchDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'fr' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = set(STOP_WORDS) diff --git a/spacy/lang/hu/__init__.py b/spacy/lang/hu/__init__.py index 70b4ae5cc..0fe6a9f5c 100644 --- a/spacy/lang/hu/__init__.py +++ b/spacy/lang/hu/__init__.py @@ -7,15 +7,17 @@ from .stop_words import STOP_WORDS from .lemmatizer import LOOKUP from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS from ...language import Language from ...lemmatizerlookup import Lemmatizer -from ...attrs import LANG -from ...util import update_exc +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups class HungarianDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'hu' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = set(STOP_WORDS) diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py index 573a8df16..7cc717cb3 100644 --- a/spacy/lang/it/__init__.py +++ b/spacy/lang/it/__init__.py @@ -5,15 +5,17 @@ from .stop_words import STOP_WORDS from .lemmatizer import LOOKUP from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS from ...language import Language from ...lemmatizerlookup import Lemmatizer -from ...attrs import LANG -from ...util import update_exc +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups class ItalianDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'it' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) stop_words = set(STOP_WORDS) diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py index cb2baf148..c1b4af263 100644 --- a/spacy/lang/nb/__init__.py +++ b/spacy/lang/nb/__init__.py @@ -6,14 +6,16 @@ from .stop_words import STOP_WORDS from .morph_rules import MORPH_RULES from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...attrs import LANG -from ...util import update_exc +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups class NorwegianDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'nb' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = set(STOP_WORDS) diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py index d6430d0b3..7b948f295 100644 --- a/spacy/lang/nl/__init__.py +++ b/spacy/lang/nl/__init__.py @@ -4,14 +4,16 @@ from __future__ import 
unicode_literals from .stop_words import STOP_WORDS from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...attrs import LANG -from ...util import update_exc +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups class DutchDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'nl' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) stop_words = set(STOP_WORDS) diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py index 535120874..067646dbd 100644 --- a/spacy/lang/pl/__init__.py +++ b/spacy/lang/pl/__init__.py @@ -4,14 +4,16 @@ from __future__ import unicode_literals from .stop_words import STOP_WORDS from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...attrs import LANG -from ...util import update_exc +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups class PolishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'pl' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) stop_words = set(STOP_WORDS) diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py index df6b76c7a..67539034d 100644 --- a/spacy/lang/pt/__init__.py +++ b/spacy/lang/pt/__init__.py @@ -7,15 +7,17 @@ from .lex_attrs import LEX_ATTRS from .lemmatizer import LOOKUP from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS from ...language import Language from ...lemmatizerlookup import Lemmatizer -from ...attrs import LANG -from ...util import update_exc +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups class PortugueseDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'pt' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) lex_attr_getters.update(LEX_ATTRS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index b309643f7..2d3a640c5 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -7,15 +7,17 @@ from .morph_rules import MORPH_RULES from .lemmatizer import LEMMA_RULES, LOOKUP from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS from ...language import Language from ...lemmatizerlookup import Lemmatizer -from ...attrs import LANG -from ...util import update_exc +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups class SwedishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'sv' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = set(STOP_WORDS) diff --git a/spacy/lang/xx/__init__.py b/spacy/lang/xx/__init__.py index fef8c9d59..dc63ee33f 100644 --- a/spacy/lang/xx/__init__.py +++ b/spacy/lang/xx/__init__.py @@ -3,14 +3,16 @@ from __future__ import unicode_literals from ..tokenizer_exceptions import BASE_EXCEPTIONS 
+from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...attrs import LANG -from ...util import update_exc +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups class MultiLanguageDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'xx' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) From 8a17b99b1c1107a632729fccf8c558faf2f764b6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 3 Jun 2017 15:30:16 -0500 Subject: [PATCH 5/5] Use NORM attribute, not LOWER --- spacy/_ml.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index c499a5cff..6d02dfd27 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -13,7 +13,7 @@ from thinc import describe from thinc.describe import Dimension, Synapses, Biases, Gradient from thinc.neural._classes.affine import _set_dimensions_if_needed -from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP +from .attrs import ID, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP from .tokens.doc import Doc import numpy @@ -131,14 +131,14 @@ class PrecomputableMaxouts(Model): return Yfp, backward def Tok2Vec(width, embed_size, preprocess=None): - cols = [ID, LOWER, PREFIX, SUFFIX, SHAPE] + cols = [ID, NORM, PREFIX, SUFFIX, SHAPE] with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}): - lower = get_col(cols.index(LOWER)) >> HashEmbed(width, embed_size, name='embed_lower') + norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower') prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2, name='embed_prefix') suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix') shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2, name='embed_shape') - embed = (lower | prefix | suffix | shape ) + embed = (norm | prefix | suffix | shape ) tok2vec = ( with_flatten( asarray(Model.ops, dtype='uint64') @@ -148,7 +148,7 @@ def Tok2Vec(width, embed_size, preprocess=None): >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)), - pad=4, ndim=5) + pad=4) ) if preprocess not in (False, None): tok2vec = preprocess >> tok2vec @@ -243,7 +243,7 @@ def zero_init(model): def doc2feats(cols=None): - cols = [ID, LOWER, PREFIX, SUFFIX, SHAPE] + cols = [ID, NORM, PREFIX, SUFFIX, SHAPE] def forward(docs, drop=0.): feats = [] for doc in docs:
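
The practical effect of this last patch: because the feature columns are now keyed
on NORM rather than LOWER, tokens that share a norm exception map to the same
embedding rows. A quick, illustrative check – this assumes spaCy built from source
with the language data added above, and is not part of the patch itself:

    from spacy.lang.en import English
    from spacy.attrs import NORM

    nlp = English()                        # tokenizer and lexical attributes only
    doc = nlp(u"I don't think they realise it")
    print([t.norm_ for t in doc])          # expect "n't" -> "not", "realise" -> "realize"
    print(doc.to_array([NORM]))            # the IDs the embedding tables will see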