mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-25 00:34:20 +03:00
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
commit
d0e42f9275
|
@ -194,8 +194,8 @@ class GoldCorpus(object):
|
|||
|
||||
def count_train(self):
|
||||
n = 0
|
||||
for _ in self.train_tuples:
|
||||
n += 1
|
||||
for raw_text, paragraph_tuples in self.train_tuples:
|
||||
n += len(paragraph_tuples)
|
||||
return n
|
||||
|
||||
def train_docs(self, nlp, gold_preproc=False,
|
||||
|
|
|
@ -5,14 +5,16 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|||
from .stop_words import STOP_WORDS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class DanishDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'da'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
|
|
@ -2,21 +2,25 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .norm_exceptions import NORM_EXCEPTIONS
|
||||
from .tag_map import TAG_MAP
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lemmatizer import LOOKUP
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...lemmatizerlookup import Lemmatizer
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class GermanDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'de'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
|
||||
NORM_EXCEPTIONS, BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tag_map = dict(TAG_MAP)
|
||||
|
|
17
spacy/lang/de/norm_exceptions.py
Normal file
17
spacy/lang/de/norm_exceptions.py
Normal file
|
@ -0,0 +1,17 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
# Here we only want to include the absolute most common words. Otherwise,
|
||||
# this list would get impossibly long for German – especially considering the
|
||||
# old vs. new spelling rules, and all possible cases.
|
||||
|
||||
|
||||
_exc = {
|
||||
"daß": "dass"
|
||||
}
|
||||
|
||||
|
||||
NORM_EXCEPTIONS = {}
|
||||
|
||||
for string, norm in _exc.items():
|
||||
NORM_EXCEPTIONS[string.title()] = norm
|
|
@ -8,7 +8,7 @@ from ...deprecated import PRON_LEMMA
|
|||
_exc = {
|
||||
"auf'm": [
|
||||
{ORTH: "auf", LEMMA: "auf"},
|
||||
{ORTH: "'m", LEMMA: "der", NORM: "dem" }],
|
||||
{ORTH: "'m", LEMMA: "der", NORM: "dem"}],
|
||||
|
||||
"du's": [
|
||||
{ORTH: "du", LEMMA: PRON_LEMMA, TAG: "PPER"},
|
||||
|
@ -53,97 +53,97 @@ _exc = {
|
|||
|
||||
|
||||
for exc_data in [
|
||||
{ORTH: "'S", LEMMA: PRON_LEMMA, TAG: "PPER"},
|
||||
{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER"},
|
||||
{ORTH: "S'", LEMMA: PRON_LEMMA, TAG: "PPER"},
|
||||
{ORTH: "s'", LEMMA: PRON_LEMMA, TAG: "PPER"},
|
||||
{ORTH: "'S", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
|
||||
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
|
||||
{ORTH: "S'", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
|
||||
{ORTH: "s'", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
|
||||
{ORTH: "'n", LEMMA: "ein", NORM: "ein"},
|
||||
{ORTH: "'ne", LEMMA: "eine", NORM: "eine"},
|
||||
{ORTH: "'nen", LEMMA: "ein", NORM: "einen"},
|
||||
{ORTH: "'nem", LEMMA: "ein", NORM: "einem"},
|
||||
{ORTH: "Abb.", LEMMA: "Abbildung"},
|
||||
{ORTH: "Abk.", LEMMA: "Abkürzung"},
|
||||
{ORTH: "Abt.", LEMMA: "Abteilung"},
|
||||
{ORTH: "Apr.", LEMMA: "April"},
|
||||
{ORTH: "Aug.", LEMMA: "August"},
|
||||
{ORTH: "Bd.", LEMMA: "Band"},
|
||||
{ORTH: "Betr.", LEMMA: "Betreff"},
|
||||
{ORTH: "Bf.", LEMMA: "Bahnhof"},
|
||||
{ORTH: "Bhf.", LEMMA: "Bahnhof"},
|
||||
{ORTH: "Bsp.", LEMMA: "Beispiel"},
|
||||
{ORTH: "Dez.", LEMMA: "Dezember"},
|
||||
{ORTH: "Di.", LEMMA: "Dienstag"},
|
||||
{ORTH: "Do.", LEMMA: "Donnerstag"},
|
||||
{ORTH: "Fa.", LEMMA: "Firma"},
|
||||
{ORTH: "Fam.", LEMMA: "Familie"},
|
||||
{ORTH: "Feb.", LEMMA: "Februar"},
|
||||
{ORTH: "Fr.", LEMMA: "Frau"},
|
||||
{ORTH: "Frl.", LEMMA: "Fräulein"},
|
||||
{ORTH: "Hbf.", LEMMA: "Hauptbahnhof"},
|
||||
{ORTH: "Hr.", LEMMA: "Herr"},
|
||||
{ORTH: "Hrn.", LEMMA: "Herr"},
|
||||
{ORTH: "Jan.", LEMMA: "Januar"},
|
||||
{ORTH: "Jh.", LEMMA: "Jahrhundert"},
|
||||
{ORTH: "Jhd.", LEMMA: "Jahrhundert"},
|
||||
{ORTH: "Jul.", LEMMA: "Juli"},
|
||||
{ORTH: "Jun.", LEMMA: "Juni"},
|
||||
{ORTH: "Mi.", LEMMA: "Mittwoch"},
|
||||
{ORTH: "Mio.", LEMMA: "Million"},
|
||||
{ORTH: "Mo.", LEMMA: "Montag"},
|
||||
{ORTH: "Mrd.", LEMMA: "Milliarde"},
|
||||
{ORTH: "Mrz.", LEMMA: "März"},
|
||||
{ORTH: "MwSt.", LEMMA: "Mehrwertsteuer"},
|
||||
{ORTH: "Mär.", LEMMA: "März"},
|
||||
{ORTH: "Nov.", LEMMA: "November"},
|
||||
{ORTH: "Nr.", LEMMA: "Nummer"},
|
||||
{ORTH: "Okt.", LEMMA: "Oktober"},
|
||||
{ORTH: "Orig.", LEMMA: "Original"},
|
||||
{ORTH: "Pkt.", LEMMA: "Punkt"},
|
||||
{ORTH: "Prof.", LEMMA: "Professor"},
|
||||
{ORTH: "Red.", LEMMA: "Redaktion"},
|
||||
{ORTH: "Sa.", LEMMA: "Samstag"},
|
||||
{ORTH: "Sep.", LEMMA: "September"},
|
||||
{ORTH: "Sept.", LEMMA: "September"},
|
||||
{ORTH: "So.", LEMMA: "Sonntag"},
|
||||
{ORTH: "Std.", LEMMA: "Stunde"},
|
||||
{ORTH: "Str.", LEMMA: "Straße"},
|
||||
{ORTH: "Tel.", LEMMA: "Telefon"},
|
||||
{ORTH: "Tsd.", LEMMA: "Tausend"},
|
||||
{ORTH: "Univ.", LEMMA: "Universität"},
|
||||
{ORTH: "abzgl.", LEMMA: "abzüglich"},
|
||||
{ORTH: "allg.", LEMMA: "allgemein"},
|
||||
{ORTH: "bspw.", LEMMA: "beispielsweise"},
|
||||
{ORTH: "bzgl.", LEMMA: "bezüglich"},
|
||||
{ORTH: "bzw.", LEMMA: "beziehungsweise"},
|
||||
{ORTH: "Abb.", LEMMA: "Abbildung", NORM: "Abbildung"},
|
||||
{ORTH: "Abk.", LEMMA: "Abkürzung", NORM: "Abkürzung"},
|
||||
{ORTH: "Abt.", LEMMA: "Abteilung", NORM: "Abteilung"},
|
||||
{ORTH: "Apr.", LEMMA: "April", NORM: "April"},
|
||||
{ORTH: "Aug.", LEMMA: "August", NORM: "August"},
|
||||
{ORTH: "Bd.", LEMMA: "Band", NORM: "Band"},
|
||||
{ORTH: "Betr.", LEMMA: "Betreff", NORM: "Betreff"},
|
||||
{ORTH: "Bf.", LEMMA: "Bahnhof", NORM: "Bahnhof"},
|
||||
{ORTH: "Bhf.", LEMMA: "Bahnhof", NORM: "Bahnhof"},
|
||||
{ORTH: "Bsp.", LEMMA: "Beispiel", NORM: "Beispiel"},
|
||||
{ORTH: "Dez.", LEMMA: "Dezember", NORM: "Dezember"},
|
||||
{ORTH: "Di.", LEMMA: "Dienstag", NORM: "Dienstag"},
|
||||
{ORTH: "Do.", LEMMA: "Donnerstag", NORM: "Donnerstag"},
|
||||
{ORTH: "Fa.", LEMMA: "Firma", NORM: "Firma"},
|
||||
{ORTH: "Fam.", LEMMA: "Familie", NORM: "Familie"},
|
||||
{ORTH: "Feb.", LEMMA: "Februar", NORM: "Februar"},
|
||||
{ORTH: "Fr.", LEMMA: "Frau", NORM: "Frau"},
|
||||
{ORTH: "Frl.", LEMMA: "Fräulein", NORM: "Fräulein"},
|
||||
{ORTH: "Hbf.", LEMMA: "Hauptbahnhof", NORM: "Hauptbahnhof"},
|
||||
{ORTH: "Hr.", LEMMA: "Herr", NORM: "Herr"},
|
||||
{ORTH: "Hrn.", LEMMA: "Herr", NORM: "Herrn"},
|
||||
{ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"},
|
||||
{ORTH: "Jh.", LEMMA: "Jahrhundert", NORM: "Jahrhundert"},
|
||||
{ORTH: "Jhd.", LEMMA: "Jahrhundert", NORM: "Jahrhundert"},
|
||||
{ORTH: "Jul.", LEMMA: "Juli", NORM: "Juli"},
|
||||
{ORTH: "Jun.", LEMMA: "Juni", NORM: "Juni"},
|
||||
{ORTH: "Mi.", LEMMA: "Mittwoch", NORM: "Mittwoch"},
|
||||
{ORTH: "Mio.", LEMMA: "Million", NORM: "Million"},
|
||||
{ORTH: "Mo.", LEMMA: "Montag", NORM: "Montag"},
|
||||
{ORTH: "Mrd.", LEMMA: "Milliarde", NORM: "Milliarde"},
|
||||
{ORTH: "Mrz.", LEMMA: "März", NORM: "März"},
|
||||
{ORTH: "MwSt.", LEMMA: "Mehrwertsteuer", NORM: "Mehrwertsteuer"},
|
||||
{ORTH: "Mär.", LEMMA: "März", NORM: "März"},
|
||||
{ORTH: "Nov.", LEMMA: "November", NORM: "November"},
|
||||
{ORTH: "Nr.", LEMMA: "Nummer", NORM: "Nummer"},
|
||||
{ORTH: "Okt.", LEMMA: "Oktober", NORM: "Oktober"},
|
||||
{ORTH: "Orig.", LEMMA: "Original", NORM: "Original"},
|
||||
{ORTH: "Pkt.", LEMMA: "Punkt", NORM: "Punkt"},
|
||||
{ORTH: "Prof.", LEMMA: "Professor", NORM: "Professor"},
|
||||
{ORTH: "Red.", LEMMA: "Redaktion", NORM: "Redaktion"},
|
||||
{ORTH: "Sa.", LEMMA: "Samstag", NORM: "Samstag"},
|
||||
{ORTH: "Sep.", LEMMA: "September", NORM: "September"},
|
||||
{ORTH: "Sept.", LEMMA: "September", NORM: "September"},
|
||||
{ORTH: "So.", LEMMA: "Sonntag", NORM: "Sonntag"},
|
||||
{ORTH: "Std.", LEMMA: "Stunde", NORM: "Stunde"},
|
||||
{ORTH: "Str.", LEMMA: "Straße", NORM: "Straße"},
|
||||
{ORTH: "Tel.", LEMMA: "Telefon", NORM: "Telefon"},
|
||||
{ORTH: "Tsd.", LEMMA: "Tausend", NORM: "Tausend"},
|
||||
{ORTH: "Univ.", LEMMA: "Universität", NORM: "Universität"},
|
||||
{ORTH: "abzgl.", LEMMA: "abzüglich", NORM: "abzüglich"},
|
||||
{ORTH: "allg.", LEMMA: "allgemein", NORM: "allgemein"},
|
||||
{ORTH: "bspw.", LEMMA: "beispielsweise", NORM: "beispielsweise"},
|
||||
{ORTH: "bzgl.", LEMMA: "bezüglich", NORM: "bezüglich"},
|
||||
{ORTH: "bzw.", LEMMA: "beziehungsweise", NORM: "beziehungsweise"},
|
||||
{ORTH: "d.h.", LEMMA: "das heißt"},
|
||||
{ORTH: "dgl.", LEMMA: "dergleichen"},
|
||||
{ORTH: "ebd.", LEMMA: "ebenda"},
|
||||
{ORTH: "eigtl.", LEMMA: "eigentlich"},
|
||||
{ORTH: "engl.", LEMMA: "englisch"},
|
||||
{ORTH: "evtl.", LEMMA: "eventuell"},
|
||||
{ORTH: "frz.", LEMMA: "französisch"},
|
||||
{ORTH: "gegr.", LEMMA: "gegründet"},
|
||||
{ORTH: "ggf.", LEMMA: "gegebenenfalls"},
|
||||
{ORTH: "ggfs.", LEMMA: "gegebenenfalls"},
|
||||
{ORTH: "ggü.", LEMMA: "gegenüber"},
|
||||
{ORTH: "dgl.", LEMMA: "dergleichen", NORM: "dergleichen"},
|
||||
{ORTH: "ebd.", LEMMA: "ebenda", NORM: "ebenda"},
|
||||
{ORTH: "eigtl.", LEMMA: "eigentlich", NORM: "eigentlich"},
|
||||
{ORTH: "engl.", LEMMA: "englisch", NORM: "englisch"},
|
||||
{ORTH: "evtl.", LEMMA: "eventuell", NORM: "eventuell"},
|
||||
{ORTH: "frz.", LEMMA: "französisch", NORM: "französisch"},
|
||||
{ORTH: "gegr.", LEMMA: "gegründet", NORM: "gegründet"},
|
||||
{ORTH: "ggf.", LEMMA: "gegebenenfalls", NORM: "gegebenenfalls"},
|
||||
{ORTH: "ggfs.", LEMMA: "gegebenenfalls", NORM: "gegebenenfalls"},
|
||||
{ORTH: "ggü.", LEMMA: "gegenüber", NORM: "gegenüber"},
|
||||
{ORTH: "i.O.", LEMMA: "in Ordnung"},
|
||||
{ORTH: "i.d.R.", LEMMA: "in der Regel"},
|
||||
{ORTH: "incl.", LEMMA: "inklusive"},
|
||||
{ORTH: "inkl.", LEMMA: "inklusive"},
|
||||
{ORTH: "insb.", LEMMA: "insbesondere"},
|
||||
{ORTH: "kath.", LEMMA: "katholisch"},
|
||||
{ORTH: "lt.", LEMMA: "laut"},
|
||||
{ORTH: "max.", LEMMA: "maximal"},
|
||||
{ORTH: "min.", LEMMA: "minimal"},
|
||||
{ORTH: "mind.", LEMMA: "mindestens"},
|
||||
{ORTH: "mtl.", LEMMA: "monatlich"},
|
||||
{ORTH: "incl.", LEMMA: "inklusive", NORM: "inklusive"},
|
||||
{ORTH: "inkl.", LEMMA: "inklusive", NORM: "inklusive"},
|
||||
{ORTH: "insb.", LEMMA: "insbesondere", NORM: "insbesondere"},
|
||||
{ORTH: "kath.", LEMMA: "katholisch", NORM: "katholisch"},
|
||||
{ORTH: "lt.", LEMMA: "laut", NORM: "laut"},
|
||||
{ORTH: "max.", LEMMA: "maximal", NORM: "maximal"},
|
||||
{ORTH: "min.", LEMMA: "minimal", NORM: "minimal"},
|
||||
{ORTH: "mind.", LEMMA: "mindestens", NORM: "mindestens"},
|
||||
{ORTH: "mtl.", LEMMA: "monatlich", NORM: "monatlich"},
|
||||
{ORTH: "n.Chr.", LEMMA: "nach Christus"},
|
||||
{ORTH: "orig.", LEMMA: "original"},
|
||||
{ORTH: "röm.", LEMMA: "römisch"},
|
||||
{ORTH: "orig.", LEMMA: "original", NORM: "original"},
|
||||
{ORTH: "röm.", LEMMA: "römisch", NORM: "römisch"},
|
||||
{ORTH: "s.o.", LEMMA: "siehe oben"},
|
||||
{ORTH: "sog.", LEMMA: "so genannt"},
|
||||
{ORTH: "stellv.", LEMMA: "stellvertretend"},
|
||||
{ORTH: "tägl.", LEMMA: "täglich"},
|
||||
{ORTH: "tägl.", LEMMA: "täglich", NORM: "täglich"},
|
||||
{ORTH: "u.U.", LEMMA: "unter Umständen"},
|
||||
{ORTH: "u.s.w.", LEMMA: "und so weiter"},
|
||||
{ORTH: "u.v.m.", LEMMA: "und vieles mehr"},
|
||||
|
@ -153,9 +153,9 @@ for exc_data in [
|
|||
{ORTH: "v.Chr.", LEMMA: "vor Christus"},
|
||||
{ORTH: "v.a.", LEMMA: "vor allem"},
|
||||
{ORTH: "v.l.n.r.", LEMMA: "von links nach rechts"},
|
||||
{ORTH: "vgl.", LEMMA: "vergleiche"},
|
||||
{ORTH: "vllt.", LEMMA: "vielleicht"},
|
||||
{ORTH: "vlt.", LEMMA: "vielleicht"},
|
||||
{ORTH: "vgl.", LEMMA: "vergleiche", NORM: "vergleiche"},
|
||||
{ORTH: "vllt.", LEMMA: "vielleicht", NORM: "vielleicht"},
|
||||
{ORTH: "vlt.", LEMMA: "vielleicht", NORM: "vielleicht"},
|
||||
{ORTH: "z.B.", LEMMA: "zum Beispiel"},
|
||||
{ORTH: "z.Bsp.", LEMMA: "zum Beispiel"},
|
||||
{ORTH: "z.T.", LEMMA: "zum Teil"},
|
||||
|
@ -163,7 +163,7 @@ for exc_data in [
|
|||
{ORTH: "z.Zt.", LEMMA: "zur Zeit"},
|
||||
{ORTH: "z.b.", LEMMA: "zum Beispiel"},
|
||||
{ORTH: "zzgl.", LEMMA: "zuzüglich"},
|
||||
{ORTH: "österr.", LEMMA: "österreichisch"}]:
|
||||
{ORTH: "österr.", LEMMA: "österreichisch", NORM: "österreichisch"}]:
|
||||
_exc[exc_data[ORTH]] = [dict(exc_data)]
|
||||
|
||||
|
||||
|
|
|
@ -21,7 +21,7 @@ class EnglishDefaults(Language.Defaults):
|
|||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'en'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
|
||||
BASE_NORMS, NORM_EXCEPTIONS)
|
||||
NORM_EXCEPTIONS, BASE_NORMS)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
|
|
|
@ -1754,8 +1754,7 @@ _exc = {
|
|||
}
|
||||
|
||||
|
||||
NORM_EXCEPTIONS = {}
|
||||
|
||||
for string, norm in _exc.items():
|
||||
_exc[string.title()] = norm
|
||||
|
||||
|
||||
NORM_EXCEPTIONS = _exc
|
||||
NORM_EXCEPTIONS[string.title()] = norm
|
||||
|
|
|
@ -7,15 +7,17 @@ from .stop_words import STOP_WORDS
|
|||
from .lemmatizer import LOOKUP
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...lemmatizerlookup import Lemmatizer
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class SpanishDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'es'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tag_map = dict(TAG_MAP)
|
||||
|
|
|
@ -5,14 +5,16 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|||
from .stop_words import STOP_WORDS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class FinnishDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'fi'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
|
|
@ -7,15 +7,17 @@ from .stop_words import STOP_WORDS
|
|||
from .lemmatizer import LOOKUP
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...lemmatizerlookup import Lemmatizer
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class FrenchDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'fr'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
|
|
@ -7,15 +7,17 @@ from .stop_words import STOP_WORDS
|
|||
from .lemmatizer import LOOKUP
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...lemmatizerlookup import Lemmatizer
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class HungarianDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'hu'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
|
|
@ -5,15 +5,17 @@ from .stop_words import STOP_WORDS
|
|||
from .lemmatizer import LOOKUP
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...lemmatizerlookup import Lemmatizer
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class ItalianDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'it'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
|
|
@ -6,14 +6,16 @@ from .stop_words import STOP_WORDS
|
|||
from .morph_rules import MORPH_RULES
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class NorwegianDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'nb'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
|
|
@ -4,14 +4,16 @@ from __future__ import unicode_literals
|
|||
from .stop_words import STOP_WORDS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class DutchDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'nl'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
|
|
@ -4,14 +4,16 @@ from __future__ import unicode_literals
|
|||
from .stop_words import STOP_WORDS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class PolishDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'pl'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
|
|
@ -7,15 +7,17 @@ from .lex_attrs import LEX_ATTRS
|
|||
from .lemmatizer import LOOKUP
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...lemmatizerlookup import Lemmatizer
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class PortugueseDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'pt'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
|
|
|
@ -7,15 +7,17 @@ from .morph_rules import MORPH_RULES
|
|||
from .lemmatizer import LEMMA_RULES, LOOKUP
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...lemmatizerlookup import Lemmatizer
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class SwedishDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'sv'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
|
|
@ -3,14 +3,16 @@ from __future__ import unicode_literals
|
|||
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class MultiLanguageDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'xx'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
|
||||
|
|
|
@ -8,20 +8,33 @@ import pytest
|
|||
|
||||
|
||||
@pytest.mark.parametrize('text', ["auf'm", "du's", "über'm", "wir's"])
|
||||
def test_tokenizer_splits_contractions(de_tokenizer, text):
|
||||
def test_de_tokenizer_splits_contractions(de_tokenizer, text):
|
||||
tokens = de_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."])
|
||||
def test_tokenizer_handles_abbr(de_tokenizer, text):
|
||||
def test_de_tokenizer_handles_abbr(de_tokenizer, text):
|
||||
tokens = de_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
|
||||
|
||||
def test_tokenizer_handles_exc_in_text(de_tokenizer):
|
||||
def test_de_tokenizer_handles_exc_in_text(de_tokenizer):
|
||||
text = "Ich bin z.Zt. im Urlaub."
|
||||
tokens = de_tokenizer(text)
|
||||
assert len(tokens) == 6
|
||||
assert tokens[2].text == "z.Zt."
|
||||
assert tokens[2].lemma_ == "zur Zeit"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,norms', [("vor'm", ["vor", "dem"]), ("du's", ["du", "es"])])
|
||||
def test_de_tokenizer_norm_exceptions(de_tokenizer, text, norms):
|
||||
tokens = de_tokenizer(text)
|
||||
assert [token.norm_ for token in tokens] == norms
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize('text,norm', [("daß", "dass")])
|
||||
def test_de_lex_attrs_norm_exceptions(de_tokenizer, text, norm):
|
||||
tokens = de_tokenizer(text)
|
||||
assert tokens[0].norm_ == norm
|
||||
|
|
|
@ -102,3 +102,16 @@ def test_en_tokenizer_handles_times(en_tokenizer, text):
|
|||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
assert tokens[1].lemma_ in ["a.m.", "p.m."]
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,norms', [("I'm", ["i", "am"]), ("shan't", ["shall", "not"])])
|
||||
def test_en_tokenizer_norm_exceptions(en_tokenizer, text, norms):
|
||||
tokens = en_tokenizer(text)
|
||||
assert [token.norm_ for token in tokens] == norms
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize('text,norm', [("radicalised", "radicalized"), ("cuz", "because")])
|
||||
def test_en_lex_attrs_norm_exceptions(en_tokenizer, text, norm):
|
||||
tokens = en_tokenizer(text)
|
||||
assert tokens[0].norm_ == norm
|
||||
|
|
|
@ -383,3 +383,14 @@ mixin annotation-row(annots, style)
|
|||
else
|
||||
+cell=cell
|
||||
block
|
||||
|
||||
|
||||
//- Table of contents, to be used with +item mixins for links
|
||||
col - [string] width of column (see +grid-col)
|
||||
|
||||
mixin table-of-contents(col)
|
||||
+grid-col(col || "half")
|
||||
+infobox
|
||||
+label.o-block-small Table of contents
|
||||
+list("numbers").u-text-small.o-no-block
|
||||
block
|
||||
|
|
|
@ -5,31 +5,50 @@ include ../../_includes/_mixins
|
|||
p
|
||||
| Adding full support for a language touches many different parts of the
|
||||
| spaCy library. This guide explains how to fit everything together, and
|
||||
| points you to the specific workflows for each component. Obviously,
|
||||
| there are lots of ways you can organise your code when you implement
|
||||
| your own #[+api("language") #[code Language]] class. This guide will
|
||||
| focus on how it's done within spaCy. For full language support, we'll
|
||||
| need to:
|
||||
| points you to the specific workflows for each component.
|
||||
|
||||
+list("numbers")
|
||||
+item
|
||||
| Create a #[strong #[code Language] subclass].
|
||||
+item
|
||||
| Define custom #[strong language data], like a stop list and tokenizer
|
||||
| exceptions.
|
||||
+item
|
||||
| #[strong Test] the new language tokenizer.
|
||||
+item
|
||||
| #[strong Build the vocabulary], including word frequencies, Brown
|
||||
| clusters and word vectors.
|
||||
+item
|
||||
| Set up a #[strong model direcory] and #[strong train] the tagger and
|
||||
| parser.
|
||||
+grid.o-no-block
|
||||
+grid-col("half")
|
||||
p
|
||||
| Obviously, there are lots of ways you can organise your code when
|
||||
| you implement your own language data. This guide will focus on
|
||||
| how it's done within spaCy. For full language support, you'll
|
||||
| need to create a #[code Language] subclass, define custom
|
||||
| #[strong language data], like a stop list and tokenizer
|
||||
| exceptions and test the new tokenizer. Once the language is set
|
||||
| up, you can #[strong build the vocabulary], including word
|
||||
| frequencies, Brown clusters and word vectors. Finally, you can
|
||||
| #[strong train the tagger and parser], and save the model to a
|
||||
| directory.
|
||||
|
||||
p
|
||||
p
|
||||
| For some languages, you may also want to develop a solution for
|
||||
| lemmatization and morphological analysis.
|
||||
|
||||
+table-of-contents
|
||||
+item #[+a("#language-subclass") The Language subclass]
|
||||
+item #[+a("#language-data") Adding language data]
|
||||
+item #[+a("#stop-words") Stop words]
|
||||
+item #[+a("#tokenizer-exceptions") Tokenizer exceptions]
|
||||
+item #[+a("#norm-exceptions") Norm exceptions]
|
||||
+item #[+a("#lex-attrs") Lexical attributes]
|
||||
+item #[+a("#lemmatizer") Lemmatizer]
|
||||
+item #[+a("#tag-map") Tag map]
|
||||
+item #[+a("#morph-rules") Morph rules]
|
||||
+item #[+a("#testing") Testing the tokenizer]
|
||||
+item #[+a("#vocabulary") Building the vocabulary]
|
||||
+item #[+a("#training") Training]
|
||||
|
||||
+aside("Working on spaCy's source")
|
||||
| To add a new language to spaCy, you'll need to
|
||||
| #[strong modify the library's code]. The easiest way to do this is to
|
||||
| clone the #[+src(gh("spaCy")) repository] and #[strong build spaCy from source].
|
||||
| For more information on this, see the #[+a("/docs/usage") installation guide].
|
||||
| Unlike spaCy's core, which is mostly written in Cython, all language
|
||||
| data is stored in regular Python files. This means that you won't have to
|
||||
| rebuild anything in between – you can simply make edits and reload spaCy
|
||||
| to test them.
|
||||
|
||||
+h(2, "language-subclass") Creating a #[code Language] subclass
|
||||
|
||||
p
|
||||
|
@ -123,6 +142,14 @@ p
|
|||
| Special-case rules for the tokenizer, for example, contractions
|
||||
| and abbreviations containing punctuation.
|
||||
|
||||
+row
|
||||
+cell #[+src(gh("spaCy", "spacy/lang/norm_exceptions.py")) norm_exceptions.py]
|
||||
+cell
|
||||
| #[code NORM_EXCEPTIONS] (dict)
|
||||
+cell
|
||||
| Special-case rules for normalising tokens and assigning norms,
|
||||
| for example American vs. British spelling.
|
||||
|
||||
+row
|
||||
+cell #[+src(gh("spaCy", "spacy/lang/punctuation.py")) punctuation.py]
|
||||
+cell
|
||||
|
@ -235,7 +262,7 @@ p
|
|||
TOKENIZER_EXCEPTIONS = {
|
||||
"don't": [
|
||||
{ORTH: "do", LEMMA: "do"},
|
||||
{ORTH: "n't", LEMMA: "not", TAG: "RB"}]
|
||||
{ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}]
|
||||
}
|
||||
|
||||
+infobox("Important note")
|
||||
|
@ -303,13 +330,74 @@ p
|
|||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
# {"a.": [{ORTH: "a.", LEMMA: "all"}], ":)": [{ORTH: ":)"}]}
|
||||
|
||||
//-+aside("About spaCy's custom pronoun lemma")
|
||||
+infobox("About spaCy's custom pronoun lemma")
|
||||
| Unlike verbs and common nouns, there's no clear base form of a personal
|
||||
| pronoun. Should the lemma of "me" be "I", or should we normalize person
|
||||
| as well, giving "it" — or maybe "he"? spaCy's solution is to introduce a
|
||||
| novel symbol, #[code.u-nowrap -PRON-], which is used as the lemma for
|
||||
| all personal pronouns.
|
||||
|
||||
+h(3, "norm-exceptions") Norm exceptions
|
||||
|
||||
p
|
||||
| In addition to #[code ORTH] or #[code LEMMA], tokenizer exceptions can
|
||||
| also set a #[code NORM] attribute. This is useful to specify a normalised
|
||||
| version of the token – for example, the norm of "n't" is "not". By default,
|
||||
| a token's norm equals its lowercase text. If the lowercase spelling of a
|
||||
| word exists, norms should always be in lowercase.
|
||||
|
||||
+aside-code("Accessing norms").
|
||||
doc = nlp(u"I can't")
|
||||
assert [t.norm_ for t in doc] == ['i', 'can', 'not']
|
||||
|
||||
p
|
||||
| spaCy usually tries to normalise words with different spellings to a single,
|
||||
| common spelling. This has no effect on any other token attributes, or
|
||||
| tokenization in general, but it ensures that
|
||||
| #[strong equivalent tokens receive similar representations]. This can
|
||||
| improve the model's predictions on words that weren't common in the
|
||||
| training data, but are equivalent to other words – for example, "realize"
|
||||
| and "realise", or "thx" and "thanks".
|
||||
|
||||
p
|
||||
| Similarly, spaCy also includes
|
||||
| #[+src(gh("spaCy", "spacy/lang/norm_exceptions.py")) global base norms]
|
||||
| for normalising different styles of quotation marks and currency
|
||||
| symbols. Even though #[code $] and #[code €] are very different, spaCy
|
||||
| normalises them both to #[code $]. This way, they'll always be seen as
|
||||
| similar, no matter how common they were in the training data.
|
||||
|
||||
p
|
||||
| Norm exceptions can be provided as a simple dictionary. For more examples,
|
||||
| see the English
|
||||
| #[+src(gh("spaCy", "spacy/lang/en/norm_exceptions.py")) norm_exceptions.py].
|
||||
|
||||
+code("Example").
|
||||
NORM_EXCEPTIONS = {
|
||||
"cos": "because",
|
||||
"fav": "favorite",
|
||||
"accessorise": "accessorize",
|
||||
"accessorised": "accessorized"
|
||||
}
|
||||
|
||||
p
|
||||
| To add the custom norm exceptions lookup table, you can use the
|
||||
| #[code add_lookups()] helper function. It takes the default attribute
|
||||
| getter function as its first argument, plus a variable list of
|
||||
| dictionaries. If a string is found in one of the dictionaries,
|
||||
| that value is used – otherwise, the default function is called and the
|
||||
| token is assigned its default norm.
|
||||
|
||||
+code.
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
|
||||
NORM_EXCEPTIONS, BASE_NORMS)
|
||||
|
||||
p
|
||||
| The order of the dictionaries is also the lookup order – so if your
|
||||
| language's norm exceptions overwrite any of the global exceptions, they
|
||||
| should be added first. Also note that the tokenizer exceptions will
|
||||
| always have priority over the attribute getters.
|
||||
|
||||
+h(3, "lex-attrs") Lexical attributes
|
||||
|
||||
p
|
||||
|
|
|
@ -38,10 +38,7 @@ p
|
|||
| #[strong natural language understanding] systems, or to
|
||||
| pre-process text for #[strong deep learning].
|
||||
|
||||
+grid-col("half")
|
||||
+infobox
|
||||
+label.o-block-small Table of contents
|
||||
+list("numbers").u-text-small.o-no-block
|
||||
+table-of-contents
|
||||
+item #[+a("#features") Features]
|
||||
+item #[+a("#annotations") Linguistic annotations]
|
||||
+item #[+a("#annotations-token") Tokenization]
|
||||
|
|
Loading…
Reference in New Issue
Block a user