Reduce stored lexemes data, move feats to lookups (#5238)

* Reduce stored lexemes data, move feats to lookups

* Move non-derivable lexeme features (`norm` / `cluster` / `prob`) to
`spacy-lookups-data` as lookups
  * Get/set `norm` in both lookups and `LexemeC`, serialize in lookups
  * Remove `cluster` and `prob` from `LexemeC`, get/set/serialize in
    lookups only
* Remove serialization of lexeme data as `vocab/lexemes.bin`
  * Remove `SerializedLexemeC`
  * Remove `Lexeme.to_bytes/from_bytes`
* Modify normalization exception loading:
  * Always create `Vocab.lookups` table `lexeme_norm` for
    normalization exceptions
  * Load base exceptions from `lang.norm_exceptions`, but load
    language-specific exceptions from lookups
  * Set `lex_attr_getters[NORM]` to include the new lookups table in
    `BaseDefaults.create_vocab()` and when deserializing `Vocab` (see the
    sketch below)
* Remove all cached lexemes when deserializing the vocab so that existing
  normalizations are overridden by the new ones (this replaces the previous
  step, which overwrote all lexeme data with the deserialized data)
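
As a rough illustration of the lookups-based approach (a minimal sketch only,
not the actual implementation; the helper name `make_norm_getter` is made up,
while `Lookups`, `add_table` and `get_table` are the existing `spacy.lookups`
API):

```python
# Sketch: NORM is resolved through a "lexeme_norm" lookups table first,
# falling back to plain lowercasing when there is no exception entry.
from spacy.lookups import Lookups


def make_norm_getter(lookups):
    def get_norm(string):
        table = lookups.get_table("lexeme_norm")
        return table.get(string, string.lower())
    return get_norm


lookups = Lookups()
lookups.add_table("lexeme_norm", {"daß": "dass"})  # language-specific exceptions
get_norm = make_norm_getter(lookups)
assert get_norm("daß") == "dass"
assert get_norm("Words") == "words"
```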

* Skip English normalization test

Skip English normalization test because the data is now in
`spacy-lookups-data`.

* Remove norm exceptions

Moved to spacy-lookups-data.

* Move norm exceptions test to spacy-lookups-data

* Load extra lookups from spacy-lookups-data lazily

Load extra lookups (currently for cluster and prob) lazily from the
entry point `lg_extra` as `Vocab.lookups_extra`.
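
A hedged sketch of the lazy-loading idea (illustrative only; the helper and
cache names are made up, and the `spacy_lookups` group with `<lang>_extra`
entry names is an assumption based on the description above):

```python
# Sketch: extra tables (e.g. lexeme_cluster, lexeme_prob) are resolved from a
# package entry point the first time they are requested and then cached, so
# they add no cost to model loading when they aren't used.
import pkg_resources

_EXTRA_CACHE = {}


def get_extra_lookups(lang):
    if lang not in _EXTRA_CACHE:
        tables = {}
        for ep in pkg_resources.iter_entry_points("spacy_lookups"):
            if ep.name == lang + "_extra":
                # assumes the entry point resolves to a dict of tables
                tables.update(ep.load())
        _EXTRA_CACHE[lang] = tables
    return _EXTRA_CACHE[lang]
```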

* Skip creating lexeme cache on load

To improve model loading times, do not create the full lexeme cache when
loading. The lexemes will be created on demand when processing.
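
Illustrative effect (not part of the diff): with the cache skipped, a freshly
created pipeline starts with a near-empty vocab and lexemes only appear once
text is processed.

```python
import spacy

nlp = spacy.blank("en")
n_before = len(nlp.vocab)            # cached lexemes right after loading
nlp("Lexemes are created on demand.")
assert len(nlp.vocab) > n_before     # processing text added new lexemes
```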

* Identify numeric values in Lexeme.set_attrs()

With the removal of the special case for `PROB`, also identify `float`
values to avoid trying to convert them with the `StringStore`.
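
Roughly, the value handling amounts to something like the following
(simplified; `maybe_intern` is a made-up helper, not spaCy's code):

```python
# Simplified sketch: string values are interned via the StringStore, while
# values that are already numeric (ints, or floats such as PROB) are
# identified and left alone instead of being passed to StringStore.add().
def maybe_intern(strings, value):
    if isinstance(value, (int, float)):
        return value
    return strings.add(value)
```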

* Skip lexeme cache init in from_bytes

* Unskip and update lookups tests for python3.6+

* Update vocab pickle to include lookups_extra

* Update vocab serialization tests

Check strings rather than lexemes, since lexemes aren't initialized
automatically, and account for the addition of "_SP".
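
An illustrative check in the same spirit as the updated tests (a sketch, not
the exact test code): the deserialized side may gain extra strings such as
"_SP", so the original strings are only checked as a subset.

```python
import spacy
from spacy.vocab import Vocab

vocab1 = spacy.blank("en").vocab
vocab2 = Vocab().from_bytes(vocab1.to_bytes())
assert set(vocab1.strings) <= set(vocab2.strings)
```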

* Re-skip lookups test because of python3.5

* Skip PROB/float values in Lexeme.set_attrs

* Convert is_oov from lexeme flag to lex in vectors

Instead of storing `is_oov` as a lexeme flag, `is_oov` reports whether
the lexeme has a vector.
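
Conceptually (a simplification, not the literal Cython change):

```python
# Sketch: is_oov now means "no vector entry for this lexeme's orth key"
# rather than a manually maintained lexeme flag.
def lexeme_is_oov(lexeme):
    return lexeme.orth not in lexeme.vocab.vectors
```

In a pipeline without word vectors, every lexeme therefore reports
`is_oov == True`.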

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
Committed by adrianeboyd on 2020-05-19 15:59:14 +02:00 (via GitHub)
parent a41e28ceba
commit a5cd203284
45 changed files with 161 additions and 6182 deletions


@@ -15,7 +15,7 @@ cdef enum attr_id_t:
     LIKE_NUM
     LIKE_EMAIL
     IS_STOP
-    IS_OOV
+    IS_OOV_DEPRECATED
     IS_BRACKET
     IS_QUOTE
     IS_LEFT_PUNCT


@@ -16,7 +16,7 @@ IDS = {
     "LIKE_NUM": LIKE_NUM,
     "LIKE_EMAIL": LIKE_EMAIL,
     "IS_STOP": IS_STOP,
-    "IS_OOV": IS_OOV,
+    "IS_OOV_DEPRECATED": IS_OOV_DEPRECATED,
     "IS_BRACKET": IS_BRACKET,
     "IS_QUOTE": IS_QUOTE,
     "IS_LEFT_PUNCT": IS_LEFT_PUNCT,


@@ -157,15 +157,11 @@ def create_model(lang, lex_attrs, name=None):
     nlp = lang_class()
     for lexeme in nlp.vocab:
         lexeme.rank = OOV_RANK
-    lex_added = 0
     for attrs in lex_attrs:
         if "settings" in attrs:
             continue
         lexeme = nlp.vocab[attrs["orth"]]
         lexeme.set_attrs(**attrs)
-        lexeme.is_oov = False
-        lex_added += 1
-        lex_added += 1
     if len(nlp.vocab):
         oov_prob = min(lex.prob for lex in nlp.vocab) - 1
     else:
@@ -193,8 +189,7 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
     if vector_keys is not None:
         for word in vector_keys:
             if word not in nlp.vocab:
-                lexeme = nlp.vocab[word]
-                lexeme.is_oov = False
+                nlp.vocab[word]
     if vectors_data is not None:
         nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
     if name is None:


@@ -15,7 +15,6 @@ import random
 from .._ml import create_default_optimizer
 from ..util import use_gpu as set_gpu
-from ..attrs import PROB, IS_OOV, CLUSTER, LANG
 from ..gold import GoldCorpus
 from ..compat import path2str
 from .. import util
@@ -630,15 +629,6 @@ def _create_progress_bar(total):
 def _load_vectors(nlp, vectors):
     util.load_model(vectors, vocab=nlp.vocab)
-    for lex in nlp.vocab:
-        values = {}
-        for attr, func in nlp.vocab.lex_attr_getters.items():
-            # These attrs are expected to be set by data. Others should
-            # be set by calling the language functions.
-            if attr not in (CLUSTER, PROB, IS_OOV, LANG):
-                values[lex.vocab.strings[attr]] = func(lex.orth_)
-        lex.set_attrs(**values)
-        lex.is_oov = False
 def _load_pretrained_tok2vec(nlp, loc):


@@ -2,7 +2,6 @@
 from __future__ import unicode_literals
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .norm_exceptions import NORM_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
@@ -12,17 +11,14 @@ from ..tag_map import TAG_MAP
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...attrs import LANG
+from ...util import update_exc
 class DanishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: "da"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     morph_rules = MORPH_RULES
     infixes = TOKENIZER_INFIXES


@@ -1,527 +0,0 @@
# coding: utf8
"""
Special-case rules for normalizing tokens to improve the model's predictions.
For example 'mysterium' vs 'mysterie' and similar.
"""
from __future__ import unicode_literals
# Sources:
# 1: https://dsn.dk/retskrivning/om-retskrivningsordbogen/mere-om-retskrivningsordbogen-2012/endrede-stave-og-ordformer/
# 2: http://www.tjerry-korrektur.dk/ord-med-flere-stavemaader/
_exc = {
# Alternative spelling
"a-kraft-værk": "a-kraftværk", # 1
"ålborg": "aalborg", # 2
"århus": "aarhus",
"accessoirer": "accessoires", # 1
"affektert": "affekteret", # 1
"afrikander": "afrikaaner", # 1
"aftabuere": "aftabuisere", # 1
"aftabuering": "aftabuisering", # 1
"akvarium": "akvarie", # 1
"alenefader": "alenefar", # 1
"alenemoder": "alenemor", # 1
"alkoholambulatorium": "alkoholambulatorie", # 1
"ambulatorium": "ambulatorie", # 1
"ananassene": "ananasserne", # 2
"anførelsestegn": "anførselstegn", # 1
"anseelig": "anselig", # 2
"antioxydant": "antioxidant", # 1
"artrig": "artsrig", # 1
"auditorium": "auditorie", # 1
"avocado": "avokado", # 2
"bagerst": "bagest", # 2
"bagstræv": "bagstræb", # 1
"bagstræver": "bagstræber", # 1
"bagstræverisk": "bagstræberisk", # 1
"balde": "balle", # 2
"barselorlov": "barselsorlov", # 1
"barselvikar": "barselsvikar", # 1
"baskien": "baskerlandet", # 1
"bayrisk": "bayersk", # 1
"bedstefader": "bedstefar", # 1
"bedstemoder": "bedstemor", # 1
"behefte": "behæfte", # 1
"beheftelse": "behæftelse", # 1
"bidragydende": "bidragsydende", # 1
"bidragyder": "bidragsyder", # 1
"billiondel": "billiontedel", # 1
"blaseret": "blasert", # 1
"bleskifte": "bleskift", # 1
"blodbroder": "blodsbroder", # 2
"blyantspidser": "blyantsspidser", # 2
"boligministerium": "boligministerie", # 1
"borhul": "borehul", # 1
"broder": "bror", # 2
"buldog": "bulldog", # 2
"bådhus": "bådehus", # 1
"børnepleje": "barnepleje", # 1
"børneseng": "barneseng", # 1
"børnestol": "barnestol", # 1
"cairo": "kairo", # 1
"cambodia": "cambodja", # 1
"cambodianer": "cambodjaner", # 1
"cambodiansk": "cambodjansk", # 1
"camouflage": "kamuflage", # 2
"campylobacter": "kampylobakter", # 1
"centeret": "centret", # 2
"chefskahyt": "chefkahyt", # 1
"chefspost": "chefpost", # 1
"chefssekretær": "chefsekretær", # 1
"chefsstol": "chefstol", # 1
"cirkulærskrivelse": "cirkulæreskrivelse", # 1
"cognacsglas": "cognacglas", # 1
"columnist": "kolumnist", # 1
"cricket": "kricket", # 2
"dagplejemoder": "dagplejemor", # 1
"damaskesdug": "damaskdug", # 1
"damp-barn": "dampbarn", # 1
"delfinarium": "delfinarie", # 1
"dentallaboratorium": "dentallaboratorie", # 1
"diaramme": "diasramme", # 1
"diaré": "diarré", # 1
"dioxyd": "dioxid", # 1
"dommedagsprædiken": "dommedagspræken", # 1
"donut": "doughnut", # 2
"driftmæssig": "driftsmæssig", # 1
"driftsikker": "driftssikker", # 1
"driftsikring": "driftssikring", # 1
"drikkejogurt": "drikkeyoghurt", # 1
"drivein": "drive-in", # 1
"driveinbiograf": "drive-in-biograf", # 1
"drøvel": "drøbel", # 1
"dødskriterium": "dødskriterie", # 1
"e-mail-adresse": "e-mailadresse", # 1
"e-post-adresse": "e-postadresse", # 1
"egypten": "ægypten", # 2
"ekskommunicere": "ekskommunikere", # 1
"eksperimentarium": "eksperimentarie", # 1
"elsass": "Alsace", # 1
"elsasser": "alsacer", # 1
"elsassisk": "alsacisk", # 1
"elvetal": "ellevetal", # 1
"elvetiden": "ellevetiden", # 1
"elveårig": "elleveårig", # 1
"elveårs": "elleveårs", # 1
"elveårsbarn": "elleveårsbarn", # 1
"elvte": "ellevte", # 1
"elvtedel": "ellevtedel", # 1
"energiministerium": "energiministerie", # 1
"erhvervsministerium": "erhvervsministerie", # 1
"espaliere": "spaliere", # 2
"evangelium": "evangelie", # 1
"fagministerium": "fagministerie", # 1
"fakse": "faxe", # 1
"fangstkvota": "fangstkvote", # 1
"fader": "far", # 2
"farbroder": "farbror", # 1
"farfader": "farfar", # 1
"farmoder": "farmor", # 1
"federal": "føderal", # 1
"federalisering": "føderalisering", # 1
"federalisme": "føderalisme", # 1
"federalist": "føderalist", # 1
"federalistisk": "føderalistisk", # 1
"federation": "føderation", # 1
"federativ": "føderativ", # 1
"fejlbeheftet": "fejlbehæftet", # 1
"femetagers": "femetages", # 2
"femhundredekroneseddel": "femhundredkroneseddel", # 2
"filmpremiere": "filmpræmiere", # 2
"finansimperium": "finansimperie", # 1
"finansministerium": "finansministerie", # 1
"firehjulstræk": "firhjulstræk", # 2
"fjernstudium": "fjernstudie", # 1
"formalier": "formalia", # 1
"formandsskift": "formandsskifte", # 1
"fornemst": "fornemmest", # 2
"fornuftparti": "fornuftsparti", # 1
"fornuftstridig": "fornuftsstridig", # 1
"fornuftvæsen": "fornuftsvæsen", # 1
"fornuftægteskab": "fornuftsægteskab", # 1
"forretningsministerium": "forretningsministerie", # 1
"forskningsministerium": "forskningsministerie", # 1
"forstudium": "forstudie", # 1
"forsvarsministerium": "forsvarsministerie", # 1
"frilægge": "fritlægge", # 1
"frilæggelse": "fritlæggelse", # 1
"frilægning": "fritlægning", # 1
"fristille": "fritstille", # 1
"fristilling": "fritstilling", # 1
"fuldttegnet": "fuldtegnet", # 1
"fødestedskriterium": "fødestedskriterie", # 1
"fødevareministerium": "fødevareministerie", # 1
"følesløs": "følelsesløs", # 1
"følgeligt": "følgelig", # 1
"førne": "førn", # 1
"gearskift": "gearskifte", # 2
"gladeligt": "gladelig", # 1
"glosehefte": "glosehæfte", # 1
"glædeløs": "glædesløs", # 1
"gonoré": "gonorré", # 1
"grangiveligt": "grangivelig", # 1
"grundliggende": "grundlæggende", # 2
"grønsag": "grøntsag", # 2
"gudbenådet": "gudsbenådet", # 1
"gudfader": "gudfar", # 1
"gudmoder": "gudmor", # 1
"gulvmop": "gulvmoppe", # 1
"gymnasium": "gymnasie", # 1
"hackning": "hacking", # 1
"halvbroder": "halvbror", # 1
"halvelvetiden": "halvellevetiden", # 1
"handelsgymnasium": "handelsgymnasie", # 1
"hefte": "hæfte", # 1
"hefteklamme": "hæfteklamme", # 1
"heftelse": "hæftelse", # 1
"heftemaskine": "hæftemaskine", # 1
"heftepistol": "hæftepistol", # 1
"hefteplaster": "hæfteplaster", # 1
"heftestraf": "hæftestraf", # 1
"heftning": "hæftning", # 1
"helbroder": "helbror", # 1
"hjemmeklasse": "hjemklasse", # 1
"hjulspin": "hjulspind", # 1
"huggevåben": "hugvåben", # 1
"hulmurisolering": "hulmursisolering", # 1
"hurtiggående": "hurtigtgående", # 2
"hurtigttørrende": "hurtigtørrende", # 2
"husmoder": "husmor", # 1
"hydroxyd": "hydroxid", # 1
"håndmikser": "håndmixer", # 1
"højtaler": "højttaler", # 2
"hønemoder": "hønemor", # 1
"ide": "idé", # 2
"imperium": "imperie", # 1
"imponerthed": "imponerethed", # 1
"inbox": "indboks", # 2
"indenrigsministerium": "indenrigsministerie", # 1
"indhefte": "indhæfte", # 1
"indheftning": "indhæftning", # 1
"indicium": "indicie", # 1
"indkassere": "inkassere", # 2
"iota": "jota", # 1
"jobskift": "jobskifte", # 1
"jogurt": "yoghurt", # 1
"jukeboks": "jukebox", # 1
"justitsministerium": "justitsministerie", # 1
"kalorifere": "kalorifer", # 1
"kandidatstipendium": "kandidatstipendie", # 1
"kannevas": "kanvas", # 1
"kaperssauce": "kaperssovs", # 1
"kigge": "kikke", # 2
"kirkeministerium": "kirkeministerie", # 1
"klapmydse": "klapmyds", # 1
"klimakterium": "klimakterie", # 1
"klogeligt": "klogelig", # 1
"knivblad": "knivsblad", # 1
"kollegaer": "kolleger", # 2
"kollegium": "kollegie", # 1
"kollegiehefte": "kollegiehæfte", # 1
"kollokviumx": "kollokvium", # 1
"kommissorium": "kommissorie", # 1
"kompendium": "kompendie", # 1
"komplicerthed": "komplicerethed", # 1
"konfederation": "konføderation", # 1
"konfedereret": "konfødereret", # 1
"konferensstudium": "konferensstudie", # 1
"konservatorium": "konservatorie", # 1
"konsulere": "konsultere", # 1
"kradsbørstig": "krasbørstig", # 2
"kravsspecifikation": "kravspecifikation", # 1
"krematorium": "krematorie", # 1
"krep": "crepe", # 1
"krepnylon": "crepenylon", # 1
"kreppapir": "crepepapir", # 1
"kricket": "cricket", # 2
"kriterium": "kriterie", # 1
"kroat": "kroater", # 2
"kroki": "croquis", # 1
"kronprinsepar": "kronprinspar", # 2
"kropdoven": "kropsdoven", # 1
"kroplus": "kropslus", # 1
"krøllefedt": "krølfedt", # 1
"kulturministerium": "kulturministerie", # 1
"kuponhefte": "kuponhæfte", # 1
"kvota": "kvote", # 1
"kvotaordning": "kvoteordning", # 1
"laboratorium": "laboratorie", # 1
"laksfarve": "laksefarve", # 1
"laksfarvet": "laksefarvet", # 1
"laksrød": "lakserød", # 1
"laksyngel": "lakseyngel", # 1
"laksørred": "lakseørred", # 1
"landbrugsministerium": "landbrugsministerie", # 1
"landskampstemning": "landskampsstemning", # 1
"langust": "languster", # 1
"lappegrejer": "lappegrej", # 1
"lavløn": "lavtløn", # 1
"lillebroder": "lillebror", # 1
"linear": "lineær", # 1
"loftlampe": "loftslampe", # 2
"log-in": "login", # 1
"login": "log-in", # 2
"lovmedholdig": "lovmedholdelig", # 1
"ludder": "luder", # 2
"lysholder": "lyseholder", # 1
"lægeskifte": "lægeskift", # 1
"lærvillig": "lærevillig", # 1
"løgsauce": "løgsovs", # 1
"madmoder": "madmor", # 1
"majonæse": "mayonnaise", # 1
"mareridtagtig": "mareridtsagtig", # 1
"margen": "margin", # 2
"martyrium": "martyrie", # 1
"mellemstatlig": "mellemstatslig", # 1
"menneskene": "menneskerne", # 2
"metropolis": "metropol", # 1
"miks": "mix", # 1
"mikse": "mixe", # 1
"miksepult": "mixerpult", # 1
"mikser": "mixer", # 1
"mikserpult": "mixerpult", # 1
"mikslån": "mixlån", # 1
"miksning": "mixning", # 1
"miljøministerium": "miljøministerie", # 1
"milliarddel": "milliardtedel", # 1
"milliondel": "milliontedel", # 1
"ministerium": "ministerie", # 1
"mop": "moppe", # 1
"moder": "mor", # 2
"moratorium": "moratorie", # 1
"morbroder": "morbror", # 1
"morfader": "morfar", # 1
"mormoder": "mormor", # 1
"musikkonservatorium": "musikkonservatorie", # 1
"muslingskal": "muslingeskal", # 1
"mysterium": "mysterie", # 1
"naturalieydelse": "naturalydelse", # 1
"naturalieøkonomi": "naturaløkonomi", # 1
"navnebroder": "navnebror", # 1
"nerium": "nerie", # 1
"nådeløs": "nådesløs", # 1
"nærforestående": "nærtforestående", # 1
"nærstående": "nærtstående", # 1
"observatorium": "observatorie", # 1
"oldefader": "oldefar", # 1
"oldemoder": "oldemor", # 1
"opgraduere": "opgradere", # 1
"opgraduering": "opgradering", # 1
"oratorium": "oratorie", # 1
"overbookning": "overbooking", # 1
"overpræsidium": "overpræsidie", # 1
"overstatlig": "overstatslig", # 1
"oxyd": "oxid", # 1
"oxydere": "oxidere", # 1
"oxydering": "oxidering", # 1
"pakkenellike": "pakkenelliker", # 1
"papirtynd": "papirstynd", # 1
"pastoralseminarium": "pastoralseminarie", # 1
"peanutsene": "peanuttene", # 2
"penalhus": "pennalhus", # 2
"pensakrav": "pensumkrav", # 1
"pepperoni": "peperoni", # 1
"peruaner": "peruvianer", # 1
"petrole": "petrol", # 1
"piltast": "piletast", # 1
"piltaste": "piletast", # 1
"planetarium": "planetarie", # 1
"plasteret": "plastret", # 2
"plastic": "plastik", # 2
"play-off-kamp": "playoffkamp", # 1
"plejefader": "plejefar", # 1
"plejemoder": "plejemor", # 1
"podium": "podie", # 2
"praha": "prag", # 2
"preciøs": "pretiøs", # 2
"privilegium": "privilegie", # 1
"progredere": "progrediere", # 1
"præsidium": "præsidie", # 1
"psykodelisk": "psykedelisk", # 1
"pudsegrejer": "pudsegrej", # 1
"referensgruppe": "referencegruppe", # 1
"referensramme": "referenceramme", # 1
"refugium": "refugie", # 1
"registeret": "registret", # 2
"remedium": "remedie", # 1
"remiks": "remix", # 1
"reservert": "reserveret", # 1
"ressortministerium": "ressortministerie", # 1
"ressource": "resurse", # 2
"resætte": "resette", # 1
"rettelig": "retteligt", # 1
"rettetaste": "rettetast", # 1
"returtaste": "returtast", # 1
"risici": "risikoer", # 2
"roll-on": "rollon", # 1
"rollehefte": "rollehæfte", # 1
"rostbøf": "roastbeef", # 1
"rygsæksturist": "rygsækturist", # 1
"rødstjært": "rødstjert", # 1
"saddel": "sadel", # 2
"samaritan": "samaritaner", # 2
"sanatorium": "sanatorie", # 1
"sauce": "sovs", # 1
"scanning": "skanning", # 2
"sceneskifte": "sceneskift", # 1
"scilla": "skilla", # 1
"sejflydende": "sejtflydende", # 1
"selvstudium": "selvstudie", # 1
"seminarium": "seminarie", # 1
"sennepssauce": "sennepssovs ", # 1
"servitutbeheftet": "servitutbehæftet", # 1
"sit-in": "sitin", # 1
"skatteministerium": "skatteministerie", # 1
"skifer": "skiffer", # 2
"skyldsfølelse": "skyldfølelse", # 1
"skysauce": "skysovs", # 1
"sladdertaske": "sladretaske", # 2
"sladdervorn": "sladrevorn", # 2
"slagsbroder": "slagsbror", # 1
"slettetaste": "slettetast", # 1
"smørsauce": "smørsovs", # 1
"snitsel": "schnitzel", # 1
"snobbeeffekt": "snobeffekt", # 2
"socialministerium": "socialministerie", # 1
"solarium": "solarie", # 1
"soldebroder": "soldebror", # 1
"spagetti": "spaghetti", # 1
"spagettistrop": "spaghettistrop", # 1
"spagettiwestern": "spaghettiwestern", # 1
"spin-off": "spinoff", # 1
"spinnefiskeri": "spindefiskeri", # 1
"spolorm": "spoleorm", # 1
"sproglaboratorium": "sproglaboratorie", # 1
"spækbræt": "spækkebræt", # 2
"stand-in": "standin", # 1
"stand-up-comedy": "standupcomedy", # 1
"stand-up-komiker": "standupkomiker", # 1
"statsministerium": "statsministerie", # 1
"stedbroder": "stedbror", # 1
"stedfader": "stedfar", # 1
"stedmoder": "stedmor", # 1
"stilehefte": "stilehæfte", # 1
"stipendium": "stipendie", # 1
"stjært": "stjert", # 1
"stjærthage": "stjerthage", # 1
"storebroder": "storebror", # 1
"stortå": "storetå", # 1
"strabads": "strabadser", # 1
"strømlinjet": "strømlinet", # 1
"studium": "studie", # 1
"stænkelap": "stænklap", # 1
"sundhedsministerium": "sundhedsministerie", # 1
"suppositorium": "suppositorie", # 1
"svejts": "schweiz", # 1
"svejtser": "schweizer", # 1
"svejtserfranc": "schweizerfranc", # 1
"svejtserost": "schweizerost", # 1
"svejtsisk": "schweizisk", # 1
"svigerfader": "svigerfar", # 1
"svigermoder": "svigermor", # 1
"svirebroder": "svirebror", # 1
"symposium": "symposie", # 1
"sælarium": "sælarie", # 1
"søreme": "sørme", # 2
"søterritorium": "søterritorie", # 1
"t-bone-steak": "t-bonesteak", # 1
"tabgivende": "tabsgivende", # 1
"tabuere": "tabuisere", # 1
"tabuering": "tabuisering", # 1
"tackle": "takle", # 2
"tackling": "takling", # 2
"taifun": "tyfon", # 1
"take-off": "takeoff", # 1
"taknemlig": "taknemmelig", # 2
"talehørelærer": "tale-høre-lærer", # 1
"talehøreundervisning": "tale-høre-undervisning", # 1
"tandstik": "tandstikker", # 1
"tao": "dao", # 1
"taoisme": "daoisme", # 1
"taoist": "daoist", # 1
"taoistisk": "daoistisk", # 1
"taverne": "taverna", # 1
"teateret": "teatret", # 2
"tekno": "techno", # 1
"temposkifte": "temposkift", # 1
"terrarium": "terrarie", # 1
"territorium": "territorie", # 1
"tesis": "tese", # 1
"tidsstudium": "tidsstudie", # 1
"tipoldefader": "tipoldefar", # 1
"tipoldemoder": "tipoldemor", # 1
"tomatsauce": "tomatsovs", # 1
"tonart": "toneart", # 1
"trafikministerium": "trafikministerie", # 1
"tredve": "tredive", # 1
"tredver": "trediver", # 1
"tredveårig": "trediveårig", # 1
"tredveårs": "trediveårs", # 1
"tredveårsfødselsdag": "trediveårsfødselsdag", # 1
"tredvte": "tredivte", # 1
"tredvtedel": "tredivtedel", # 1
"troldunge": "troldeunge", # 1
"trommestikke": "trommestik", # 1
"trubadur": "troubadour", # 2
"trøstepræmie": "trøstpræmie", # 2
"tummerum": "trummerum", # 1
"tumultuarisk": "tumultarisk", # 1
"tunghørighed": "tunghørhed", # 1
"tus": "tusch", # 2
"tusind": "tusinde", # 2
"tvillingbroder": "tvillingebror", # 1
"tvillingbror": "tvillingebror", # 1
"tvillingebroder": "tvillingebror", # 1
"ubeheftet": "ubehæftet", # 1
"udenrigsministerium": "udenrigsministerie", # 1
"udhulning": "udhuling", # 1
"udslaggivende": "udslagsgivende", # 1
"udspekulert": "udspekuleret", # 1
"udviklingsministerium": "udviklingsministerie", # 1
"uforpligtigende": "uforpligtende", # 1
"uheldvarslende": "uheldsvarslende", # 1
"uimponerthed": "uimponerethed", # 1
"undervisningsministerium": "undervisningsministerie", # 1
"unægtelig": "unægteligt", # 1
"urinale": "urinal", # 1
"uvederheftig": "uvederhæftig", # 1
"vabel": "vable", # 2
"vadi": "wadi", # 1
"vaklevorn": "vakkelvorn", # 1
"vanadin": "vanadium", # 1
"vaselin": "vaseline", # 1
"vederheftig": "vederhæftig", # 1
"vedhefte": "vedhæfte", # 1
"velar": "velær", # 1
"videndeling": "vidensdeling", # 2
"vinkelanførelsestegn": "vinkelanførselstegn", # 1
"vipstjært": "vipstjert", # 1
"vismut": "bismut", # 1
"visvas": "vissevasse", # 1
"voksværk": "vokseværk", # 1
"værtdyr": "værtsdyr", # 1
"værtplante": "værtsplante", # 1
"wienersnitsel": "wienerschnitzel", # 1
"yderliggående": "yderligtgående", # 2
"zombi": "zombie", # 1
"ægbakke": "æggebakke", # 1
"ægformet": "æggeformet", # 1
"ægleder": "æggeleder", # 1
"ækvilibrist": "ekvilibrist", # 2
"æselsøre": "æseløre", # 1
"øjehule": "øjenhule", # 1
"øjelåg": "øjenlåg", # 1
"øjeåbner": "øjenåbner", # 1
"økonomiministerium": "økonomiministerie", # 1
"ørenring": "ørering", # 2
"øvehefte": "øvehæfte", # 1
}
NORM_EXCEPTIONS = {}
for string, norm in _exc.items():
NORM_EXCEPTIONS[string] = norm
NORM_EXCEPTIONS[string.title()] = norm


@@ -2,7 +2,6 @@
 from __future__ import unicode_literals
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .norm_exceptions import NORM_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .punctuation import TOKENIZER_INFIXES
 from .tag_map import TAG_MAP
@@ -10,18 +9,14 @@ from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...attrs import LANG
+from ...util import update_exc
 class GermanDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: "de"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], NORM_EXCEPTIONS, BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES


@@ -1,16 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
# Here we only want to include the absolute most common words. Otherwise,
# this list would get impossibly long for German especially considering the
# old vs. new spelling rules, and all possible cases.
_exc = {"daß": "dass"}
NORM_EXCEPTIONS = {}
for string, norm in _exc.items():
NORM_EXCEPTIONS[string] = norm
NORM_EXCEPTIONS[string.title()] = norm


@@ -10,21 +10,16 @@ from .lemmatizer import GreekLemmatizer
 from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from .norm_exceptions import NORM_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...lookups import Lookups
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...attrs import LANG
+from ...util import update_exc
 class GreekDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: "el"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
     tag_map = TAG_MAP

File diff suppressed because it is too large.


@@ -2,7 +2,6 @@
 from __future__ import unicode_literals
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .norm_exceptions import NORM_EXCEPTIONS
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
@@ -10,10 +9,9 @@ from .morph_rules import MORPH_RULES
 from .syntax_iterators import SYNTAX_ITERATORS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...attrs import LANG
+from ...util import update_exc
 def _return_en(_):
@@ -24,9 +22,6 @@ class EnglishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = _return_en
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     tag_map = TAG_MAP
     stop_words = STOP_WORDS

File diff suppressed because it is too large.


@@ -4,25 +4,20 @@ from __future__ import unicode_literals
 from .stop_words import STOP_WORDS
 from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .norm_exceptions import NORM_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .tag_map import TAG_MAP
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...attrs import LANG
+from ...util import update_exc
 class IndonesianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: "id"
     lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES


@@ -1,532 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
# Daftar kosakata yang sering salah dieja
# https://id.wikipedia.org/wiki/Wikipedia:Daftar_kosakata_bahasa_Indonesia_yang_sering_salah_dieja
_exc = {
# Slang and abbreviations
"silahkan": "silakan",
"yg": "yang",
"kalo": "kalau",
"cawu": "caturwulan",
"ok": "oke",
"gak": "tidak",
"enggak": "tidak",
"nggak": "tidak",
"ndak": "tidak",
"ngga": "tidak",
"dgn": "dengan",
"tdk": "tidak",
"jg": "juga",
"klo": "kalau",
"denger": "dengar",
"pinter": "pintar",
"krn": "karena",
"nemuin": "menemukan",
"jgn": "jangan",
"udah": "sudah",
"sy": "saya",
"udh": "sudah",
"dapetin": "mendapatkan",
"ngelakuin": "melakukan",
"ngebuat": "membuat",
"membikin": "membuat",
"bikin": "buat",
# Daftar kosakata yang sering salah dieja
"malpraktik": "malapraktik",
"malfungsi": "malafungsi",
"malserap": "malaserap",
"maladaptasi": "malaadaptasi",
"malsuai": "malasuai",
"maldistribusi": "maladistribusi",
"malgizi": "malagizi",
"malsikap": "malasikap",
"memperhatikan": "memerhatikan",
"akte": "akta",
"cemilan": "camilan",
"esei": "esai",
"frase": "frasa",
"kafeteria": "kafetaria",
"ketapel": "katapel",
"kenderaan": "kendaraan",
"menejemen": "manajemen",
"menejer": "manajer",
"mesjid": "masjid",
"rebo": "rabu",
"seksama": "saksama",
"senggama": "sanggama",
"sekedar": "sekadar",
"seprei": "seprai",
"semedi": "semadi",
"samadi": "semadi",
"amandemen": "amendemen",
"algoritma": "algoritme",
"aritmatika": "aritmetika",
"metoda": "metode",
"materai": "meterai",
"meterei": "meterai",
"kalendar": "kalender",
"kadaluwarsa": "kedaluwarsa",
"katagori": "kategori",
"parlamen": "parlemen",
"sekular": "sekuler",
"selular": "seluler",
"sirkular": "sirkuler",
"survai": "survei",
"survey": "survei",
"aktuil": "aktual",
"formil": "formal",
"trotoir": "trotoar",
"komersiil": "komersial",
"komersil": "komersial",
"tradisionil": "tradisionial",
"orisinil": "orisinal",
"orijinil": "orisinal",
"afdol": "afdal",
"antri": "antre",
"apotik": "apotek",
"atlit": "atlet",
"atmosfir": "atmosfer",
"cidera": "cedera",
"cendikiawan": "cendekiawan",
"cepet": "cepat",
"cinderamata": "cenderamata",
"debet": "debit",
"difinisi": "definisi",
"dekrit": "dekret",
"disain": "desain",
"diskripsi": "deskripsi",
"diskotik": "diskotek",
"eksim": "eksem",
"exim": "eksem",
"faidah": "faedah",
"ekstrim": "ekstrem",
"ekstrimis": "ekstremis",
"komplit": "komplet",
"konkrit": "konkret",
"kongkrit": "konkret",
"kongkret": "konkret",
"kridit": "kredit",
"musium": "museum",
"pinalti": "penalti",
"piranti": "peranti",
"pinsil": "pensil",
"personil": "personel",
"sistim": "sistem",
"teoritis": "teoretis",
"vidio": "video",
"cengkeh": "cengkih",
"desertasi": "disertasi",
"hakekat": "hakikat",
"intelejen": "intelijen",
"kaedah": "kaidah",
"kempes": "kempis",
"kementrian": "kementerian",
"ledeng": "leding",
"nasehat": "nasihat",
"penasehat": "penasihat",
"praktek": "praktik",
"praktekum": "praktikum",
"resiko": "risiko",
"retsleting": "ritsleting",
"senen": "senin",
"amuba": "ameba",
"punggawa": "penggawa",
"surban": "serban",
"nomer": "nomor",
"sorban": "serban",
"bis": "bus",
"agribisnis": "agrobisnis",
"kantung": "kantong",
"khutbah": "khotbah",
"mandur": "mandor",
"rubuh": "roboh",
"pastur": "pastor",
"supir": "sopir",
"goncang": "guncang",
"goa": "gua",
"kaos": "kaus",
"kokoh": "kukuh",
"komulatif": "kumulatif",
"kolomnis": "kolumnis",
"korma": "kurma",
"lobang": "lubang",
"limo": "limusin",
"limosin": "limusin",
"mangkok": "mangkuk",
"saos": "saus",
"sop": "sup",
"sorga": "surga",
"tegor": "tegur",
"telor": "telur",
"obrak-abrik": "ubrak-abrik",
"ekwivalen": "ekuivalen",
"frekwensi": "frekuensi",
"konsekwensi": "konsekuensi",
"kwadran": "kuadran",
"kwadrat": "kuadrat",
"kwalifikasi": "kualifikasi",
"kwalitas": "kualitas",
"kwalitet": "kualitas",
"kwalitatif": "kualitatif",
"kwantitas": "kuantitas",
"kwantitatif": "kuantitatif",
"kwantum": "kuantum",
"kwartal": "kuartal",
"kwintal": "kuintal",
"kwitansi": "kuitansi",
"kwatir": "khawatir",
"kuatir": "khawatir",
"jadual": "jadwal",
"hirarki": "hierarki",
"karir": "karier",
"aktip": "aktif",
"daptar": "daftar",
"efektip": "efektif",
"epektif": "efektif",
"epektip": "efektif",
"Pebruari": "Februari",
"pisik": "fisik",
"pondasi": "fondasi",
"photo": "foto",
"photokopi": "fotokopi",
"hapal": "hafal",
"insap": "insaf",
"insyaf": "insaf",
"konperensi": "konferensi",
"kreatip": "kreatif",
"kreativ": "kreatif",
"maap": "maaf",
"napsu": "nafsu",
"negatip": "negatif",
"negativ": "negatif",
"objektip": "objektif",
"obyektip": "objektif",
"obyektif": "objektif",
"pasip": "pasif",
"pasiv": "pasif",
"positip": "positif",
"positiv": "positif",
"produktip": "produktif",
"produktiv": "produktif",
"sarap": "saraf",
"sertipikat": "sertifikat",
"subjektip": "subjektif",
"subyektip": "subjektif",
"subyektif": "subjektif",
"tarip": "tarif",
"transitip": "transitif",
"transitiv": "transitif",
"faham": "paham",
"fikir": "pikir",
"berfikir": "berpikir",
"telefon": "telepon",
"telfon": "telepon",
"telpon": "telepon",
"tilpon": "telepon",
"nafas": "napas",
"bernafas": "bernapas",
"pernafasan": "pernapasan",
"vermak": "permak",
"vulpen": "pulpen",
"aktifis": "aktivis",
"konfeksi": "konveksi",
"motifasi": "motivasi",
"Nopember": "November",
"propinsi": "provinsi",
"babtis": "baptis",
"jerembab": "jerembap",
"lembab": "lembap",
"sembab": "sembap",
"saptu": "sabtu",
"tekat": "tekad",
"bejad": "bejat",
"nekad": "nekat",
"otoped": "otopet",
"skuad": "skuat",
"jenius": "genius",
"marjin": "margin",
"marjinal": "marginal",
"obyek": "objek",
"subyek": "subjek",
"projek": "proyek",
"azas": "asas",
"ijasah": "ijazah",
"jenasah": "jenazah",
"plasa": "plaza",
"bathin": "batin",
"Katholik": "Katolik",
"orthografi": "ortografi",
"pathogen": "patogen",
"theologi": "teologi",
"ijin": "izin",
"rejeki": "rezeki",
"rejim": "rezim",
"jaman": "zaman",
"jamrud": "zamrud",
"jinah": "zina",
"perjinahan": "perzinaan",
"anugrah": "anugerah",
"cendrawasih": "cenderawasih",
"jendral": "jenderal",
"kripik": "keripik",
"krupuk": "kerupuk",
"ksatria": "kesatria",
"mentri": "menteri",
"negri": "negeri",
"Prancis": "Perancis",
"sebrang": "seberang",
"menyebrang": "menyeberang",
"Sumatra": "Sumatera",
"trampil": "terampil",
"isteri": "istri",
"justeru": "justru",
"perajurit": "prajurit",
"putera": "putra",
"puteri": "putri",
"samudera": "samudra",
"sastera": "sastra",
"sutera": "sutra",
"terompet": "trompet",
"iklas": "ikhlas",
"iktisar": "ikhtisar",
"kafilah": "khafilah",
"kawatir": "khawatir",
"kotbah": "khotbah",
"kusyuk": "khusyuk",
"makluk": "makhluk",
"mahluk": "makhluk",
"mahkluk": "makhluk",
"nahkoda": "nakhoda",
"nakoda": "nakhoda",
"tahta": "takhta",
"takhyul": "takhayul",
"tahyul": "takhayul",
"tahayul": "takhayul",
"akhli": "ahli",
"anarkhi": "anarki",
"kharisma": "karisma",
"kharismatik": "karismatik",
"mahsud": "maksud",
"makhsud": "maksud",
"rakhmat": "rahmat",
"tekhnik": "teknik",
"tehnik": "teknik",
"tehnologi": "teknologi",
"ikhwal": "ihwal",
"expor": "ekspor",
"extra": "ekstra",
"komplex": "komplek",
"sex": "seks",
"taxi": "taksi",
"extasi": "ekstasi",
"syaraf": "saraf",
"syurga": "surga",
"mashur": "masyhur",
"masyur": "masyhur",
"mahsyur": "masyhur",
"mashyur": "masyhur",
"muadzin": "muazin",
"adzan": "azan",
"ustadz": "ustaz",
"ustad": "ustaz",
"ustadzah": "ustaz",
"dzikir": "zikir",
"dzuhur": "zuhur",
"dhuhur": "zuhur",
"zhuhur": "zuhur",
"analisa": "analisis",
"diagnosa": "diagnosis",
"hipotesa": "hipotesis",
"sintesa": "sintesis",
"aktiviti": "aktivitas",
"aktifitas": "aktivitas",
"efektifitas": "efektivitas",
"komuniti": "komunitas",
"kreatifitas": "kreativitas",
"produktifitas": "produktivitas",
"realiti": "realitas",
"realita": "realitas",
"selebriti": "selebritas",
"spotifitas": "sportivitas",
"universiti": "universitas",
"utiliti": "utilitas",
"validiti": "validitas",
"dilokalisir": "dilokalisasi",
"didramatisir": "didramatisasi",
"dipolitisir": "dipolitisasi",
"dinetralisir": "dinetralisasi",
"dikonfrontir": "dikonfrontasi",
"mendominir": "mendominasi",
"koordinir": "koordinasi",
"proklamir": "proklamasi",
"terorganisir": "terorganisasi",
"terealisir": "terealisasi",
"robah": "ubah",
"dirubah": "diubah",
"merubah": "mengubah",
"terlanjur": "telanjur",
"terlantar": "telantar",
"penglepasan": "pelepasan",
"pelihatan": "penglihatan",
"pemukiman": "permukiman",
"pengrumahan": "perumahan",
"penyewaan": "persewaan",
"menyintai": "mencintai",
"menyolok": "mencolok",
"contek": "sontek",
"mencontek": "menyontek",
"pungkir": "mungkir",
"dipungkiri": "dimungkiri",
"kupungkiri": "kumungkiri",
"kaupungkiri": "kaumungkiri",
"nampak": "tampak",
"nampaknya": "tampaknya",
"nongkrong": "tongkrong",
"berternak": "beternak",
"berterbangan": "beterbangan",
"berserta": "beserta",
"berperkara": "beperkara",
"berpergian": "bepergian",
"berkerja": "bekerja",
"berberapa": "beberapa",
"terbersit": "tebersit",
"terpercaya": "tepercaya",
"terperdaya": "teperdaya",
"terpercik": "tepercik",
"terpergok": "tepergok",
"aksesoris": "aksesori",
"handal": "andal",
"hantar": "antar",
"panutan": "anutan",
"atsiri": "asiri",
"bhakti": "bakti",
"china": "cina",
"dharma": "darma",
"diktaktor": "diktator",
"eksport": "ekspor",
"hembus": "embus",
"hadits": "hadis",
"hadist": "hadits",
"harafiah": "harfiah",
"himbau": "imbau",
"import": "impor",
"inget": "ingat",
"hisap": "isap",
"interprestasi": "interpretasi",
"kangker": "kanker",
"konggres": "kongres",
"lansekap": "lanskap",
"maghrib": "magrib",
"emak": "mak",
"moderen": "modern",
"pasport": "paspor",
"perduli": "peduli",
"ramadhan": "ramadan",
"rapih": "rapi",
"Sansekerta": "Sanskerta",
"shalat": "salat",
"sholat": "salat",
"silahkan": "silakan",
"standard": "standar",
"hutang": "utang",
"zinah": "zina",
"ambulan": "ambulans",
"antartika": "sntarktika",
"arteri": "arteria",
"asik": "asyik",
"australi": "australia",
"denga": "dengan",
"depo": "depot",
"detil": "detail",
"ensiklopedi": "ensiklopedia",
"elit": "elite",
"frustasi": "frustrasi",
"gladi": "geladi",
"greget": "gereget",
"itali": "italia",
"karna": "karena",
"klenteng": "kelenteng",
"erling": "kerling",
"kontruksi": "konstruksi",
"masal": "massal",
"merk": "merek",
"respon": "respons",
"diresponi": "direspons",
"skak": "sekak",
"stir": "setir",
"singapur": "singapura",
"standarisasi": "standardisasi",
"varitas": "varietas",
"amphibi": "amfibi",
"anjlog": "anjlok",
"alpukat": "avokad",
"alpokat": "avokad",
"bolpen": "pulpen",
"cabe": "cabai",
"cabay": "cabai",
"ceret": "cerek",
"differensial": "diferensial",
"duren": "durian",
"faksimili": "faksimile",
"faksimil": "faksimile",
"graha": "gerha",
"goblog": "goblok",
"gombrong": "gombroh",
"horden": "gorden",
"korden": "gorden",
"gubug": "gubuk",
"imaginasi": "imajinasi",
"jerigen": "jeriken",
"jirigen": "jeriken",
"carut-marut": "karut-marut",
"kwota": "kuota",
"mahzab": "mazhab",
"mempesona": "memesona",
"milyar": "miliar",
"missi": "misi",
"nenas": "nanas",
"negoisasi": "negosiasi",
"automotif": "otomotif",
"pararel": "paralel",
"paska": "pasca",
"prosen": "persen",
"pete": "petai",
"petay": "petai",
"proffesor": "profesor",
"rame": "ramai",
"rapot": "rapor",
"rileks": "relaks",
"rileksasi": "relaksasi",
"renumerasi": "remunerasi",
"seketaris": "sekretaris",
"sekertaris": "sekretaris",
"sensorik": "sensoris",
"sentausa": "sentosa",
"strawberi": "stroberi",
"strawbery": "stroberi",
"taqwa": "takwa",
"tauco": "taoco",
"tauge": "taoge",
"toge": "taoge",
"tauladan": "teladan",
"taubat": "tobat",
"trilyun": "triliun",
"vissi": "visi",
"coklat": "cokelat",
"narkotika": "narkotik",
"oase": "oasis",
"politisi": "politikus",
"terong": "terung",
"wool": "wol",
"himpit": "impit",
"mujizat": "mukjizat",
"mujijat": "mukjizat",
"yag": "yang",
}
NORM_EXCEPTIONS = {}
for string, norm in _exc.items():
NORM_EXCEPTIONS[string] = norm
NORM_EXCEPTIONS[string.title()] = norm


@@ -2,26 +2,21 @@
 from __future__ import unicode_literals
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .norm_exceptions import NORM_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES
 from .lex_attrs import LEX_ATTRS
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...attrs import LANG
+from ...util import update_exc
 class LuxembourgishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: "lb"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], NORM_EXCEPTIONS, BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
     tag_map = TAG_MAP


@@ -1,16 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
# TODO
# norm execptions: find a possibility to deal with the zillions of spelling
# variants (vläicht = vlaicht, vleicht, viläicht, viläischt, etc. etc.)
# here one could include the most common spelling mistakes
_exc = {"dass": "datt", "viläicht": "vläicht"}
NORM_EXCEPTIONS = {}
for string, norm in _exc.items():
NORM_EXCEPTIONS[string] = norm
NORM_EXCEPTIONS[string.title()] = norm


@@ -186,10 +186,6 @@ def suffix(string):
     return string[-3:]
-def cluster(string):
-    return 0
 def is_alpha(string):
     return string.isalpha()
@@ -218,20 +214,11 @@ def is_stop(string, stops=set()):
     return string.lower() in stops
-def is_oov(string):
-    return True
-def get_prob(string):
-    return -20.0
 LEX_ATTRS = {
     attrs.LOWER: lower,
     attrs.NORM: lower,
     attrs.PREFIX: prefix,
     attrs.SUFFIX: suffix,
-    attrs.CLUSTER: cluster,
     attrs.IS_ALPHA: is_alpha,
     attrs.IS_DIGIT: is_digit,
     attrs.IS_LOWER: is_lower,
@@ -239,8 +226,6 @@ LEX_ATTRS = {
     attrs.IS_TITLE: is_title,
     attrs.IS_UPPER: is_upper,
     attrs.IS_STOP: is_stop,
-    attrs.IS_OOV: is_oov,
-    attrs.PROB: get_prob,
     attrs.LIKE_EMAIL: like_email,
     attrs.LIKE_NUM: like_num,
     attrs.IS_PUNCT: is_punct,


@@ -5,22 +5,17 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .tag_map import TAG_MAP
-from .norm_exceptions import NORM_EXCEPTIONS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...attrs import LANG
+from ...util import update_exc
 class PortugueseDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: "pt"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
-    )
     lex_attr_getters.update(LEX_ATTRS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS


@@ -1,23 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
# These exceptions are used to add NORM values based on a token's ORTH value.
# Individual languages can also add their own exceptions and overwrite them -
# for example, British vs. American spelling in English.
# Norms are only set if no alternative is provided in the tokenizer exceptions.
# Note that this does not change any other token attributes. Its main purpose
# is to normalise the word representations so that equivalent tokens receive
# similar representations. For example: $ and € are very different, but they're
# both currency symbols. By normalising currency symbols to $, all symbols are
# seen as similar, no matter how common they are in the training data.
NORM_EXCEPTIONS = {
"R$": "$", # Real
"r$": "$", # Real
"Cz$": "$", # Cruzado
"cz$": "$", # Cruzado
"NCz$": "$", # Cruzado Novo
"ncz$": "$", # Cruzado Novo
}


@@ -3,26 +3,21 @@ from __future__ import unicode_literals, print_function
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .norm_exceptions import NORM_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .tag_map import TAG_MAP
 from .lemmatizer import RussianLemmatizer
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
-from ...util import update_exc, add_lookups
+from ...util import update_exc
 from ...language import Language
 from ...lookups import Lookups
-from ...attrs import LANG, NORM
+from ...attrs import LANG
 class RussianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: "ru"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
     tag_map = TAG_MAP


@@ -1,36 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
_exc = {
# Slang
"прив": "привет",
"дарова": "привет",
"дак": "так",
"дык": "так",
"здарова": "привет",
"пакедава": "пока",
"пакедаво": "пока",
"ща": "сейчас",
"спс": "спасибо",
"пжлст": "пожалуйста",
"плиз": "пожалуйста",
"ладненько": "ладно",
"лады": "ладно",
"лан": "ладно",
"ясн": "ясно",
"всм": "всмысле",
"хош": "хочешь",
"хаюшки": "привет",
"оч": "очень",
"че": "что",
"чо": "что",
"шо": "что",
}
NORM_EXCEPTIONS = {}
for string, norm in _exc.items():
NORM_EXCEPTIONS[string] = norm
NORM_EXCEPTIONS[string.title()] = norm


@@ -3,22 +3,17 @@ from __future__ import unicode_literals
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .norm_exceptions import NORM_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...attrs import LANG
+from ...util import update_exc
 class SerbianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: "sr"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS


@@ -1,26 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
_exc = {
# Slang
"ћале": "отац",
"кева": "мајка",
"смор": "досада",
"кец": "јединица",
"тебра": "брат",
"штребер": "ученик",
"факс": "факултет",
"профа": "професор",
"бус": "аутобус",
"пискарало": "службеник",
"бакутанер": "бака",
"џибер": "простак",
}
NORM_EXCEPTIONS = {}
for string, norm in _exc.items():
NORM_EXCEPTIONS[string] = norm
NORM_EXCEPTIONS[string.title()] = norm


@@ -1,139 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
_exc = {
# Regional words normal
# Sri Lanka - wikipeadia
"இங்க": "இங்கே",
"வாங்க": "வாருங்கள்",
"ஒண்டு": "ஒன்று",
"கண்டு": "கன்று",
"கொண்டு": "கொன்று",
"பண்டி": "பன்றி",
"பச்ச": "பச்சை",
"அம்பது": "ஐம்பது",
"வெச்ச": "வைத்து",
"வச்ச": "வைத்து",
"வச்சி": "வைத்து",
"வாளைப்பழம்": "வாழைப்பழம்",
"மண்ணு": "மண்",
"பொன்னு": "பொன்",
"சாவல்": "சேவல்",
"அங்கால": "அங்கு ",
"அசுப்பு": "நடமாட்டம்",
"எழுவான் கரை": "எழுவான்கரை",
"ஓய்யாரம்": "எழில் ",
"ஒளும்பு": "எழும்பு",
"ஓர்மை": "துணிவு",
"கச்சை": "கோவணம்",
"கடப்பு": "தெருவாசல்",
"சுள்ளி": "காய்ந்த குச்சி",
"திறாவுதல்": "தடவுதல்",
"நாசமறுப்பு": "தொல்லை",
"பரிசாரி": "வைத்தியன்",
"பறவாதி": "பேராசைக்காரன்",
"பிசினி": "உலோபி ",
"விசர்": "பைத்தியம்",
"ஏனம்": "பாத்திரம்",
"ஏலா": "இயலாது",
"ஒசில்": "அழகு",
"ஒள்ளுப்பம்": "கொஞ்சம்",
# Srilankan and indian
"குத்துமதிப்பு": "",
"நூனாயம்": "நூல்நயம்",
"பைய": "மெதுவாக",
"மண்டை": "தலை",
"வெள்ளனே": "சீக்கிரம்",
"உசுப்பு": "எழுப்பு",
"ஆணம்": "குழம்பு",
"உறக்கம்": "தூக்கம்",
"பஸ்": "பேருந்து",
"களவு": "திருட்டு ",
# relationship
"புருசன்": "கணவன்",
"பொஞ்சாதி": "மனைவி",
"புள்ள": "பிள்ளை",
"பிள்ள": "பிள்ளை",
"ஆம்பிளப்புள்ள": "ஆண் பிள்ளை",
"பொம்பிளப்புள்ள": "பெண் பிள்ளை",
"அண்ணாச்சி": "அண்ணா",
"அக்காச்சி": "அக்கா",
"தங்கச்சி": "தங்கை",
# difference words
"பொடியன்": "சிறுவன்",
"பொட்டை": "சிறுமி",
"பிறகு": "பின்பு",
"டக்கென்டு": "விரைவாக",
"கெதியா": "விரைவாக",
"கிறுகி": "திரும்பி",
"போயித்து வாறன்": "போய் வருகிறேன்",
"வருவாங்களா": "வருவார்களா",
# regular spokens
"சொல்லு": "சொல்",
"கேளு": "கேள்",
"சொல்லுங்க": "சொல்லுங்கள்",
"கேளுங்க": "கேளுங்கள்",
"நீங்கள்": "நீ",
"உன்": "உன்னுடைய",
# Portugeese formal words
"அலவாங்கு": "கடப்பாரை",
"ஆசுப்பத்திரி": "மருத்துவமனை",
"உரோதை": "சில்லு",
"கடுதாசி": "கடிதம்",
"கதிரை": "நாற்காலி",
"குசினி": "அடுக்களை",
"கோப்பை": "கிண்ணம்",
"சப்பாத்து": "காலணி",
"தாச்சி": "இரும்புச் சட்டி",
"துவாய்": "துவாலை",
"தவறணை": "மதுக்கடை",
"பீப்பா": "மரத்தாழி",
"யன்னல்": "சாளரம்",
"வாங்கு": "மரஇருக்கை",
# Dutch formal words
"இறாக்கை": "பற்சட்டம்",
"இலாட்சி": "இழுப்பறை",
"கந்தோர்": "பணிமனை",
"நொத்தாரிசு": "ஆவண எழுத்துபதிவாளர்",
# English formal words
"இஞ்சினியர்": "பொறியியலாளர்",
"சூப்பு": "ரசம்",
"செக்": "காசோலை",
"சேட்டு": "மேற்ச்சட்டை",
"மார்க்கட்டு": "சந்தை",
"விண்ணன்": "கெட்டிக்காரன்",
# Arabic formal words
"ஈமான்": "நம்பிக்கை",
"சுன்னத்து": "விருத்தசேதனம்",
"செய்த்தான்": "பிசாசு",
"மவுத்து": "இறப்பு",
"ஹலால்": "அங்கீகரிக்கப்பட்டது",
"கறாம்": "நிராகரிக்கப்பட்டது",
# Persian, Hindustanian and hindi formal words
"சுமார்": "கிட்டத்தட்ட",
"சிப்பாய்": "போர்வீரன்",
"சிபார்சு": "சிபாரிசு",
"ஜமீன்": "பணக்காரா்",
"அசல்": "மெய்யான",
"அந்தஸ்து": "கௌரவம்",
"ஆஜர்": "சமா்ப்பித்தல்",
"உசார்": "எச்சரிக்கை",
"அச்சா": "நல்ல",
# English words used in text conversations
"bcoz": "ஏனெனில்",
"bcuz": "ஏனெனில்",
"fav": "விருப்பமான",
"morning": "காலை வணக்கம்",
"gdeveng": "மாலை வணக்கம்",
"gdnyt": "இரவு வணக்கம்",
"gdnit": "இரவு வணக்கம்",
"plz": "தயவு செய்து",
"pls": "தயவு செய்து",
"thx": "நன்றி",
"thanx": "நன்றி",
}
NORM_EXCEPTIONS = {}
for string, norm in _exc.items():
NORM_EXCEPTIONS[string] = norm


@@ -4,14 +4,12 @@ from __future__ import unicode_literals
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
-from .norm_exceptions import NORM_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
-from ..norm_exceptions import BASE_NORMS
-from ...attrs import LANG, NORM
+from ...attrs import LANG
 from ...language import Language
 from ...tokens import Doc
-from ...util import DummyTokenizer, add_lookups
+from ...util import DummyTokenizer
 class ThaiTokenizer(DummyTokenizer):
@@ -37,9 +35,6 @@ class ThaiDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda _text: "th"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
-    )
     tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS)
     tag_map = TAG_MAP
     stop_words = STOP_WORDS

View File

@ -1,113 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
_exc = {
# Conjugation and Diversion invalid to Tonal form (ผันอักษรและเสียงไม่ตรงกับรูปวรรณยุกต์)
"สนุ๊กเกอร์": "สนุกเกอร์",
"โน้ต": "โน้ต",
# Misspelled because of being lazy or hustle (สะกดผิดเพราะขี้เกียจพิมพ์ หรือเร่งรีบ)
"โทสับ": "โทรศัพท์",
"พุ่งนี้": "พรุ่งนี้",
# Strange (ให้ดูแปลกตา)
"ชะมะ": "ใช่ไหม",
"ชิมิ": "ใช่ไหม",
"ชะ": "ใช่ไหม",
"ช่ายมะ": "ใช่ไหม",
"ป่าว": "เปล่า",
"ป่ะ": "เปล่า",
"ปล่าว": "เปล่า",
"คัย": "ใคร",
"ไค": "ใคร",
"คราย": "ใคร",
"เตง": "ตัวเอง",
"ตะเอง": "ตัวเอง",
"รึ": "หรือ",
"เหรอ": "หรือ",
"หรา": "หรือ",
"หรอ": "หรือ",
"ชั้น": "ฉัน",
"ชั้ล": "ฉัน",
"ช้าน": "ฉัน",
"เทอ": "เธอ",
"เทอร์": "เธอ",
"เทอว์": "เธอ",
"แกร": "แก",
"ป๋ม": "ผม",
"บ่องตง": "บอกตรงๆ",
"ถ่ามตง": "ถามตรงๆ",
"ต่อมตง": "ตอบตรงๆ",
"เพิ่ล": "เพื่อน",
"จอบอ": "จอบอ",
"ดั้ย": "ได้",
"ขอบคุง": "ขอบคุณ",
"ยังงัย": "ยังไง",
"Inw": "เทพ",
"uou": "นอน",
"Lกรีeu": "เกรียน",
# Misspelled to express emotions (คำที่สะกดผิดเพื่อแสดงอารมณ์)
"เปงราย": "เป็นอะไร",
"เปนรัย": "เป็นอะไร",
"เปงรัย": "เป็นอะไร",
"เป็นอัลไล": "เป็นอะไร",
"ทามมาย": "ทำไม",
"ทามมัย": "ทำไม",
"จังรุย": "จังเลย",
"จังเยย": "จังเลย",
"จุงเบย": "จังเลย",
"ไม่รู้": "มะรุ",
"เฮ่ย": "เฮ้ย",
"เห้ย": "เฮ้ย",
"น่าร็อค": "น่ารัก",
"น่าร๊าก": "น่ารัก",
"ตั้ลล๊าก": "น่ารัก",
"คือร๊ะ": "คืออะไร",
"โอป่ะ": "โอเคหรือเปล่า",
"น่ามคาน": "น่ารำคาญ",
"น่ามสาร": "น่าสงสาร",
"วงวาร": "สงสาร",
"บับว่า": "แบบว่า",
"อัลไล": "อะไร",
"อิจ": "อิจฉา",
# Reduce rough words or Avoid to software filter (คำที่สะกดผิดเพื่อลดความหยาบของคำ หรืออาจใช้หลีกเลี่ยงการกรองคำหยาบของซอฟต์แวร์)
"กรู": "กู",
"กุ": "กู",
"กรุ": "กู",
"ตู": "กู",
"ตรู": "กู",
"มรึง": "มึง",
"เมิง": "มึง",
"มืง": "มึง",
"มุง": "มึง",
"สาด": "สัตว์",
"สัส": "สัตว์",
"สัก": "สัตว์",
"แสรด": "สัตว์",
"โคโตะ": "โคตร",
"โคด": "โคตร",
"โครต": "โคตร",
"โคตะระ": "โคตร",
"พ่อง": "พ่อมึง",
"แม่เมิง": "แม่มึง",
"เชี่ย": "เหี้ย",
# Imitate words (คำเลียนเสียง โดยส่วนใหญ่จะเพิ่มทัณฑฆาต หรือซ้ำตัวอักษร)
"แอร๊ยย": "อ๊าย",
"อร๊ายยย": "อ๊าย",
"มันส์": "มัน",
"วู๊วววววววว์": "วู้",
# Acronym (แบบคำย่อ)
"หมาลัย": "มหาวิทยาลัย",
"วิดวะ": "วิศวะ",
"สินสาด ": "ศิลปศาสตร์",
"สินกำ ": "ศิลปกรรมศาสตร์",
"เสารีย์ ": "อนุเสาวรีย์ชัยสมรภูมิ",
"เมกา ": "อเมริกา",
"มอไซค์ ": "มอเตอร์ไซค์",
}
NORM_EXCEPTIONS = {}
for string, norm in _exc.items():
NORM_EXCEPTIONS[string] = norm
NORM_EXCEPTIONS[string.title()] = norm

View File

@ -28,10 +28,11 @@ from .compat import izip, basestring_, is_python2, class_types
from .gold import GoldParse from .gold import GoldParse
from .scorer import Scorer from .scorer import Scorer
from ._ml import link_vectors_to_models, create_default_optimizer from ._ml import link_vectors_to_models, create_default_optimizer
from .attrs import IS_STOP, LANG from .attrs import IS_STOP, LANG, NORM
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES from .lang.punctuation import TOKENIZER_INFIXES
from .lang.tokenizer_exceptions import TOKEN_MATCH from .lang.tokenizer_exceptions import TOKEN_MATCH
from .lang.norm_exceptions import BASE_NORMS
from .lang.tag_map import TAG_MAP from .lang.tag_map import TAG_MAP
from .tokens import Doc from .tokens import Doc
from .lang.lex_attrs import LEX_ATTRS, is_stop from .lang.lex_attrs import LEX_ATTRS, is_stop
@ -77,6 +78,9 @@ class BaseDefaults(object):
lemmatizer=lemmatizer, lemmatizer=lemmatizer,
lookups=lookups, lookups=lookups,
) )
vocab.lex_attr_getters[NORM] = util.add_lookups(
vocab.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), BASE_NORMS, vocab.lookups.get_table("lexeme_norm")
)
for tag_str, exc in cls.morph_rules.items(): for tag_str, exc in cls.morph_rules.items():
for orth_str, attrs in exc.items(): for orth_str, attrs in exc.items():
vocab.morphology.add_special_case(tag_str, orth_str, attrs) vocab.morphology.add_special_case(tag_str, orth_str, attrs)
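For reference, util.add_lookups composes a default getter with one or more lookup tables, returning the first table hit and falling back to the default otherwise. A small sketch with plain dicts standing in for BASE_NORMS and the "lexeme_norm" table:

from spacy.util import add_lookups

norm_getter = add_lookups(lambda string: string.lower(), {"’": "'"}, {"daß": "dass"})
assert norm_getter("daß") == "dass"     # found in the second lookup
assert norm_getter("Hello") == "hello"  # falls back to the default getter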

View File

@ -1,8 +1,8 @@
from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t
from .attrs cimport attr_id_t from .attrs cimport attr_id_t
from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, LANG from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, LANG
from .structs cimport LexemeC, SerializedLexemeC from .structs cimport LexemeC
from .strings cimport StringStore from .strings cimport StringStore
from .vocab cimport Vocab from .vocab cimport Vocab
@ -24,22 +24,6 @@ cdef class Lexeme:
self.vocab = vocab self.vocab = vocab
self.orth = lex.orth self.orth = lex.orth
@staticmethod
cdef inline SerializedLexemeC c_to_bytes(const LexemeC* lex) nogil:
cdef SerializedLexemeC lex_data
buff = <const unsigned char*>&lex.flags
end = <const unsigned char*>&lex.sentiment + sizeof(lex.sentiment)
for i in range(sizeof(lex_data.data)):
lex_data.data[i] = buff[i]
return lex_data
@staticmethod
cdef inline void c_from_bytes(LexemeC* lex, SerializedLexemeC lex_data) nogil:
buff = <unsigned char*>&lex.flags
end = <unsigned char*>&lex.sentiment + sizeof(lex.sentiment)
for i in range(sizeof(lex_data.data)):
buff[i] = lex_data.data[i]
@staticmethod @staticmethod
cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil: cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil:
if name < (sizeof(flags_t) * 8): if name < (sizeof(flags_t) * 8):
@ -56,8 +40,6 @@ cdef class Lexeme:
lex.prefix = value lex.prefix = value
elif name == SUFFIX: elif name == SUFFIX:
lex.suffix = value lex.suffix = value
elif name == CLUSTER:
lex.cluster = value
elif name == LANG: elif name == LANG:
lex.lang = value lex.lang = value
@ -84,8 +66,6 @@ cdef class Lexeme:
return lex.suffix return lex.suffix
elif feat_name == LENGTH: elif feat_name == LENGTH:
return lex.length return lex.length
elif feat_name == CLUSTER:
return lex.cluster
elif feat_name == LANG: elif feat_name == LANG:
return lex.lang return lex.lang
else: else:

View File

@ -17,7 +17,7 @@ from .typedefs cimport attr_t, flags_t
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
from .attrs cimport IS_CURRENCY, IS_OOV, PROB from .attrs cimport IS_CURRENCY
from .attrs import intify_attrs from .attrs import intify_attrs
from .errors import Errors, Warnings from .errors import Errors, Warnings
@ -89,11 +89,10 @@ cdef class Lexeme:
cdef attr_id_t attr cdef attr_id_t attr
attrs = intify_attrs(attrs) attrs = intify_attrs(attrs)
for attr, value in attrs.items(): for attr, value in attrs.items():
if attr == PROB: # skip PROB, e.g. from lexemes.jsonl
self.c.prob = value if isinstance(value, float):
elif attr == CLUSTER: continue
self.c.cluster = int(value) elif isinstance(value, (int, long)):
elif isinstance(value, int) or isinstance(value, long):
Lexeme.set_struct_attr(self.c, attr, value) Lexeme.set_struct_attr(self.c, attr, value)
else: else:
Lexeme.set_struct_attr(self.c, attr, self.vocab.strings.add(value)) Lexeme.set_struct_attr(self.c, attr, self.vocab.strings.add(value))
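A hypothetical call against a blank vocab, showing that float values such as prob (e.g. read from a lexemes.jsonl entry) are now skipped instead of being interned via the StringStore:

from spacy.vocab import Vocab

vocab = Vocab()
lex = vocab["apple"]
lex.set_attrs(prob=-10.5, lower="apple")  # the float prob value is silently skipped
assert lex.lower_ == "apple"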
@ -137,34 +136,6 @@ cdef class Lexeme:
xp = get_array_module(vector) xp = get_array_module(vector)
return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)) return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
def to_bytes(self):
lex_data = Lexeme.c_to_bytes(self.c)
start = <const char*>&self.c.flags
end = <const char*>&self.c.sentiment + sizeof(self.c.sentiment)
if (end-start) != sizeof(lex_data.data):
raise ValueError(Errors.E072.format(length=end-start,
bad_length=sizeof(lex_data.data)))
byte_string = b"\0" * sizeof(lex_data.data)
byte_chars = <char*>byte_string
for i in range(sizeof(lex_data.data)):
byte_chars[i] = lex_data.data[i]
if len(byte_string) != sizeof(lex_data.data):
raise ValueError(Errors.E072.format(length=len(byte_string),
bad_length=sizeof(lex_data.data)))
return byte_string
def from_bytes(self, bytes byte_string):
# This method doesn't really have a use-case --- wrote it for testing.
# Possibly delete? It puts the Lexeme out of synch with the vocab.
cdef SerializedLexemeC lex_data
if len(byte_string) != sizeof(lex_data.data):
raise ValueError(Errors.E072.format(length=len(byte_string),
bad_length=sizeof(lex_data.data)))
for i in range(len(byte_string)):
lex_data.data[i] = byte_string[i]
Lexeme.c_from_bytes(self.c, lex_data)
self.orth = self.c.orth
@property @property
def has_vector(self): def has_vector(self):
"""RETURNS (bool): Whether a word vector is associated with the object. """RETURNS (bool): Whether a word vector is associated with the object.
@ -208,10 +179,14 @@ cdef class Lexeme:
"""RETURNS (float): A scalar value indicating the positivity or """RETURNS (float): A scalar value indicating the positivity or
negativity of the lexeme.""" negativity of the lexeme."""
def __get__(self): def __get__(self):
return self.c.sentiment sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {})
return sentiment_table.get(self.c.orth, 0.0)
def __set__(self, float sentiment): def __set__(self, float x):
self.c.sentiment = sentiment if "lexeme_sentiment" not in self.vocab.lookups:
self.vocab.lookups.add_table("lexeme_sentiment")
sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment")
sentiment_table[self.c.orth] = x
@property @property
def orth_(self): def orth_(self):
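A short sketch of the new behaviour on a blank vocab: sentiment is written to and read from a "lexeme_sentiment" lookups table keyed by the lexeme's orth ID, rather than being stored in the LexemeC struct:

from spacy.vocab import Vocab

vocab = Vocab()
lex = vocab["awesome"]
lex.sentiment = 0.9
assert vocab.lookups.get_table("lexeme_sentiment")[lex.orth] == 0.9
assert lex.sentiment == 0.9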
@ -241,6 +216,10 @@ cdef class Lexeme:
return self.c.norm return self.c.norm
def __set__(self, attr_t x): def __set__(self, attr_t x):
if "lexeme_norm" not in self.vocab.lookups:
self.vocab.lookups.add_table("lexeme_norm")
norm_table = self.vocab.lookups.get_table("lexeme_norm")
norm_table[self.c.orth] = self.vocab.strings[x]
self.c.norm = x self.c.norm = x
property shape: property shape:
@ -276,10 +255,12 @@ cdef class Lexeme:
property cluster: property cluster:
"""RETURNS (int): Brown cluster ID.""" """RETURNS (int): Brown cluster ID."""
def __get__(self): def __get__(self):
return self.c.cluster cluster_table = self.vocab.load_extra_lookups("lexeme_cluster")
return cluster_table.get(self.c.orth, 0)
def __set__(self, attr_t x): def __set__(self, int x):
self.c.cluster = x cluster_table = self.vocab.load_extra_lookups("lexeme_cluster")
cluster_table[self.c.orth] = x
property lang: property lang:
"""RETURNS (uint64): Language of the parent vocabulary.""" """RETURNS (uint64): Language of the parent vocabulary."""
@ -293,10 +274,14 @@ cdef class Lexeme:
"""RETURNS (float): Smoothed log probability estimate of the lexeme's """RETURNS (float): Smoothed log probability estimate of the lexeme's
type.""" type."""
def __get__(self): def __get__(self):
return self.c.prob prob_table = self.vocab.load_extra_lookups("lexeme_prob")
settings_table = self.vocab.load_extra_lookups("lexeme_settings")
default_oov_prob = settings_table.get("oov_prob", -20.0)
return prob_table.get(self.c.orth, default_oov_prob)
def __set__(self, float x): def __set__(self, float x):
self.c.prob = x prob_table = self.vocab.load_extra_lookups("lexeme_prob")
prob_table[self.c.orth] = x
property lower_: property lower_:
"""RETURNS (unicode): Lowercase form of the word.""" """RETURNS (unicode): Lowercase form of the word."""
@ -314,7 +299,7 @@ cdef class Lexeme:
return self.vocab.strings[self.c.norm] return self.vocab.strings[self.c.norm]
def __set__(self, unicode x): def __set__(self, unicode x):
self.c.norm = self.vocab.strings.add(x) self.norm = self.vocab.strings.add(x)
property shape_: property shape_:
"""RETURNS (unicode): Transform of the word's string, to show """RETURNS (unicode): Transform of the word's string, to show
@ -362,13 +347,10 @@ cdef class Lexeme:
def __set__(self, flags_t x): def __set__(self, flags_t x):
self.c.flags = x self.c.flags = x
property is_oov: @property
def is_oov(self):
"""RETURNS (bool): Whether the lexeme is out-of-vocabulary.""" """RETURNS (bool): Whether the lexeme is out-of-vocabulary."""
def __get__(self): return self.orth in self.vocab.vectors
return Lexeme.c_check_flag(self.c, IS_OOV)
def __set__(self, attr_t x):
Lexeme.c_set_flag(self.c, IS_OOV, x)
property is_stop: property is_stop:
"""RETURNS (bool): Whether the lexeme is a stop word.""" """RETURNS (bool): Whether the lexeme is a stop word."""

View File

@ -124,7 +124,7 @@ class Lookups(object):
self._tables[key].update(value) self._tables[key].update(value)
return self return self
def to_disk(self, path, **kwargs): def to_disk(self, path, filename="lookups.bin", **kwargs):
"""Save the lookups to a directory as lookups.bin. Expects a path to a """Save the lookups to a directory as lookups.bin. Expects a path to a
directory, which will be created if it doesn't exist. directory, which will be created if it doesn't exist.
@ -136,11 +136,11 @@ class Lookups(object):
path = ensure_path(path) path = ensure_path(path)
if not path.exists(): if not path.exists():
path.mkdir() path.mkdir()
filepath = path / "lookups.bin" filepath = path / filename
with filepath.open("wb") as file_: with filepath.open("wb") as file_:
file_.write(self.to_bytes()) file_.write(self.to_bytes())
def from_disk(self, path, **kwargs): def from_disk(self, path, filename="lookups.bin", **kwargs):
"""Load lookups from a directory containing a lookups.bin. Will skip """Load lookups from a directory containing a lookups.bin. Will skip
loading if the file doesn't exist. loading if the file doesn't exist.
@ -150,7 +150,7 @@ class Lookups(object):
DOCS: https://spacy.io/api/lookups#from_disk DOCS: https://spacy.io/api/lookups#from_disk
""" """
path = ensure_path(path) path = ensure_path(path)
filepath = path / "lookups.bin" filepath = path / filename
if filepath.exists(): if filepath.exists():
with filepath.open("rb") as file_: with filepath.open("rb") as file_:
data = file_.read() data = file_.read()
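The new optional filename argument is what allows the vocab to store its extra tables alongside the regular ones (e.g. as lookups_extra.bin). A small usage sketch with a temporary directory:

import tempfile

from spacy.lookups import Lookups

lookups = Lookups()
lookups.add_table("lexeme_cluster", {123: 4})
tmpdir = tempfile.mkdtemp()
lookups.to_disk(tmpdir, filename="lookups_extra.bin")
reloaded = Lookups().from_disk(tmpdir, filename="lookups_extra.bin")
assert "lexeme_cluster" in reloaded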

View File

@ -23,29 +23,6 @@ cdef struct LexemeC:
attr_t prefix attr_t prefix
attr_t suffix attr_t suffix
attr_t cluster
float prob
float sentiment
cdef struct SerializedLexemeC:
unsigned char[8 + 8*10 + 4 + 4] data
# sizeof(flags_t) # flags
# + sizeof(attr_t) # lang
# + sizeof(attr_t) # id
# + sizeof(attr_t) # length
# + sizeof(attr_t) # orth
# + sizeof(attr_t) # lower
# + sizeof(attr_t) # norm
# + sizeof(attr_t) # shape
# + sizeof(attr_t) # prefix
# + sizeof(attr_t) # suffix
# + sizeof(attr_t) # cluster
# + sizeof(float) # prob
# + sizeof(float) # cluster
# + sizeof(float) # l2_norm
cdef struct SpanC: cdef struct SpanC:
hash_t id hash_t id

View File

@ -12,7 +12,7 @@ cdef enum symbol_t:
LIKE_NUM LIKE_NUM
LIKE_EMAIL LIKE_EMAIL
IS_STOP IS_STOP
IS_OOV IS_OOV_DEPRECATED
IS_BRACKET IS_BRACKET
IS_QUOTE IS_QUOTE
IS_LEFT_PUNCT IS_LEFT_PUNCT

View File

@ -17,7 +17,7 @@ IDS = {
"LIKE_NUM": LIKE_NUM, "LIKE_NUM": LIKE_NUM,
"LIKE_EMAIL": LIKE_EMAIL, "LIKE_EMAIL": LIKE_EMAIL,
"IS_STOP": IS_STOP, "IS_STOP": IS_STOP,
"IS_OOV": IS_OOV, "IS_OOV_DEPRECATED": IS_OOV_DEPRECATED,
"IS_BRACKET": IS_BRACKET, "IS_BRACKET": IS_BRACKET,
"IS_QUOTE": IS_QUOTE, "IS_QUOTE": IS_QUOTE,
"IS_LEFT_PUNCT": IS_LEFT_PUNCT, "IS_LEFT_PUNCT": IS_LEFT_PUNCT,

View File

@ -37,14 +37,6 @@ def test_da_tokenizer_handles_custom_base_exc(da_tokenizer):
assert tokens[7].text == "." assert tokens[7].text == "."
@pytest.mark.parametrize(
"text,norm", [("akvarium", "akvarie"), ("bedstemoder", "bedstemor")]
)
def test_da_tokenizer_norm_exceptions(da_tokenizer, text, norm):
tokens = da_tokenizer(text)
assert tokens[0].norm_ == norm
@pytest.mark.parametrize( @pytest.mark.parametrize(
"text,n_tokens", "text,n_tokens",
[ [

View File

@ -22,17 +22,3 @@ def test_de_tokenizer_handles_exc_in_text(de_tokenizer):
assert len(tokens) == 6 assert len(tokens) == 6
assert tokens[2].text == "z.Zt." assert tokens[2].text == "z.Zt."
assert tokens[2].lemma_ == "zur Zeit" assert tokens[2].lemma_ == "zur Zeit"
@pytest.mark.parametrize(
"text,norms", [("vor'm", ["vor", "dem"]), ("du's", ["du", "es"])]
)
def test_de_tokenizer_norm_exceptions(de_tokenizer, text, norms):
tokens = de_tokenizer(text)
assert [token.norm_ for token in tokens] == norms
@pytest.mark.parametrize("text,norm", [("daß", "dass")])
def test_de_lex_attrs_norm_exceptions(de_tokenizer, text, norm):
tokens = de_tokenizer(text)
assert tokens[0].norm_ == norm

View File

@ -118,6 +118,7 @@ def test_en_tokenizer_norm_exceptions(en_tokenizer, text, norms):
assert [token.norm_ for token in tokens] == norms assert [token.norm_ for token in tokens] == norms
@pytest.mark.skip
@pytest.mark.parametrize( @pytest.mark.parametrize(
"text,norm", [("radicalised", "radicalized"), ("cuz", "because")] "text,norm", [("radicalised", "radicalized"), ("cuz", "because")]
) )

View File

@ -22,9 +22,3 @@ def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer):
assert len(tokens) == 9 assert len(tokens) == 9
assert tokens[1].text == "'t" assert tokens[1].text == "'t"
assert tokens[1].lemma_ == "et" assert tokens[1].lemma_ == "et"
@pytest.mark.parametrize("text,norm", [("dass", "datt"), ("viläicht", "vläicht")])
def test_lb_norm_exceptions(lb_tokenizer, text, norm):
tokens = lb_tokenizer(text)
assert tokens[0].norm_ == norm

View File

@ -2,6 +2,7 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest import pytest
import pickle
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.strings import StringStore from spacy.strings import StringStore
@ -36,8 +37,8 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
assert vocab1.to_bytes() == vocab1_b assert vocab1.to_bytes() == vocab1_b
new_vocab1 = Vocab().from_bytes(vocab1_b) new_vocab1 = Vocab().from_bytes(vocab1_b)
assert new_vocab1.to_bytes() == vocab1_b assert new_vocab1.to_bytes() == vocab1_b
assert len(new_vocab1) == len(strings1) assert len(new_vocab1.strings) == len(strings1) + 1 # adds _SP
assert sorted([lex.text for lex in new_vocab1]) == sorted(strings1) assert sorted([s for s in new_vocab1.strings]) == sorted(strings1 + ["_SP"])
@pytest.mark.parametrize("strings1,strings2", test_strings) @pytest.mark.parametrize("strings1,strings2", test_strings)
@ -51,12 +52,13 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2):
vocab2.to_disk(file_path2) vocab2.to_disk(file_path2)
vocab1_d = Vocab().from_disk(file_path1) vocab1_d = Vocab().from_disk(file_path1)
vocab2_d = Vocab().from_disk(file_path2) vocab2_d = Vocab().from_disk(file_path2)
assert list(vocab1_d) == list(vocab1) # check strings rather than lexemes, which are only reloaded on demand
assert list(vocab2_d) == list(vocab2) assert strings1 == [s for s in vocab1_d.strings if s != "_SP"]
assert strings2 == [s for s in vocab2_d.strings if s != "_SP"]
if strings1 == strings2: if strings1 == strings2:
assert list(vocab1_d) == list(vocab2_d) assert [s for s in vocab1_d.strings if s != "_SP"] == [s for s in vocab2_d.strings if s != "_SP"]
else: else:
assert list(vocab1_d) != list(vocab2_d) assert [s for s in vocab1_d.strings if s != "_SP"] != [s for s in vocab2_d.strings if s != "_SP"]
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
@ -76,7 +78,7 @@ def test_deserialize_vocab_seen_entries(strings, lex_attr):
vocab = Vocab(strings=strings) vocab = Vocab(strings=strings)
length = len(vocab) length = len(vocab)
vocab.from_bytes(vocab.to_bytes()) vocab.from_bytes(vocab.to_bytes())
assert len(vocab) == length assert len(vocab.strings) == len(strings) + 1 # adds _SP
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
@ -127,3 +129,11 @@ def test_serialize_stringstore_roundtrip_disk(strings1, strings2):
assert list(sstore1_d) == list(sstore2_d) assert list(sstore1_d) == list(sstore2_d)
else: else:
assert list(sstore1_d) != list(sstore2_d) assert list(sstore1_d) != list(sstore2_d)
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
def test_pickle_vocab(strings, lex_attr):
vocab = Vocab(strings=strings)
vocab[strings[0]].norm_ = lex_attr
vocab_pickled = pickle.dumps(vocab)
vocab_unpickled = pickle.loads(vocab_pickled)
assert vocab.to_bytes() == vocab_unpickled.to_bytes()

View File

@ -26,7 +26,7 @@ def test_lemmatizer_reflects_lookups_changes():
nlp_bytes = nlp.to_bytes() nlp_bytes = nlp.to_bytes()
new_nlp.from_bytes(nlp_bytes) new_nlp.from_bytes(nlp_bytes)
# Make sure we have the previously saved lookup table # Make sure we have the previously saved lookup table
assert len(new_nlp.vocab.lookups) == 1 assert "lemma_lookup" in new_nlp.vocab.lookups
assert len(new_nlp.vocab.lookups.get_table("lemma_lookup")) == 2 assert len(new_nlp.vocab.lookups.get_table("lemma_lookup")) == 2
assert new_nlp.vocab.lookups.get_table("lemma_lookup")["hello"] == "world" assert new_nlp.vocab.lookups.get_table("lemma_lookup")["hello"] == "world"
assert Doc(new_nlp.vocab, words=["foo"])[0].lemma_ == "bar" assert Doc(new_nlp.vocab, words=["foo"])[0].lemma_ == "bar"

View File

@ -60,19 +60,6 @@ def test_vocab_lexeme_add_flag_provided_id(en_vocab):
assert en_vocab["dogs"].check_flag(is_len4) is True assert en_vocab["dogs"].check_flag(is_len4) is True
def test_lexeme_bytes_roundtrip(en_vocab):
one = en_vocab["one"]
alpha = en_vocab["alpha"]
assert one.orth != alpha.orth
assert one.lower != alpha.lower
alpha.from_bytes(one.to_bytes())
assert one.orth_ == alpha.orth_
assert one.orth == alpha.orth
assert one.lower == alpha.lower
assert one.lower_ == alpha.lower_
def test_vocab_lexeme_oov_rank(en_vocab): def test_vocab_lexeme_oov_rank(en_vocab):
"""Test that default rank is OOV_RANK.""" """Test that default rank is OOV_RANK."""
lex = en_vocab["word"] lex = en_vocab["word"]

View File

@ -119,12 +119,11 @@ def test_lookups_to_from_bytes_via_vocab():
table_name = "test" table_name = "test"
vocab = Vocab() vocab = Vocab()
vocab.lookups.add_table(table_name, {"foo": "bar", "hello": "world"}) vocab.lookups.add_table(table_name, {"foo": "bar", "hello": "world"})
assert len(vocab.lookups) == 1
assert table_name in vocab.lookups assert table_name in vocab.lookups
vocab_bytes = vocab.to_bytes() vocab_bytes = vocab.to_bytes()
new_vocab = Vocab() new_vocab = Vocab()
new_vocab.from_bytes(vocab_bytes) new_vocab.from_bytes(vocab_bytes)
assert len(new_vocab.lookups) == 1 assert len(new_vocab.lookups) == len(vocab.lookups)
assert table_name in new_vocab.lookups assert table_name in new_vocab.lookups
table = new_vocab.lookups.get_table(table_name) table = new_vocab.lookups.get_table(table_name)
assert len(table) == 2 assert len(table) == 2
@ -137,13 +136,12 @@ def test_lookups_to_from_disk_via_vocab():
table_name = "test" table_name = "test"
vocab = Vocab() vocab = Vocab()
vocab.lookups.add_table(table_name, {"foo": "bar", "hello": "world"}) vocab.lookups.add_table(table_name, {"foo": "bar", "hello": "world"})
assert len(vocab.lookups) == 1
assert table_name in vocab.lookups assert table_name in vocab.lookups
with make_tempdir() as tmpdir: with make_tempdir() as tmpdir:
vocab.to_disk(tmpdir) vocab.to_disk(tmpdir)
new_vocab = Vocab() new_vocab = Vocab()
new_vocab.from_disk(tmpdir) new_vocab.from_disk(tmpdir)
assert len(new_vocab.lookups) == 1 assert len(new_vocab.lookups) == len(vocab.lookups)
assert table_name in new_vocab.lookups assert table_name in new_vocab.lookups
table = new_vocab.lookups.get_table(table_name) table = new_vocab.lookups.get_table(table_name)
assert len(table) == 2 assert len(table) == 2

View File

@ -329,3 +329,15 @@ def test_vocab_prune_vectors():
neighbour, similarity = list(remap.values())[0] neighbour, similarity = list(remap.values())[0]
assert neighbour == "cat", remap assert neighbour == "cat", remap
assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-4, rtol=1e-3) assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-4, rtol=1e-3)
def test_vector_is_oov():
vocab = Vocab(vectors_name="test_vocab_is_oov")
data = numpy.ndarray((5, 3), dtype="f")
data[0] = 1.0
data[1] = 2.0
vocab.set_vector("cat", data[0])
vocab.set_vector("dog", data[1])
assert vocab["cat"].is_oov is True
assert vocab["dog"].is_oov is True
assert vocab["hamster"].is_oov is False

View File

@ -17,7 +17,7 @@ from ..typedefs cimport hash_t
from ..lexeme cimport Lexeme from ..lexeme cimport Lexeme
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM, LIKE_EMAIL from ..attrs cimport IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM, LIKE_EMAIL
from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX
from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP
from ..symbols cimport conj from ..symbols cimport conj
@ -259,7 +259,7 @@ cdef class Token:
@property @property
def prob(self): def prob(self):
"""RETURNS (float): Smoothed log probability estimate of token type.""" """RETURNS (float): Smoothed log probability estimate of token type."""
return self.c.lex.prob return self.vocab[self.c.lex.orth].prob
@property @property
def sentiment(self): def sentiment(self):
@ -267,7 +267,7 @@ cdef class Token:
negativity of the token.""" negativity of the token."""
if "sentiment" in self.doc.user_token_hooks: if "sentiment" in self.doc.user_token_hooks:
return self.doc.user_token_hooks["sentiment"](self) return self.doc.user_token_hooks["sentiment"](self)
return self.c.lex.sentiment return self.vocab[self.c.lex.orth].sentiment
@property @property
def lang(self): def lang(self):
@ -286,7 +286,7 @@ cdef class Token:
@property @property
def cluster(self): def cluster(self):
"""RETURNS (int): Brown cluster ID.""" """RETURNS (int): Brown cluster ID."""
return self.c.lex.cluster return self.vocab[self.c.lex.orth].cluster
@property @property
def orth(self): def orth(self):
@ -923,7 +923,7 @@ cdef class Token:
@property @property
def is_oov(self): def is_oov(self):
"""RETURNS (bool): Whether the token is out-of-vocabulary.""" """RETURNS (bool): Whether the token is out-of-vocabulary."""
return Lexeme.c_check_flag(self.c.lex, IS_OOV) return self.c.lex.orth in self.vocab.vectors
@property @property
def is_stop(self): def is_stop(self):
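Token-level prob, sentiment and cluster now delegate to the corresponding Lexeme, so they read from the lookups tables described above. A rough usage sketch:

import spacy

nlp = spacy.blank("en")
doc = nlp("hello world")
tok = doc[0]
# all three resolve via nlp.vocab[tok.orth] rather than the LexemeC struct
print(tok.prob, tok.cluster, tok.sentiment)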

View File

@ -30,6 +30,7 @@ cdef class Vocab:
cpdef public Morphology morphology cpdef public Morphology morphology
cpdef public object vectors cpdef public object vectors
cpdef public object lookups cpdef public object lookups
cpdef public object lookups_extra
cdef readonly int length cdef readonly int length
cdef public object data_dir cdef public object data_dir
cdef public object lex_attr_getters cdef public object lex_attr_getters

View File

@ -11,8 +11,7 @@ from .lexeme cimport EMPTY_LEXEME, OOV_RANK
from .lexeme cimport Lexeme from .lexeme cimport Lexeme
from .typedefs cimport attr_t from .typedefs cimport attr_t
from .tokens.token cimport Token from .tokens.token cimport Token
from .attrs cimport PROB, LANG, ORTH, TAG, POS from .attrs cimport LANG, ORTH, TAG, POS
from .structs cimport SerializedLexemeC
from .compat import copy_reg, basestring_ from .compat import copy_reg, basestring_
from .errors import Errors from .errors import Errors
@ -22,6 +21,8 @@ from .vectors import Vectors
from ._ml import link_vectors_to_models from ._ml import link_vectors_to_models
from .lookups import Lookups from .lookups import Lookups
from . import util from . import util
from .lang.norm_exceptions import BASE_NORMS
from .lang.lex_attrs import LEX_ATTRS
cdef class Vocab: cdef class Vocab:
@ -32,8 +33,8 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab DOCS: https://spacy.io/api/vocab
""" """
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None, def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
strings=tuple(), lookups=None, oov_prob=-20., vectors_name=None, strings=tuple(), lookups=None, lookups_extra=None,
**deprecated_kwargs): oov_prob=-20., vectors_name=None, **deprecated_kwargs):
"""Create the vocabulary. """Create the vocabulary.
lex_attr_getters (dict): A dictionary mapping attribute IDs to lex_attr_getters (dict): A dictionary mapping attribute IDs to
@ -44,6 +45,7 @@ cdef class Vocab:
strings (StringStore): StringStore that maps strings to integers, and strings (StringStore): StringStore that maps strings to integers, and
vice versa. vice versa.
lookups (Lookups): Container for large lookup tables and dictionaries. lookups (Lookups): Container for large lookup tables and dictionaries.
lookups_extra (Lookups): Container for optional lookup tables and dictionaries.
name (unicode): Optional name to identify the vectors table. name (unicode): Optional name to identify the vectors table.
RETURNS (Vocab): The newly constructed object. RETURNS (Vocab): The newly constructed object.
""" """
@ -51,8 +53,12 @@ cdef class Vocab:
tag_map = tag_map if tag_map is not None else {} tag_map = tag_map if tag_map is not None else {}
if lookups in (None, True, False): if lookups in (None, True, False):
lookups = Lookups() lookups = Lookups()
if "lexeme_norm" not in lookups:
lookups.add_table("lexeme_norm")
if lemmatizer in (None, True, False): if lemmatizer in (None, True, False):
lemmatizer = Lemmatizer(lookups) lemmatizer = Lemmatizer(lookups)
if lookups_extra in (None, True, False):
lookups_extra = Lookups()
self.cfg = {'oov_prob': oov_prob} self.cfg = {'oov_prob': oov_prob}
self.mem = Pool() self.mem = Pool()
self._by_orth = PreshMap() self._by_orth = PreshMap()
@ -65,6 +71,7 @@ cdef class Vocab:
self.morphology = Morphology(self.strings, tag_map, lemmatizer) self.morphology = Morphology(self.strings, tag_map, lemmatizer)
self.vectors = Vectors(name=vectors_name) self.vectors = Vectors(name=vectors_name)
self.lookups = lookups self.lookups = lookups
self.lookups_extra = lookups_extra
@property @property
def lang(self): def lang(self):
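A lookups_extra container can now be passed to the constructor (or is created empty), and a "lexeme_norm" table is always present on Vocab.lookups. A brief sketch:

from spacy.lookups import Lookups
from spacy.vocab import Vocab

extra = Lookups()
extra.add_table("lexeme_cluster", {})
vocab = Vocab(lookups_extra=extra)
assert vocab.lookups_extra is extra
assert "lexeme_norm" in vocab.lookups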
@ -173,9 +180,7 @@ cdef class Vocab:
value = func(string) value = func(string)
if isinstance(value, unicode): if isinstance(value, unicode):
value = self.strings.add(value) value = self.strings.add(value)
if attr == PROB: if value is not None:
lex.prob = value
elif value is not None:
Lexeme.set_struct_attr(lex, attr, value) Lexeme.set_struct_attr(lex, attr, value)
if not is_oov: if not is_oov:
self._add_lex_to_vocab(lex.orth, lex) self._add_lex_to_vocab(lex.orth, lex)
@ -435,17 +440,16 @@ cdef class Vocab:
path = util.ensure_path(path) path = util.ensure_path(path)
if not path.exists(): if not path.exists():
path.mkdir() path.mkdir()
setters = ["strings", "lexemes", "vectors"] setters = ["strings", "vectors"]
exclude = util.get_serialization_exclude(setters, exclude, kwargs) exclude = util.get_serialization_exclude(setters, exclude, kwargs)
if "strings" not in exclude: if "strings" not in exclude:
self.strings.to_disk(path / "strings.json") self.strings.to_disk(path / "strings.json")
if "lexemes" not in exclude:
with (path / "lexemes.bin").open("wb") as file_:
file_.write(self.lexemes_to_bytes())
if "vectors" not in "exclude" and self.vectors is not None: if "vectors" not in "exclude" and self.vectors is not None:
self.vectors.to_disk(path) self.vectors.to_disk(path)
if "lookups" not in "exclude" and self.lookups is not None: if "lookups" not in "exclude" and self.lookups is not None:
self.lookups.to_disk(path) self.lookups.to_disk(path)
if "lookups_extra" not in "exclude" and self.lookups_extra is not None:
self.lookups_extra.to_disk(path, filename="lookups_extra.bin")
def from_disk(self, path, exclude=tuple(), **kwargs): def from_disk(self, path, exclude=tuple(), **kwargs):
"""Loads state from a directory. Modifies the object in place and """Loads state from a directory. Modifies the object in place and
@ -458,13 +462,10 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab#to_disk DOCS: https://spacy.io/api/vocab#to_disk
""" """
path = util.ensure_path(path) path = util.ensure_path(path)
getters = ["strings", "lexemes", "vectors"] getters = ["strings", "vectors"]
exclude = util.get_serialization_exclude(getters, exclude, kwargs) exclude = util.get_serialization_exclude(getters, exclude, kwargs)
if "strings" not in exclude: if "strings" not in exclude:
self.strings.from_disk(path / "strings.json") # TODO: add exclude? self.strings.from_disk(path / "strings.json") # TODO: add exclude?
if "lexemes" not in exclude:
with (path / "lexemes.bin").open("rb") as file_:
self.lexemes_from_bytes(file_.read())
if "vectors" not in exclude: if "vectors" not in exclude:
if self.vectors is not None: if self.vectors is not None:
self.vectors.from_disk(path, exclude=["strings"]) self.vectors.from_disk(path, exclude=["strings"])
@ -472,6 +473,14 @@ cdef class Vocab:
link_vectors_to_models(self) link_vectors_to_models(self)
if "lookups" not in exclude: if "lookups" not in exclude:
self.lookups.from_disk(path) self.lookups.from_disk(path)
if "lookups_extra" not in exclude:
self.lookups_extra.from_disk(path, filename="lookups_extra.bin")
if "lexeme_norm" in self.lookups:
self.lex_attr_getters[NORM] = util.add_lookups(
self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), self.lookups.get_table("lexeme_norm")
)
self.length = 0
self._by_orth = PreshMap()
return self return self
def to_bytes(self, exclude=tuple(), **kwargs): def to_bytes(self, exclude=tuple(), **kwargs):
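With lexemes.bin gone, a serialized vocab directory holds only the string store, the vectors data and the lookups files (lookups_extra.bin is written once extra tables exist). A quick sketch for a blank vocab:

import os
import tempfile

from spacy.vocab import Vocab

tmpdir = tempfile.mkdtemp()
Vocab().to_disk(tmpdir)
print(sorted(os.listdir(tmpdir)))
# e.g. ['key2row', 'lookups.bin', 'strings.json', 'vectors'], plus 'lookups_extra.bin' when extra tables are present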
@ -490,9 +499,9 @@ cdef class Vocab:
getters = OrderedDict(( getters = OrderedDict((
("strings", lambda: self.strings.to_bytes()), ("strings", lambda: self.strings.to_bytes()),
("lexemes", lambda: self.lexemes_to_bytes()),
("vectors", deserialize_vectors), ("vectors", deserialize_vectors),
("lookups", lambda: self.lookups.to_bytes()) ("lookups", lambda: self.lookups.to_bytes()),
("lookups_extra", lambda: self.lookups_extra.to_bytes())
)) ))
exclude = util.get_serialization_exclude(getters, exclude, kwargs) exclude = util.get_serialization_exclude(getters, exclude, kwargs)
return util.to_bytes(getters, exclude) return util.to_bytes(getters, exclude)
@ -514,99 +523,62 @@ cdef class Vocab:
setters = OrderedDict(( setters = OrderedDict((
("strings", lambda b: self.strings.from_bytes(b)), ("strings", lambda b: self.strings.from_bytes(b)),
("lexemes", lambda b: self.lexemes_from_bytes(b)),
("vectors", lambda b: serialize_vectors(b)), ("vectors", lambda b: serialize_vectors(b)),
("lookups", lambda b: self.lookups.from_bytes(b)) ("lookups", lambda b: self.lookups.from_bytes(b)),
("lookups_extra", lambda b: self.lookups_extra.from_bytes(b))
)) ))
exclude = util.get_serialization_exclude(setters, exclude, kwargs) exclude = util.get_serialization_exclude(setters, exclude, kwargs)
util.from_bytes(bytes_data, setters, exclude) util.from_bytes(bytes_data, setters, exclude)
if "lexeme_norm" in self.lookups:
self.lex_attr_getters[NORM] = util.add_lookups(
self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), self.lookups.get_table("lexeme_norm")
)
self.length = 0
self._by_orth = PreshMap()
if self.vectors.name is not None: if self.vectors.name is not None:
link_vectors_to_models(self) link_vectors_to_models(self)
return self return self
def lexemes_to_bytes(self):
cdef hash_t key
cdef size_t addr
cdef LexemeC* lexeme = NULL
cdef SerializedLexemeC lex_data
cdef int size = 0
for key, addr in self._by_orth.items():
if addr == 0:
continue
size += sizeof(lex_data.data)
byte_string = b"\0" * size
byte_ptr = <unsigned char*>byte_string
cdef int j
cdef int i = 0
for key, addr in self._by_orth.items():
if addr == 0:
continue
lexeme = <LexemeC*>addr
lex_data = Lexeme.c_to_bytes(lexeme)
for j in range(sizeof(lex_data.data)):
byte_ptr[i] = lex_data.data[j]
i += 1
return byte_string
def lexemes_from_bytes(self, bytes bytes_data):
"""Load the binary vocabulary data from the given string."""
cdef LexemeC* lexeme
cdef hash_t key
cdef unicode py_str
cdef int i = 0
cdef int j = 0
cdef SerializedLexemeC lex_data
chunk_size = sizeof(lex_data.data)
cdef void* ptr
cdef unsigned char* bytes_ptr = bytes_data
for i in range(0, len(bytes_data), chunk_size):
lexeme = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
for j in range(sizeof(lex_data.data)):
lex_data.data[j] = bytes_ptr[i+j]
Lexeme.c_from_bytes(lexeme, lex_data)
prev_entry = self._by_orth.get(lexeme.orth)
if prev_entry != NULL:
memcpy(prev_entry, lexeme, sizeof(LexemeC))
continue
ptr = self.strings._map.get(lexeme.orth)
if ptr == NULL:
continue
py_str = self.strings[lexeme.orth]
if self.strings[py_str] != lexeme.orth:
raise ValueError(Errors.E086.format(string=py_str,
orth_id=lexeme.orth,
hash_id=self.strings[py_str]))
self._by_orth.set(lexeme.orth, lexeme)
self.length += 1
def _reset_cache(self, keys, strings): def _reset_cache(self, keys, strings):
# I'm not sure this made sense. Disable it for now. # I'm not sure this made sense. Disable it for now.
raise NotImplementedError raise NotImplementedError
def load_extra_lookups(self, table_name):
if table_name not in self.lookups_extra:
if self.lang + "_extra" in util.registry.lookups:
tables = util.registry.lookups.get(self.lang + "_extra")
for name, filename in tables.items():
if table_name == name:
data = util.load_language_data(filename)
self.lookups_extra.add_table(name, data)
if table_name not in self.lookups_extra:
self.lookups_extra.add_table(table_name)
return self.lookups_extra.get_table(table_name)
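load_extra_lookups first tries to pull the requested table from a "<lang>_extra" entry point (as provided by spacy-lookups-data) and otherwise registers an empty table, so callers always get a table back. A minimal sketch:

from spacy.vocab import Vocab

vocab = Vocab()
cluster_table = vocab.load_extra_lookups("lexeme_cluster")
assert "lexeme_cluster" in vocab.lookups_extra
cluster_table[123] = 7  # tables behave like ordinary dicts keyed by orth IDs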
def pickle_vocab(vocab): def pickle_vocab(vocab):
sstore = vocab.strings sstore = vocab.strings
vectors = vocab.vectors vectors = vocab.vectors
morph = vocab.morphology morph = vocab.morphology
length = vocab.length
data_dir = vocab.data_dir data_dir = vocab.data_dir
lex_attr_getters = srsly.pickle_dumps(vocab.lex_attr_getters) lex_attr_getters = srsly.pickle_dumps(vocab.lex_attr_getters)
lexemes_data = vocab.lexemes_to_bytes() lookups = vocab.lookups
lookups_extra = vocab.lookups_extra
return (unpickle_vocab, return (unpickle_vocab,
(sstore, vectors, morph, data_dir, lex_attr_getters, lexemes_data, length)) (sstore, vectors, morph, data_dir, lex_attr_getters, lookups, lookups_extra))
def unpickle_vocab(sstore, vectors, morphology, data_dir, def unpickle_vocab(sstore, vectors, morphology, data_dir,
lex_attr_getters, bytes lexemes_data, int length): lex_attr_getters, lookups, lookups_extra):
cdef Vocab vocab = Vocab() cdef Vocab vocab = Vocab()
vocab.length = length
vocab.vectors = vectors vocab.vectors = vectors
vocab.strings = sstore vocab.strings = sstore
vocab.morphology = morphology vocab.morphology = morphology
vocab.data_dir = data_dir vocab.data_dir = data_dir
vocab.lex_attr_getters = srsly.pickle_loads(lex_attr_getters) vocab.lex_attr_getters = srsly.pickle_loads(lex_attr_getters)
vocab.lexemes_from_bytes(lexemes_data) vocab.lookups = lookups
vocab.length = length vocab.lookups_extra = lookups_extra
return vocab return vocab
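Pickling now round-trips the lookups containers instead of the raw lexeme bytes, so attributes stored in tables survive a pickle cycle. A short sketch:

import pickle

from spacy.vocab import Vocab

vocab = Vocab()
lex = vocab["coool"]
lex.norm_ = "cool"  # written to the "lexeme_norm" table
vocab2 = pickle.loads(pickle.dumps(vocab))
assert vocab2.lookups.get_table("lexeme_norm")[lex.orth] == "cool"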