Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24 00:46:28 +03:00)
Reduce stored lexemes data, move feats to lookups (#5238)
* Reduce stored lexemes data, move feats to lookups

* Move non-derivable lexemes features (`norm / cluster / prob`) to `spacy-lookups-data` as lookups
  * Get/set `norm` in both lookups and `LexemeC`, serialize in lookups
  * Remove `cluster` and `prob` from `LexemeC`, get/set/serialize in lookups only
  * Remove serialization of lexemes data as `vocab/lexemes.bin`
    * Remove `SerializedLexemeC`
    * Remove `Lexeme.to_bytes/from_bytes`
  * Modify normalization exception loading:
    * Always create `Vocab.lookups` table `lexeme_norm` for normalization exceptions
    * Load base exceptions from `lang.norm_exceptions`, but load language-specific exceptions from lookups
    * Set `lex_attr_getter[NORM]` including new lookups table in `BaseDefaults.create_vocab()` and when deserializing `Vocab`
  * Remove all cached lexemes when deserializing vocab to override existing normalizations with the new normalizations (as a replacement for the previous step that replaced all lexemes data with the deserialized data)

* Skip English normalization test

  Skip English normalization test because the data is now in `spacy-lookups-data`.

* Remove norm exceptions

  Moved to spacy-lookups-data.

* Move norm exceptions test to spacy-lookups-data

* Load extra lookups from spacy-lookups-data lazily

  Load extra lookups (currently for cluster and prob) lazily from the entry point `lg_extra` as `Vocab.lookups_extra`.

* Skip creating lexeme cache on load

  To improve model loading times, do not create the full lexeme cache when loading. The lexemes will be created on demand when processing.

* Identify numeric values in Lexeme.set_attrs()

  With the removal of a special case for `PROB`, also identify `float` to avoid trying to convert it with the `StringStore`.

* Skip lexeme cache init in from_bytes

* Unskip and update lookups tests for python3.6+

* Update vocab pickle to include lookups_extra

* Update vocab serialization tests

  Check strings rather than lexemes since lexemes aren't initialized automatically, account for addition of "_SP".

* Re-skip lookups test because of python3.5

* Skip PROB/float values in Lexeme.set_attrs

* Convert is_oov from lexeme flag to lex in vectors

  Instead of storing `is_oov` as a lexeme flag, `is_oov` reports whether the lexeme has a vector.

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
This commit is contained in:
parent a41e28ceba
commit a5cd203284
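For orientation before the diff hunks below: a minimal sketch, assuming a spaCy v2.3-era installation (optionally with the `spacy-lookups-data` package), of where the relocated attributes are looked up after this change. Table names and defaults follow the commit message, not code taken verbatim from this diff.

import spacy

nlp = spacy.blank("en")
vocab = nlp.vocab

# Language-specific norm exceptions now live in a "lexeme_norm" lookups table
# (provided by spacy-lookups-data) instead of lang/<lang>/norm_exceptions.py.
if vocab.lookups.has_table("lexeme_norm"):
    norm_table = vocab.lookups.get_table("lexeme_norm")
    print(len(norm_table), "norm exceptions loaded")

# cluster and prob are no longer stored in LexemeC or serialized with the
# vocab; they are read lazily from Vocab.lookups_extra (entry point
# "lg_extra") when available, otherwise defaults are returned.
lex = vocab["apple"]
print(lex.prob)     # default value unless a lexeme_prob table is present
print(lex.cluster)  # default 0 unless a lexeme_cluster table is present

# is_oov is no longer a stored lexeme flag: it reports whether the lexeme
# has a vector, so it is True here because a blank pipeline has no vectors.
print(lex.is_oov)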
@@ -15,7 +15,7 @@ cdef enum attr_id_t:
     LIKE_NUM
     LIKE_EMAIL
     IS_STOP
-    IS_OOV
+    IS_OOV_DEPRECATED
     IS_BRACKET
     IS_QUOTE
     IS_LEFT_PUNCT
@@ -16,7 +16,7 @@ IDS = {
     "LIKE_NUM": LIKE_NUM,
     "LIKE_EMAIL": LIKE_EMAIL,
     "IS_STOP": IS_STOP,
-    "IS_OOV": IS_OOV,
+    "IS_OOV_DEPRECATED": IS_OOV_DEPRECATED,
     "IS_BRACKET": IS_BRACKET,
     "IS_QUOTE": IS_QUOTE,
     "IS_LEFT_PUNCT": IS_LEFT_PUNCT,
@@ -157,15 +157,11 @@ def create_model(lang, lex_attrs, name=None):
     nlp = lang_class()
     for lexeme in nlp.vocab:
         lexeme.rank = OOV_RANK
-    lex_added = 0
     for attrs in lex_attrs:
         if "settings" in attrs:
             continue
         lexeme = nlp.vocab[attrs["orth"]]
         lexeme.set_attrs(**attrs)
-        lexeme.is_oov = False
-        lex_added += 1
-        lex_added += 1
     if len(nlp.vocab):
         oov_prob = min(lex.prob for lex in nlp.vocab) - 1
     else:
@@ -193,8 +189,7 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
         if vector_keys is not None:
             for word in vector_keys:
                 if word not in nlp.vocab:
-                    lexeme = nlp.vocab[word]
-                    lexeme.is_oov = False
+                    nlp.vocab[word]
         if vectors_data is not None:
             nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
     if name is None:
@@ -15,7 +15,6 @@ import random

 from .._ml import create_default_optimizer
 from ..util import use_gpu as set_gpu
-from ..attrs import PROB, IS_OOV, CLUSTER, LANG
 from ..gold import GoldCorpus
 from ..compat import path2str
 from .. import util
@@ -630,15 +629,6 @@ def _create_progress_bar(total):

 def _load_vectors(nlp, vectors):
     util.load_model(vectors, vocab=nlp.vocab)
-    for lex in nlp.vocab:
-        values = {}
-        for attr, func in nlp.vocab.lex_attr_getters.items():
-            # These attrs are expected to be set by data. Others should
-            # be set by calling the language functions.
-            if attr not in (CLUSTER, PROB, IS_OOV, LANG):
-                values[lex.vocab.strings[attr]] = func(lex.orth_)
-        lex.set_attrs(**values)
-        lex.is_oov = False


 def _load_pretrained_tok2vec(nlp, loc):
@@ -2,7 +2,6 @@
 from __future__ import unicode_literals

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .norm_exceptions import NORM_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
@@ -12,17 +11,14 @@ from ..tag_map import TAG_MAP
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...attrs import LANG
+from ...util import update_exc


 class DanishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: "da"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     morph_rules = MORPH_RULES
     infixes = TOKENIZER_INFIXES
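The per-language `add_lookups(...)` wiring removed above is replaced by a central setup in `BaseDefaults.create_vocab()`, per the commit message. A hedged sketch of that pattern follows; the helper name and the empty-table fallback are illustrative assumptions, not code from this commit.

# Illustrative only: assembling a NORM getter from the base norms plus the
# vocab's "lexeme_norm" lookups table (populated via spacy-lookups-data).
from spacy.attrs import NORM
from spacy.lang.norm_exceptions import BASE_NORMS
from spacy.util import add_lookups


def norm_getter_from_lookups(defaults, vocab):
    # Base exceptions still come from lang.norm_exceptions; language-specific
    # exceptions are read from lookups when the table is available.
    table = (
        vocab.lookups.get_table("lexeme_norm")
        if vocab.lookups.has_table("lexeme_norm")
        else {}
    )
    return add_lookups(defaults.lex_attr_getters[NORM], BASE_NORMS, table)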
@ -1,527 +0,0 @@
|
|||
# coding: utf8
|
||||
"""
|
||||
Special-case rules for normalizing tokens to improve the model's predictions.
|
||||
For example 'mysterium' vs 'mysterie' and similar.
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
# Sources:
|
||||
# 1: https://dsn.dk/retskrivning/om-retskrivningsordbogen/mere-om-retskrivningsordbogen-2012/endrede-stave-og-ordformer/
|
||||
# 2: http://www.tjerry-korrektur.dk/ord-med-flere-stavemaader/
|
||||
|
||||
_exc = {
|
||||
# Alternative spelling
|
||||
"a-kraft-værk": "a-kraftværk", # 1
|
||||
"ålborg": "aalborg", # 2
|
||||
"århus": "aarhus",
|
||||
"accessoirer": "accessoires", # 1
|
||||
"affektert": "affekteret", # 1
|
||||
"afrikander": "afrikaaner", # 1
|
||||
"aftabuere": "aftabuisere", # 1
|
||||
"aftabuering": "aftabuisering", # 1
|
||||
"akvarium": "akvarie", # 1
|
||||
"alenefader": "alenefar", # 1
|
||||
"alenemoder": "alenemor", # 1
|
||||
"alkoholambulatorium": "alkoholambulatorie", # 1
|
||||
"ambulatorium": "ambulatorie", # 1
|
||||
"ananassene": "ananasserne", # 2
|
||||
"anførelsestegn": "anførselstegn", # 1
|
||||
"anseelig": "anselig", # 2
|
||||
"antioxydant": "antioxidant", # 1
|
||||
"artrig": "artsrig", # 1
|
||||
"auditorium": "auditorie", # 1
|
||||
"avocado": "avokado", # 2
|
||||
"bagerst": "bagest", # 2
|
||||
"bagstræv": "bagstræb", # 1
|
||||
"bagstræver": "bagstræber", # 1
|
||||
"bagstræverisk": "bagstræberisk", # 1
|
||||
"balde": "balle", # 2
|
||||
"barselorlov": "barselsorlov", # 1
|
||||
"barselvikar": "barselsvikar", # 1
|
||||
"baskien": "baskerlandet", # 1
|
||||
"bayrisk": "bayersk", # 1
|
||||
"bedstefader": "bedstefar", # 1
|
||||
"bedstemoder": "bedstemor", # 1
|
||||
"behefte": "behæfte", # 1
|
||||
"beheftelse": "behæftelse", # 1
|
||||
"bidragydende": "bidragsydende", # 1
|
||||
"bidragyder": "bidragsyder", # 1
|
||||
"billiondel": "billiontedel", # 1
|
||||
"blaseret": "blasert", # 1
|
||||
"bleskifte": "bleskift", # 1
|
||||
"blodbroder": "blodsbroder", # 2
|
||||
"blyantspidser": "blyantsspidser", # 2
|
||||
"boligministerium": "boligministerie", # 1
|
||||
"borhul": "borehul", # 1
|
||||
"broder": "bror", # 2
|
||||
"buldog": "bulldog", # 2
|
||||
"bådhus": "bådehus", # 1
|
||||
"børnepleje": "barnepleje", # 1
|
||||
"børneseng": "barneseng", # 1
|
||||
"børnestol": "barnestol", # 1
|
||||
"cairo": "kairo", # 1
|
||||
"cambodia": "cambodja", # 1
|
||||
"cambodianer": "cambodjaner", # 1
|
||||
"cambodiansk": "cambodjansk", # 1
|
||||
"camouflage": "kamuflage", # 2
|
||||
"campylobacter": "kampylobakter", # 1
|
||||
"centeret": "centret", # 2
|
||||
"chefskahyt": "chefkahyt", # 1
|
||||
"chefspost": "chefpost", # 1
|
||||
"chefssekretær": "chefsekretær", # 1
|
||||
"chefsstol": "chefstol", # 1
|
||||
"cirkulærskrivelse": "cirkulæreskrivelse", # 1
|
||||
"cognacsglas": "cognacglas", # 1
|
||||
"columnist": "kolumnist", # 1
|
||||
"cricket": "kricket", # 2
|
||||
"dagplejemoder": "dagplejemor", # 1
|
||||
"damaskesdug": "damaskdug", # 1
|
||||
"damp-barn": "dampbarn", # 1
|
||||
"delfinarium": "delfinarie", # 1
|
||||
"dentallaboratorium": "dentallaboratorie", # 1
|
||||
"diaramme": "diasramme", # 1
|
||||
"diaré": "diarré", # 1
|
||||
"dioxyd": "dioxid", # 1
|
||||
"dommedagsprædiken": "dommedagspræken", # 1
|
||||
"donut": "doughnut", # 2
|
||||
"driftmæssig": "driftsmæssig", # 1
|
||||
"driftsikker": "driftssikker", # 1
|
||||
"driftsikring": "driftssikring", # 1
|
||||
"drikkejogurt": "drikkeyoghurt", # 1
|
||||
"drivein": "drive-in", # 1
|
||||
"driveinbiograf": "drive-in-biograf", # 1
|
||||
"drøvel": "drøbel", # 1
|
||||
"dødskriterium": "dødskriterie", # 1
|
||||
"e-mail-adresse": "e-mailadresse", # 1
|
||||
"e-post-adresse": "e-postadresse", # 1
|
||||
"egypten": "ægypten", # 2
|
||||
"ekskommunicere": "ekskommunikere", # 1
|
||||
"eksperimentarium": "eksperimentarie", # 1
|
||||
"elsass": "Alsace", # 1
|
||||
"elsasser": "alsacer", # 1
|
||||
"elsassisk": "alsacisk", # 1
|
||||
"elvetal": "ellevetal", # 1
|
||||
"elvetiden": "ellevetiden", # 1
|
||||
"elveårig": "elleveårig", # 1
|
||||
"elveårs": "elleveårs", # 1
|
||||
"elveårsbarn": "elleveårsbarn", # 1
|
||||
"elvte": "ellevte", # 1
|
||||
"elvtedel": "ellevtedel", # 1
|
||||
"energiministerium": "energiministerie", # 1
|
||||
"erhvervsministerium": "erhvervsministerie", # 1
|
||||
"espaliere": "spaliere", # 2
|
||||
"evangelium": "evangelie", # 1
|
||||
"fagministerium": "fagministerie", # 1
|
||||
"fakse": "faxe", # 1
|
||||
"fangstkvota": "fangstkvote", # 1
|
||||
"fader": "far", # 2
|
||||
"farbroder": "farbror", # 1
|
||||
"farfader": "farfar", # 1
|
||||
"farmoder": "farmor", # 1
|
||||
"federal": "føderal", # 1
|
||||
"federalisering": "føderalisering", # 1
|
||||
"federalisme": "føderalisme", # 1
|
||||
"federalist": "føderalist", # 1
|
||||
"federalistisk": "føderalistisk", # 1
|
||||
"federation": "føderation", # 1
|
||||
"federativ": "føderativ", # 1
|
||||
"fejlbeheftet": "fejlbehæftet", # 1
|
||||
"femetagers": "femetages", # 2
|
||||
"femhundredekroneseddel": "femhundredkroneseddel", # 2
|
||||
"filmpremiere": "filmpræmiere", # 2
|
||||
"finansimperium": "finansimperie", # 1
|
||||
"finansministerium": "finansministerie", # 1
|
||||
"firehjulstræk": "firhjulstræk", # 2
|
||||
"fjernstudium": "fjernstudie", # 1
|
||||
"formalier": "formalia", # 1
|
||||
"formandsskift": "formandsskifte", # 1
|
||||
"fornemst": "fornemmest", # 2
|
||||
"fornuftparti": "fornuftsparti", # 1
|
||||
"fornuftstridig": "fornuftsstridig", # 1
|
||||
"fornuftvæsen": "fornuftsvæsen", # 1
|
||||
"fornuftægteskab": "fornuftsægteskab", # 1
|
||||
"forretningsministerium": "forretningsministerie", # 1
|
||||
"forskningsministerium": "forskningsministerie", # 1
|
||||
"forstudium": "forstudie", # 1
|
||||
"forsvarsministerium": "forsvarsministerie", # 1
|
||||
"frilægge": "fritlægge", # 1
|
||||
"frilæggelse": "fritlæggelse", # 1
|
||||
"frilægning": "fritlægning", # 1
|
||||
"fristille": "fritstille", # 1
|
||||
"fristilling": "fritstilling", # 1
|
||||
"fuldttegnet": "fuldtegnet", # 1
|
||||
"fødestedskriterium": "fødestedskriterie", # 1
|
||||
"fødevareministerium": "fødevareministerie", # 1
|
||||
"følesløs": "følelsesløs", # 1
|
||||
"følgeligt": "følgelig", # 1
|
||||
"førne": "førn", # 1
|
||||
"gearskift": "gearskifte", # 2
|
||||
"gladeligt": "gladelig", # 1
|
||||
"glosehefte": "glosehæfte", # 1
|
||||
"glædeløs": "glædesløs", # 1
|
||||
"gonoré": "gonorré", # 1
|
||||
"grangiveligt": "grangivelig", # 1
|
||||
"grundliggende": "grundlæggende", # 2
|
||||
"grønsag": "grøntsag", # 2
|
||||
"gudbenådet": "gudsbenådet", # 1
|
||||
"gudfader": "gudfar", # 1
|
||||
"gudmoder": "gudmor", # 1
|
||||
"gulvmop": "gulvmoppe", # 1
|
||||
"gymnasium": "gymnasie", # 1
|
||||
"hackning": "hacking", # 1
|
||||
"halvbroder": "halvbror", # 1
|
||||
"halvelvetiden": "halvellevetiden", # 1
|
||||
"handelsgymnasium": "handelsgymnasie", # 1
|
||||
"hefte": "hæfte", # 1
|
||||
"hefteklamme": "hæfteklamme", # 1
|
||||
"heftelse": "hæftelse", # 1
|
||||
"heftemaskine": "hæftemaskine", # 1
|
||||
"heftepistol": "hæftepistol", # 1
|
||||
"hefteplaster": "hæfteplaster", # 1
|
||||
"heftestraf": "hæftestraf", # 1
|
||||
"heftning": "hæftning", # 1
|
||||
"helbroder": "helbror", # 1
|
||||
"hjemmeklasse": "hjemklasse", # 1
|
||||
"hjulspin": "hjulspind", # 1
|
||||
"huggevåben": "hugvåben", # 1
|
||||
"hulmurisolering": "hulmursisolering", # 1
|
||||
"hurtiggående": "hurtigtgående", # 2
|
||||
"hurtigttørrende": "hurtigtørrende", # 2
|
||||
"husmoder": "husmor", # 1
|
||||
"hydroxyd": "hydroxid", # 1
|
||||
"håndmikser": "håndmixer", # 1
|
||||
"højtaler": "højttaler", # 2
|
||||
"hønemoder": "hønemor", # 1
|
||||
"ide": "idé", # 2
|
||||
"imperium": "imperie", # 1
|
||||
"imponerthed": "imponerethed", # 1
|
||||
"inbox": "indboks", # 2
|
||||
"indenrigsministerium": "indenrigsministerie", # 1
|
||||
"indhefte": "indhæfte", # 1
|
||||
"indheftning": "indhæftning", # 1
|
||||
"indicium": "indicie", # 1
|
||||
"indkassere": "inkassere", # 2
|
||||
"iota": "jota", # 1
|
||||
"jobskift": "jobskifte", # 1
|
||||
"jogurt": "yoghurt", # 1
|
||||
"jukeboks": "jukebox", # 1
|
||||
"justitsministerium": "justitsministerie", # 1
|
||||
"kalorifere": "kalorifer", # 1
|
||||
"kandidatstipendium": "kandidatstipendie", # 1
|
||||
"kannevas": "kanvas", # 1
|
||||
"kaperssauce": "kaperssovs", # 1
|
||||
"kigge": "kikke", # 2
|
||||
"kirkeministerium": "kirkeministerie", # 1
|
||||
"klapmydse": "klapmyds", # 1
|
||||
"klimakterium": "klimakterie", # 1
|
||||
"klogeligt": "klogelig", # 1
|
||||
"knivblad": "knivsblad", # 1
|
||||
"kollegaer": "kolleger", # 2
|
||||
"kollegium": "kollegie", # 1
|
||||
"kollegiehefte": "kollegiehæfte", # 1
|
||||
"kollokviumx": "kollokvium", # 1
|
||||
"kommissorium": "kommissorie", # 1
|
||||
"kompendium": "kompendie", # 1
|
||||
"komplicerthed": "komplicerethed", # 1
|
||||
"konfederation": "konføderation", # 1
|
||||
"konfedereret": "konfødereret", # 1
|
||||
"konferensstudium": "konferensstudie", # 1
|
||||
"konservatorium": "konservatorie", # 1
|
||||
"konsulere": "konsultere", # 1
|
||||
"kradsbørstig": "krasbørstig", # 2
|
||||
"kravsspecifikation": "kravspecifikation", # 1
|
||||
"krematorium": "krematorie", # 1
|
||||
"krep": "crepe", # 1
|
||||
"krepnylon": "crepenylon", # 1
|
||||
"kreppapir": "crepepapir", # 1
|
||||
"kricket": "cricket", # 2
|
||||
"kriterium": "kriterie", # 1
|
||||
"kroat": "kroater", # 2
|
||||
"kroki": "croquis", # 1
|
||||
"kronprinsepar": "kronprinspar", # 2
|
||||
"kropdoven": "kropsdoven", # 1
|
||||
"kroplus": "kropslus", # 1
|
||||
"krøllefedt": "krølfedt", # 1
|
||||
"kulturministerium": "kulturministerie", # 1
|
||||
"kuponhefte": "kuponhæfte", # 1
|
||||
"kvota": "kvote", # 1
|
||||
"kvotaordning": "kvoteordning", # 1
|
||||
"laboratorium": "laboratorie", # 1
|
||||
"laksfarve": "laksefarve", # 1
|
||||
"laksfarvet": "laksefarvet", # 1
|
||||
"laksrød": "lakserød", # 1
|
||||
"laksyngel": "lakseyngel", # 1
|
||||
"laksørred": "lakseørred", # 1
|
||||
"landbrugsministerium": "landbrugsministerie", # 1
|
||||
"landskampstemning": "landskampsstemning", # 1
|
||||
"langust": "languster", # 1
|
||||
"lappegrejer": "lappegrej", # 1
|
||||
"lavløn": "lavtløn", # 1
|
||||
"lillebroder": "lillebror", # 1
|
||||
"linear": "lineær", # 1
|
||||
"loftlampe": "loftslampe", # 2
|
||||
"log-in": "login", # 1
|
||||
"login": "log-in", # 2
|
||||
"lovmedholdig": "lovmedholdelig", # 1
|
||||
"ludder": "luder", # 2
|
||||
"lysholder": "lyseholder", # 1
|
||||
"lægeskifte": "lægeskift", # 1
|
||||
"lærvillig": "lærevillig", # 1
|
||||
"løgsauce": "løgsovs", # 1
|
||||
"madmoder": "madmor", # 1
|
||||
"majonæse": "mayonnaise", # 1
|
||||
"mareridtagtig": "mareridtsagtig", # 1
|
||||
"margen": "margin", # 2
|
||||
"martyrium": "martyrie", # 1
|
||||
"mellemstatlig": "mellemstatslig", # 1
|
||||
"menneskene": "menneskerne", # 2
|
||||
"metropolis": "metropol", # 1
|
||||
"miks": "mix", # 1
|
||||
"mikse": "mixe", # 1
|
||||
"miksepult": "mixerpult", # 1
|
||||
"mikser": "mixer", # 1
|
||||
"mikserpult": "mixerpult", # 1
|
||||
"mikslån": "mixlån", # 1
|
||||
"miksning": "mixning", # 1
|
||||
"miljøministerium": "miljøministerie", # 1
|
||||
"milliarddel": "milliardtedel", # 1
|
||||
"milliondel": "milliontedel", # 1
|
||||
"ministerium": "ministerie", # 1
|
||||
"mop": "moppe", # 1
|
||||
"moder": "mor", # 2
|
||||
"moratorium": "moratorie", # 1
|
||||
"morbroder": "morbror", # 1
|
||||
"morfader": "morfar", # 1
|
||||
"mormoder": "mormor", # 1
|
||||
"musikkonservatorium": "musikkonservatorie", # 1
|
||||
"muslingskal": "muslingeskal", # 1
|
||||
"mysterium": "mysterie", # 1
|
||||
"naturalieydelse": "naturalydelse", # 1
|
||||
"naturalieøkonomi": "naturaløkonomi", # 1
|
||||
"navnebroder": "navnebror", # 1
|
||||
"nerium": "nerie", # 1
|
||||
"nådeløs": "nådesløs", # 1
|
||||
"nærforestående": "nærtforestående", # 1
|
||||
"nærstående": "nærtstående", # 1
|
||||
"observatorium": "observatorie", # 1
|
||||
"oldefader": "oldefar", # 1
|
||||
"oldemoder": "oldemor", # 1
|
||||
"opgraduere": "opgradere", # 1
|
||||
"opgraduering": "opgradering", # 1
|
||||
"oratorium": "oratorie", # 1
|
||||
"overbookning": "overbooking", # 1
|
||||
"overpræsidium": "overpræsidie", # 1
|
||||
"overstatlig": "overstatslig", # 1
|
||||
"oxyd": "oxid", # 1
|
||||
"oxydere": "oxidere", # 1
|
||||
"oxydering": "oxidering", # 1
|
||||
"pakkenellike": "pakkenelliker", # 1
|
||||
"papirtynd": "papirstynd", # 1
|
||||
"pastoralseminarium": "pastoralseminarie", # 1
|
||||
"peanutsene": "peanuttene", # 2
|
||||
"penalhus": "pennalhus", # 2
|
||||
"pensakrav": "pensumkrav", # 1
|
||||
"pepperoni": "peperoni", # 1
|
||||
"peruaner": "peruvianer", # 1
|
||||
"petrole": "petrol", # 1
|
||||
"piltast": "piletast", # 1
|
||||
"piltaste": "piletast", # 1
|
||||
"planetarium": "planetarie", # 1
|
||||
"plasteret": "plastret", # 2
|
||||
"plastic": "plastik", # 2
|
||||
"play-off-kamp": "playoffkamp", # 1
|
||||
"plejefader": "plejefar", # 1
|
||||
"plejemoder": "plejemor", # 1
|
||||
"podium": "podie", # 2
|
||||
"praha": "prag", # 2
|
||||
"preciøs": "pretiøs", # 2
|
||||
"privilegium": "privilegie", # 1
|
||||
"progredere": "progrediere", # 1
|
||||
"præsidium": "præsidie", # 1
|
||||
"psykodelisk": "psykedelisk", # 1
|
||||
"pudsegrejer": "pudsegrej", # 1
|
||||
"referensgruppe": "referencegruppe", # 1
|
||||
"referensramme": "referenceramme", # 1
|
||||
"refugium": "refugie", # 1
|
||||
"registeret": "registret", # 2
|
||||
"remedium": "remedie", # 1
|
||||
"remiks": "remix", # 1
|
||||
"reservert": "reserveret", # 1
|
||||
"ressortministerium": "ressortministerie", # 1
|
||||
"ressource": "resurse", # 2
|
||||
"resætte": "resette", # 1
|
||||
"rettelig": "retteligt", # 1
|
||||
"rettetaste": "rettetast", # 1
|
||||
"returtaste": "returtast", # 1
|
||||
"risici": "risikoer", # 2
|
||||
"roll-on": "rollon", # 1
|
||||
"rollehefte": "rollehæfte", # 1
|
||||
"rostbøf": "roastbeef", # 1
|
||||
"rygsæksturist": "rygsækturist", # 1
|
||||
"rødstjært": "rødstjert", # 1
|
||||
"saddel": "sadel", # 2
|
||||
"samaritan": "samaritaner", # 2
|
||||
"sanatorium": "sanatorie", # 1
|
||||
"sauce": "sovs", # 1
|
||||
"scanning": "skanning", # 2
|
||||
"sceneskifte": "sceneskift", # 1
|
||||
"scilla": "skilla", # 1
|
||||
"sejflydende": "sejtflydende", # 1
|
||||
"selvstudium": "selvstudie", # 1
|
||||
"seminarium": "seminarie", # 1
|
||||
"sennepssauce": "sennepssovs ", # 1
|
||||
"servitutbeheftet": "servitutbehæftet", # 1
|
||||
"sit-in": "sitin", # 1
|
||||
"skatteministerium": "skatteministerie", # 1
|
||||
"skifer": "skiffer", # 2
|
||||
"skyldsfølelse": "skyldfølelse", # 1
|
||||
"skysauce": "skysovs", # 1
|
||||
"sladdertaske": "sladretaske", # 2
|
||||
"sladdervorn": "sladrevorn", # 2
|
||||
"slagsbroder": "slagsbror", # 1
|
||||
"slettetaste": "slettetast", # 1
|
||||
"smørsauce": "smørsovs", # 1
|
||||
"snitsel": "schnitzel", # 1
|
||||
"snobbeeffekt": "snobeffekt", # 2
|
||||
"socialministerium": "socialministerie", # 1
|
||||
"solarium": "solarie", # 1
|
||||
"soldebroder": "soldebror", # 1
|
||||
"spagetti": "spaghetti", # 1
|
||||
"spagettistrop": "spaghettistrop", # 1
|
||||
"spagettiwestern": "spaghettiwestern", # 1
|
||||
"spin-off": "spinoff", # 1
|
||||
"spinnefiskeri": "spindefiskeri", # 1
|
||||
"spolorm": "spoleorm", # 1
|
||||
"sproglaboratorium": "sproglaboratorie", # 1
|
||||
"spækbræt": "spækkebræt", # 2
|
||||
"stand-in": "standin", # 1
|
||||
"stand-up-comedy": "standupcomedy", # 1
|
||||
"stand-up-komiker": "standupkomiker", # 1
|
||||
"statsministerium": "statsministerie", # 1
|
||||
"stedbroder": "stedbror", # 1
|
||||
"stedfader": "stedfar", # 1
|
||||
"stedmoder": "stedmor", # 1
|
||||
"stilehefte": "stilehæfte", # 1
|
||||
"stipendium": "stipendie", # 1
|
||||
"stjært": "stjert", # 1
|
||||
"stjærthage": "stjerthage", # 1
|
||||
"storebroder": "storebror", # 1
|
||||
"stortå": "storetå", # 1
|
||||
"strabads": "strabadser", # 1
|
||||
"strømlinjet": "strømlinet", # 1
|
||||
"studium": "studie", # 1
|
||||
"stænkelap": "stænklap", # 1
|
||||
"sundhedsministerium": "sundhedsministerie", # 1
|
||||
"suppositorium": "suppositorie", # 1
|
||||
"svejts": "schweiz", # 1
|
||||
"svejtser": "schweizer", # 1
|
||||
"svejtserfranc": "schweizerfranc", # 1
|
||||
"svejtserost": "schweizerost", # 1
|
||||
"svejtsisk": "schweizisk", # 1
|
||||
"svigerfader": "svigerfar", # 1
|
||||
"svigermoder": "svigermor", # 1
|
||||
"svirebroder": "svirebror", # 1
|
||||
"symposium": "symposie", # 1
|
||||
"sælarium": "sælarie", # 1
|
||||
"søreme": "sørme", # 2
|
||||
"søterritorium": "søterritorie", # 1
|
||||
"t-bone-steak": "t-bonesteak", # 1
|
||||
"tabgivende": "tabsgivende", # 1
|
||||
"tabuere": "tabuisere", # 1
|
||||
"tabuering": "tabuisering", # 1
|
||||
"tackle": "takle", # 2
|
||||
"tackling": "takling", # 2
|
||||
"taifun": "tyfon", # 1
|
||||
"take-off": "takeoff", # 1
|
||||
"taknemlig": "taknemmelig", # 2
|
||||
"talehørelærer": "tale-høre-lærer", # 1
|
||||
"talehøreundervisning": "tale-høre-undervisning", # 1
|
||||
"tandstik": "tandstikker", # 1
|
||||
"tao": "dao", # 1
|
||||
"taoisme": "daoisme", # 1
|
||||
"taoist": "daoist", # 1
|
||||
"taoistisk": "daoistisk", # 1
|
||||
"taverne": "taverna", # 1
|
||||
"teateret": "teatret", # 2
|
||||
"tekno": "techno", # 1
|
||||
"temposkifte": "temposkift", # 1
|
||||
"terrarium": "terrarie", # 1
|
||||
"territorium": "territorie", # 1
|
||||
"tesis": "tese", # 1
|
||||
"tidsstudium": "tidsstudie", # 1
|
||||
"tipoldefader": "tipoldefar", # 1
|
||||
"tipoldemoder": "tipoldemor", # 1
|
||||
"tomatsauce": "tomatsovs", # 1
|
||||
"tonart": "toneart", # 1
|
||||
"trafikministerium": "trafikministerie", # 1
|
||||
"tredve": "tredive", # 1
|
||||
"tredver": "trediver", # 1
|
||||
"tredveårig": "trediveårig", # 1
|
||||
"tredveårs": "trediveårs", # 1
|
||||
"tredveårsfødselsdag": "trediveårsfødselsdag", # 1
|
||||
"tredvte": "tredivte", # 1
|
||||
"tredvtedel": "tredivtedel", # 1
|
||||
"troldunge": "troldeunge", # 1
|
||||
"trommestikke": "trommestik", # 1
|
||||
"trubadur": "troubadour", # 2
|
||||
"trøstepræmie": "trøstpræmie", # 2
|
||||
"tummerum": "trummerum", # 1
|
||||
"tumultuarisk": "tumultarisk", # 1
|
||||
"tunghørighed": "tunghørhed", # 1
|
||||
"tus": "tusch", # 2
|
||||
"tusind": "tusinde", # 2
|
||||
"tvillingbroder": "tvillingebror", # 1
|
||||
"tvillingbror": "tvillingebror", # 1
|
||||
"tvillingebroder": "tvillingebror", # 1
|
||||
"ubeheftet": "ubehæftet", # 1
|
||||
"udenrigsministerium": "udenrigsministerie", # 1
|
||||
"udhulning": "udhuling", # 1
|
||||
"udslaggivende": "udslagsgivende", # 1
|
||||
"udspekulert": "udspekuleret", # 1
|
||||
"udviklingsministerium": "udviklingsministerie", # 1
|
||||
"uforpligtigende": "uforpligtende", # 1
|
||||
"uheldvarslende": "uheldsvarslende", # 1
|
||||
"uimponerthed": "uimponerethed", # 1
|
||||
"undervisningsministerium": "undervisningsministerie", # 1
|
||||
"unægtelig": "unægteligt", # 1
|
||||
"urinale": "urinal", # 1
|
||||
"uvederheftig": "uvederhæftig", # 1
|
||||
"vabel": "vable", # 2
|
||||
"vadi": "wadi", # 1
|
||||
"vaklevorn": "vakkelvorn", # 1
|
||||
"vanadin": "vanadium", # 1
|
||||
"vaselin": "vaseline", # 1
|
||||
"vederheftig": "vederhæftig", # 1
|
||||
"vedhefte": "vedhæfte", # 1
|
||||
"velar": "velær", # 1
|
||||
"videndeling": "vidensdeling", # 2
|
||||
"vinkelanførelsestegn": "vinkelanførselstegn", # 1
|
||||
"vipstjært": "vipstjert", # 1
|
||||
"vismut": "bismut", # 1
|
||||
"visvas": "vissevasse", # 1
|
||||
"voksværk": "vokseværk", # 1
|
||||
"værtdyr": "værtsdyr", # 1
|
||||
"værtplante": "værtsplante", # 1
|
||||
"wienersnitsel": "wienerschnitzel", # 1
|
||||
"yderliggående": "yderligtgående", # 2
|
||||
"zombi": "zombie", # 1
|
||||
"ægbakke": "æggebakke", # 1
|
||||
"ægformet": "æggeformet", # 1
|
||||
"ægleder": "æggeleder", # 1
|
||||
"ækvilibrist": "ekvilibrist", # 2
|
||||
"æselsøre": "æseløre", # 1
|
||||
"øjehule": "øjenhule", # 1
|
||||
"øjelåg": "øjenlåg", # 1
|
||||
"øjeåbner": "øjenåbner", # 1
|
||||
"økonomiministerium": "økonomiministerie", # 1
|
||||
"ørenring": "ørering", # 2
|
||||
"øvehefte": "øvehæfte", # 1
|
||||
}
|
||||
|
||||
|
||||
NORM_EXCEPTIONS = {}
|
||||
|
||||
for string, norm in _exc.items():
|
||||
NORM_EXCEPTIONS[string] = norm
|
||||
NORM_EXCEPTIONS[string.title()] = norm
|
|
@@ -2,7 +2,6 @@
 from __future__ import unicode_literals

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .norm_exceptions import NORM_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .punctuation import TOKENIZER_INFIXES
 from .tag_map import TAG_MAP
@@ -10,18 +9,14 @@ from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...attrs import LANG
+from ...util import update_exc


 class GermanDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: "de"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], NORM_EXCEPTIONS, BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
@@ -1,16 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-# Here we only want to include the absolute most common words. Otherwise,
-# this list would get impossibly long for German – especially considering the
-# old vs. new spelling rules, and all possible cases.
-
-
-_exc = {"daß": "dass"}
-
-
-NORM_EXCEPTIONS = {}
-
-for string, norm in _exc.items():
-    NORM_EXCEPTIONS[string] = norm
-    NORM_EXCEPTIONS[string.title()] = norm
@@ -10,21 +10,16 @@ from .lemmatizer import GreekLemmatizer
 from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from .norm_exceptions import NORM_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...lookups import Lookups
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...attrs import LANG
+from ...util import update_exc


 class GreekDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: "el"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
File diff suppressed because it is too large
Load Diff
@@ -2,7 +2,6 @@
 from __future__ import unicode_literals

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .norm_exceptions import NORM_EXCEPTIONS
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
@@ -10,10 +9,9 @@ from .morph_rules import MORPH_RULES
 from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...attrs import LANG
+from ...util import update_exc


 def _return_en(_):
@@ -24,9 +22,6 @@ class EnglishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = _return_en
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     tag_map = TAG_MAP
     stop_words = STOP_WORDS
File diff suppressed because it is too large
Load Diff
@@ -4,25 +4,20 @@ from __future__ import unicode_literals
 from .stop_words import STOP_WORDS
 from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .norm_exceptions import NORM_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .tag_map import TAG_MAP

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...attrs import LANG
+from ...util import update_exc


 class IndonesianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: "id"
     lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
@ -1,532 +0,0 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
# Daftar kosakata yang sering salah dieja
|
||||
# https://id.wikipedia.org/wiki/Wikipedia:Daftar_kosakata_bahasa_Indonesia_yang_sering_salah_dieja
|
||||
_exc = {
|
||||
# Slang and abbreviations
|
||||
"silahkan": "silakan",
|
||||
"yg": "yang",
|
||||
"kalo": "kalau",
|
||||
"cawu": "caturwulan",
|
||||
"ok": "oke",
|
||||
"gak": "tidak",
|
||||
"enggak": "tidak",
|
||||
"nggak": "tidak",
|
||||
"ndak": "tidak",
|
||||
"ngga": "tidak",
|
||||
"dgn": "dengan",
|
||||
"tdk": "tidak",
|
||||
"jg": "juga",
|
||||
"klo": "kalau",
|
||||
"denger": "dengar",
|
||||
"pinter": "pintar",
|
||||
"krn": "karena",
|
||||
"nemuin": "menemukan",
|
||||
"jgn": "jangan",
|
||||
"udah": "sudah",
|
||||
"sy": "saya",
|
||||
"udh": "sudah",
|
||||
"dapetin": "mendapatkan",
|
||||
"ngelakuin": "melakukan",
|
||||
"ngebuat": "membuat",
|
||||
"membikin": "membuat",
|
||||
"bikin": "buat",
|
||||
# Daftar kosakata yang sering salah dieja
|
||||
"malpraktik": "malapraktik",
|
||||
"malfungsi": "malafungsi",
|
||||
"malserap": "malaserap",
|
||||
"maladaptasi": "malaadaptasi",
|
||||
"malsuai": "malasuai",
|
||||
"maldistribusi": "maladistribusi",
|
||||
"malgizi": "malagizi",
|
||||
"malsikap": "malasikap",
|
||||
"memperhatikan": "memerhatikan",
|
||||
"akte": "akta",
|
||||
"cemilan": "camilan",
|
||||
"esei": "esai",
|
||||
"frase": "frasa",
|
||||
"kafeteria": "kafetaria",
|
||||
"ketapel": "katapel",
|
||||
"kenderaan": "kendaraan",
|
||||
"menejemen": "manajemen",
|
||||
"menejer": "manajer",
|
||||
"mesjid": "masjid",
|
||||
"rebo": "rabu",
|
||||
"seksama": "saksama",
|
||||
"senggama": "sanggama",
|
||||
"sekedar": "sekadar",
|
||||
"seprei": "seprai",
|
||||
"semedi": "semadi",
|
||||
"samadi": "semadi",
|
||||
"amandemen": "amendemen",
|
||||
"algoritma": "algoritme",
|
||||
"aritmatika": "aritmetika",
|
||||
"metoda": "metode",
|
||||
"materai": "meterai",
|
||||
"meterei": "meterai",
|
||||
"kalendar": "kalender",
|
||||
"kadaluwarsa": "kedaluwarsa",
|
||||
"katagori": "kategori",
|
||||
"parlamen": "parlemen",
|
||||
"sekular": "sekuler",
|
||||
"selular": "seluler",
|
||||
"sirkular": "sirkuler",
|
||||
"survai": "survei",
|
||||
"survey": "survei",
|
||||
"aktuil": "aktual",
|
||||
"formil": "formal",
|
||||
"trotoir": "trotoar",
|
||||
"komersiil": "komersial",
|
||||
"komersil": "komersial",
|
||||
"tradisionil": "tradisionial",
|
||||
"orisinil": "orisinal",
|
||||
"orijinil": "orisinal",
|
||||
"afdol": "afdal",
|
||||
"antri": "antre",
|
||||
"apotik": "apotek",
|
||||
"atlit": "atlet",
|
||||
"atmosfir": "atmosfer",
|
||||
"cidera": "cedera",
|
||||
"cendikiawan": "cendekiawan",
|
||||
"cepet": "cepat",
|
||||
"cinderamata": "cenderamata",
|
||||
"debet": "debit",
|
||||
"difinisi": "definisi",
|
||||
"dekrit": "dekret",
|
||||
"disain": "desain",
|
||||
"diskripsi": "deskripsi",
|
||||
"diskotik": "diskotek",
|
||||
"eksim": "eksem",
|
||||
"exim": "eksem",
|
||||
"faidah": "faedah",
|
||||
"ekstrim": "ekstrem",
|
||||
"ekstrimis": "ekstremis",
|
||||
"komplit": "komplet",
|
||||
"konkrit": "konkret",
|
||||
"kongkrit": "konkret",
|
||||
"kongkret": "konkret",
|
||||
"kridit": "kredit",
|
||||
"musium": "museum",
|
||||
"pinalti": "penalti",
|
||||
"piranti": "peranti",
|
||||
"pinsil": "pensil",
|
||||
"personil": "personel",
|
||||
"sistim": "sistem",
|
||||
"teoritis": "teoretis",
|
||||
"vidio": "video",
|
||||
"cengkeh": "cengkih",
|
||||
"desertasi": "disertasi",
|
||||
"hakekat": "hakikat",
|
||||
"intelejen": "intelijen",
|
||||
"kaedah": "kaidah",
|
||||
"kempes": "kempis",
|
||||
"kementrian": "kementerian",
|
||||
"ledeng": "leding",
|
||||
"nasehat": "nasihat",
|
||||
"penasehat": "penasihat",
|
||||
"praktek": "praktik",
|
||||
"praktekum": "praktikum",
|
||||
"resiko": "risiko",
|
||||
"retsleting": "ritsleting",
|
||||
"senen": "senin",
|
||||
"amuba": "ameba",
|
||||
"punggawa": "penggawa",
|
||||
"surban": "serban",
|
||||
"nomer": "nomor",
|
||||
"sorban": "serban",
|
||||
"bis": "bus",
|
||||
"agribisnis": "agrobisnis",
|
||||
"kantung": "kantong",
|
||||
"khutbah": "khotbah",
|
||||
"mandur": "mandor",
|
||||
"rubuh": "roboh",
|
||||
"pastur": "pastor",
|
||||
"supir": "sopir",
|
||||
"goncang": "guncang",
|
||||
"goa": "gua",
|
||||
"kaos": "kaus",
|
||||
"kokoh": "kukuh",
|
||||
"komulatif": "kumulatif",
|
||||
"kolomnis": "kolumnis",
|
||||
"korma": "kurma",
|
||||
"lobang": "lubang",
|
||||
"limo": "limusin",
|
||||
"limosin": "limusin",
|
||||
"mangkok": "mangkuk",
|
||||
"saos": "saus",
|
||||
"sop": "sup",
|
||||
"sorga": "surga",
|
||||
"tegor": "tegur",
|
||||
"telor": "telur",
|
||||
"obrak-abrik": "ubrak-abrik",
|
||||
"ekwivalen": "ekuivalen",
|
||||
"frekwensi": "frekuensi",
|
||||
"konsekwensi": "konsekuensi",
|
||||
"kwadran": "kuadran",
|
||||
"kwadrat": "kuadrat",
|
||||
"kwalifikasi": "kualifikasi",
|
||||
"kwalitas": "kualitas",
|
||||
"kwalitet": "kualitas",
|
||||
"kwalitatif": "kualitatif",
|
||||
"kwantitas": "kuantitas",
|
||||
"kwantitatif": "kuantitatif",
|
||||
"kwantum": "kuantum",
|
||||
"kwartal": "kuartal",
|
||||
"kwintal": "kuintal",
|
||||
"kwitansi": "kuitansi",
|
||||
"kwatir": "khawatir",
|
||||
"kuatir": "khawatir",
|
||||
"jadual": "jadwal",
|
||||
"hirarki": "hierarki",
|
||||
"karir": "karier",
|
||||
"aktip": "aktif",
|
||||
"daptar": "daftar",
|
||||
"efektip": "efektif",
|
||||
"epektif": "efektif",
|
||||
"epektip": "efektif",
|
||||
"Pebruari": "Februari",
|
||||
"pisik": "fisik",
|
||||
"pondasi": "fondasi",
|
||||
"photo": "foto",
|
||||
"photokopi": "fotokopi",
|
||||
"hapal": "hafal",
|
||||
"insap": "insaf",
|
||||
"insyaf": "insaf",
|
||||
"konperensi": "konferensi",
|
||||
"kreatip": "kreatif",
|
||||
"kreativ": "kreatif",
|
||||
"maap": "maaf",
|
||||
"napsu": "nafsu",
|
||||
"negatip": "negatif",
|
||||
"negativ": "negatif",
|
||||
"objektip": "objektif",
|
||||
"obyektip": "objektif",
|
||||
"obyektif": "objektif",
|
||||
"pasip": "pasif",
|
||||
"pasiv": "pasif",
|
||||
"positip": "positif",
|
||||
"positiv": "positif",
|
||||
"produktip": "produktif",
|
||||
"produktiv": "produktif",
|
||||
"sarap": "saraf",
|
||||
"sertipikat": "sertifikat",
|
||||
"subjektip": "subjektif",
|
||||
"subyektip": "subjektif",
|
||||
"subyektif": "subjektif",
|
||||
"tarip": "tarif",
|
||||
"transitip": "transitif",
|
||||
"transitiv": "transitif",
|
||||
"faham": "paham",
|
||||
"fikir": "pikir",
|
||||
"berfikir": "berpikir",
|
||||
"telefon": "telepon",
|
||||
"telfon": "telepon",
|
||||
"telpon": "telepon",
|
||||
"tilpon": "telepon",
|
||||
"nafas": "napas",
|
||||
"bernafas": "bernapas",
|
||||
"pernafasan": "pernapasan",
|
||||
"vermak": "permak",
|
||||
"vulpen": "pulpen",
|
||||
"aktifis": "aktivis",
|
||||
"konfeksi": "konveksi",
|
||||
"motifasi": "motivasi",
|
||||
"Nopember": "November",
|
||||
"propinsi": "provinsi",
|
||||
"babtis": "baptis",
|
||||
"jerembab": "jerembap",
|
||||
"lembab": "lembap",
|
||||
"sembab": "sembap",
|
||||
"saptu": "sabtu",
|
||||
"tekat": "tekad",
|
||||
"bejad": "bejat",
|
||||
"nekad": "nekat",
|
||||
"otoped": "otopet",
|
||||
"skuad": "skuat",
|
||||
"jenius": "genius",
|
||||
"marjin": "margin",
|
||||
"marjinal": "marginal",
|
||||
"obyek": "objek",
|
||||
"subyek": "subjek",
|
||||
"projek": "proyek",
|
||||
"azas": "asas",
|
||||
"ijasah": "ijazah",
|
||||
"jenasah": "jenazah",
|
||||
"plasa": "plaza",
|
||||
"bathin": "batin",
|
||||
"Katholik": "Katolik",
|
||||
"orthografi": "ortografi",
|
||||
"pathogen": "patogen",
|
||||
"theologi": "teologi",
|
||||
"ijin": "izin",
|
||||
"rejeki": "rezeki",
|
||||
"rejim": "rezim",
|
||||
"jaman": "zaman",
|
||||
"jamrud": "zamrud",
|
||||
"jinah": "zina",
|
||||
"perjinahan": "perzinaan",
|
||||
"anugrah": "anugerah",
|
||||
"cendrawasih": "cenderawasih",
|
||||
"jendral": "jenderal",
|
||||
"kripik": "keripik",
|
||||
"krupuk": "kerupuk",
|
||||
"ksatria": "kesatria",
|
||||
"mentri": "menteri",
|
||||
"negri": "negeri",
|
||||
"Prancis": "Perancis",
|
||||
"sebrang": "seberang",
|
||||
"menyebrang": "menyeberang",
|
||||
"Sumatra": "Sumatera",
|
||||
"trampil": "terampil",
|
||||
"isteri": "istri",
|
||||
"justeru": "justru",
|
||||
"perajurit": "prajurit",
|
||||
"putera": "putra",
|
||||
"puteri": "putri",
|
||||
"samudera": "samudra",
|
||||
"sastera": "sastra",
|
||||
"sutera": "sutra",
|
||||
"terompet": "trompet",
|
||||
"iklas": "ikhlas",
|
||||
"iktisar": "ikhtisar",
|
||||
"kafilah": "khafilah",
|
||||
"kawatir": "khawatir",
|
||||
"kotbah": "khotbah",
|
||||
"kusyuk": "khusyuk",
|
||||
"makluk": "makhluk",
|
||||
"mahluk": "makhluk",
|
||||
"mahkluk": "makhluk",
|
||||
"nahkoda": "nakhoda",
|
||||
"nakoda": "nakhoda",
|
||||
"tahta": "takhta",
|
||||
"takhyul": "takhayul",
|
||||
"tahyul": "takhayul",
|
||||
"tahayul": "takhayul",
|
||||
"akhli": "ahli",
|
||||
"anarkhi": "anarki",
|
||||
"kharisma": "karisma",
|
||||
"kharismatik": "karismatik",
|
||||
"mahsud": "maksud",
|
||||
"makhsud": "maksud",
|
||||
"rakhmat": "rahmat",
|
||||
"tekhnik": "teknik",
|
||||
"tehnik": "teknik",
|
||||
"tehnologi": "teknologi",
|
||||
"ikhwal": "ihwal",
|
||||
"expor": "ekspor",
|
||||
"extra": "ekstra",
|
||||
"komplex": "komplek",
|
||||
"sex": "seks",
|
||||
"taxi": "taksi",
|
||||
"extasi": "ekstasi",
|
||||
"syaraf": "saraf",
|
||||
"syurga": "surga",
|
||||
"mashur": "masyhur",
|
||||
"masyur": "masyhur",
|
||||
"mahsyur": "masyhur",
|
||||
"mashyur": "masyhur",
|
||||
"muadzin": "muazin",
|
||||
"adzan": "azan",
|
||||
"ustadz": "ustaz",
|
||||
"ustad": "ustaz",
|
||||
"ustadzah": "ustaz",
|
||||
"dzikir": "zikir",
|
||||
"dzuhur": "zuhur",
|
||||
"dhuhur": "zuhur",
|
||||
"zhuhur": "zuhur",
|
||||
"analisa": "analisis",
|
||||
"diagnosa": "diagnosis",
|
||||
"hipotesa": "hipotesis",
|
||||
"sintesa": "sintesis",
|
||||
"aktiviti": "aktivitas",
|
||||
"aktifitas": "aktivitas",
|
||||
"efektifitas": "efektivitas",
|
||||
"komuniti": "komunitas",
|
||||
"kreatifitas": "kreativitas",
|
||||
"produktifitas": "produktivitas",
|
||||
"realiti": "realitas",
|
||||
"realita": "realitas",
|
||||
"selebriti": "selebritas",
|
||||
"spotifitas": "sportivitas",
|
||||
"universiti": "universitas",
|
||||
"utiliti": "utilitas",
|
||||
"validiti": "validitas",
|
||||
"dilokalisir": "dilokalisasi",
|
||||
"didramatisir": "didramatisasi",
|
||||
"dipolitisir": "dipolitisasi",
|
||||
"dinetralisir": "dinetralisasi",
|
||||
"dikonfrontir": "dikonfrontasi",
|
||||
"mendominir": "mendominasi",
|
||||
"koordinir": "koordinasi",
|
||||
"proklamir": "proklamasi",
|
||||
"terorganisir": "terorganisasi",
|
||||
"terealisir": "terealisasi",
|
||||
"robah": "ubah",
|
||||
"dirubah": "diubah",
|
||||
"merubah": "mengubah",
|
||||
"terlanjur": "telanjur",
|
||||
"terlantar": "telantar",
|
||||
"penglepasan": "pelepasan",
|
||||
"pelihatan": "penglihatan",
|
||||
"pemukiman": "permukiman",
|
||||
"pengrumahan": "perumahan",
|
||||
"penyewaan": "persewaan",
|
||||
"menyintai": "mencintai",
|
||||
"menyolok": "mencolok",
|
||||
"contek": "sontek",
|
||||
"mencontek": "menyontek",
|
||||
"pungkir": "mungkir",
|
||||
"dipungkiri": "dimungkiri",
|
||||
"kupungkiri": "kumungkiri",
|
||||
"kaupungkiri": "kaumungkiri",
|
||||
"nampak": "tampak",
|
||||
"nampaknya": "tampaknya",
|
||||
"nongkrong": "tongkrong",
|
||||
"berternak": "beternak",
|
||||
"berterbangan": "beterbangan",
|
||||
"berserta": "beserta",
|
||||
"berperkara": "beperkara",
|
||||
"berpergian": "bepergian",
|
||||
"berkerja": "bekerja",
|
||||
"berberapa": "beberapa",
|
||||
"terbersit": "tebersit",
|
||||
"terpercaya": "tepercaya",
|
||||
"terperdaya": "teperdaya",
|
||||
"terpercik": "tepercik",
|
||||
"terpergok": "tepergok",
|
||||
"aksesoris": "aksesori",
|
||||
"handal": "andal",
|
||||
"hantar": "antar",
|
||||
"panutan": "anutan",
|
||||
"atsiri": "asiri",
|
||||
"bhakti": "bakti",
|
||||
"china": "cina",
|
||||
"dharma": "darma",
|
||||
"diktaktor": "diktator",
|
||||
"eksport": "ekspor",
|
||||
"hembus": "embus",
|
||||
"hadits": "hadis",
|
||||
"hadist": "hadits",
|
||||
"harafiah": "harfiah",
|
||||
"himbau": "imbau",
|
||||
"import": "impor",
|
||||
"inget": "ingat",
|
||||
"hisap": "isap",
|
||||
"interprestasi": "interpretasi",
|
||||
"kangker": "kanker",
|
||||
"konggres": "kongres",
|
||||
"lansekap": "lanskap",
|
||||
"maghrib": "magrib",
|
||||
"emak": "mak",
|
||||
"moderen": "modern",
|
||||
"pasport": "paspor",
|
||||
"perduli": "peduli",
|
||||
"ramadhan": "ramadan",
|
||||
"rapih": "rapi",
|
||||
"Sansekerta": "Sanskerta",
|
||||
"shalat": "salat",
|
||||
"sholat": "salat",
|
||||
"silahkan": "silakan",
|
||||
"standard": "standar",
|
||||
"hutang": "utang",
|
||||
"zinah": "zina",
|
||||
"ambulan": "ambulans",
|
||||
"antartika": "sntarktika",
|
||||
"arteri": "arteria",
|
||||
"asik": "asyik",
|
||||
"australi": "australia",
|
||||
"denga": "dengan",
|
||||
"depo": "depot",
|
||||
"detil": "detail",
|
||||
"ensiklopedi": "ensiklopedia",
|
||||
"elit": "elite",
|
||||
"frustasi": "frustrasi",
|
||||
"gladi": "geladi",
|
||||
"greget": "gereget",
|
||||
"itali": "italia",
|
||||
"karna": "karena",
|
||||
"klenteng": "kelenteng",
|
||||
"erling": "kerling",
|
||||
"kontruksi": "konstruksi",
|
||||
"masal": "massal",
|
||||
"merk": "merek",
|
||||
"respon": "respons",
|
||||
"diresponi": "direspons",
|
||||
"skak": "sekak",
|
||||
"stir": "setir",
|
||||
"singapur": "singapura",
|
||||
"standarisasi": "standardisasi",
|
||||
"varitas": "varietas",
|
||||
"amphibi": "amfibi",
|
||||
"anjlog": "anjlok",
|
||||
"alpukat": "avokad",
|
||||
"alpokat": "avokad",
|
||||
"bolpen": "pulpen",
|
||||
"cabe": "cabai",
|
||||
"cabay": "cabai",
|
||||
"ceret": "cerek",
|
||||
"differensial": "diferensial",
|
||||
"duren": "durian",
|
||||
"faksimili": "faksimile",
|
||||
"faksimil": "faksimile",
|
||||
"graha": "gerha",
|
||||
"goblog": "goblok",
|
||||
"gombrong": "gombroh",
|
||||
"horden": "gorden",
|
||||
"korden": "gorden",
|
||||
"gubug": "gubuk",
|
||||
"imaginasi": "imajinasi",
|
||||
"jerigen": "jeriken",
|
||||
"jirigen": "jeriken",
|
||||
"carut-marut": "karut-marut",
|
||||
"kwota": "kuota",
|
||||
"mahzab": "mazhab",
|
||||
"mempesona": "memesona",
|
||||
"milyar": "miliar",
|
||||
"missi": "misi",
|
||||
"nenas": "nanas",
|
||||
"negoisasi": "negosiasi",
|
||||
"automotif": "otomotif",
|
||||
"pararel": "paralel",
|
||||
"paska": "pasca",
|
||||
"prosen": "persen",
|
||||
"pete": "petai",
|
||||
"petay": "petai",
|
||||
"proffesor": "profesor",
|
||||
"rame": "ramai",
|
||||
"rapot": "rapor",
|
||||
"rileks": "relaks",
|
||||
"rileksasi": "relaksasi",
|
||||
"renumerasi": "remunerasi",
|
||||
"seketaris": "sekretaris",
|
||||
"sekertaris": "sekretaris",
|
||||
"sensorik": "sensoris",
|
||||
"sentausa": "sentosa",
|
||||
"strawberi": "stroberi",
|
||||
"strawbery": "stroberi",
|
||||
"taqwa": "takwa",
|
||||
"tauco": "taoco",
|
||||
"tauge": "taoge",
|
||||
"toge": "taoge",
|
||||
"tauladan": "teladan",
|
||||
"taubat": "tobat",
|
||||
"trilyun": "triliun",
|
||||
"vissi": "visi",
|
||||
"coklat": "cokelat",
|
||||
"narkotika": "narkotik",
|
||||
"oase": "oasis",
|
||||
"politisi": "politikus",
|
||||
"terong": "terung",
|
||||
"wool": "wol",
|
||||
"himpit": "impit",
|
||||
"mujizat": "mukjizat",
|
||||
"mujijat": "mukjizat",
|
||||
"yag": "yang",
|
||||
}
|
||||
|
||||
NORM_EXCEPTIONS = {}
|
||||
|
||||
for string, norm in _exc.items():
|
||||
NORM_EXCEPTIONS[string] = norm
|
||||
NORM_EXCEPTIONS[string.title()] = norm
|
|
@@ -2,26 +2,21 @@
 from __future__ import unicode_literals

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .norm_exceptions import NORM_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES
 from .lex_attrs import LEX_ATTRS
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...attrs import LANG
+from ...util import update_exc


 class LuxembourgishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: "lb"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], NORM_EXCEPTIONS, BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
@@ -1,16 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-# TODO
-# norm execptions: find a possibility to deal with the zillions of spelling
-# variants (vläicht = vlaicht, vleicht, viläicht, viläischt, etc. etc.)
-# here one could include the most common spelling mistakes
-
-_exc = {"dass": "datt", "viläicht": "vläicht"}
-
-
-NORM_EXCEPTIONS = {}
-
-for string, norm in _exc.items():
-    NORM_EXCEPTIONS[string] = norm
-    NORM_EXCEPTIONS[string.title()] = norm
@@ -186,10 +186,6 @@ def suffix(string):
     return string[-3:]


-def cluster(string):
-    return 0
-
-
 def is_alpha(string):
     return string.isalpha()

@@ -218,20 +214,11 @@ def is_stop(string, stops=set()):
     return string.lower() in stops


-def is_oov(string):
-    return True
-
-
-def get_prob(string):
-    return -20.0
-
-
 LEX_ATTRS = {
     attrs.LOWER: lower,
     attrs.NORM: lower,
     attrs.PREFIX: prefix,
     attrs.SUFFIX: suffix,
-    attrs.CLUSTER: cluster,
     attrs.IS_ALPHA: is_alpha,
     attrs.IS_DIGIT: is_digit,
     attrs.IS_LOWER: is_lower,
@@ -239,8 +226,6 @@ LEX_ATTRS = {
     attrs.IS_TITLE: is_title,
     attrs.IS_UPPER: is_upper,
     attrs.IS_STOP: is_stop,
-    attrs.IS_OOV: is_oov,
-    attrs.PROB: get_prob,
     attrs.LIKE_EMAIL: like_email,
     attrs.LIKE_NUM: like_num,
     attrs.IS_PUNCT: is_punct,
@@ -5,22 +5,17 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .tag_map import TAG_MAP
-from .norm_exceptions import NORM_EXCEPTIONS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...attrs import LANG
+from ...util import update_exc


 class PortugueseDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: "pt"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
-    )
     lex_attr_getters.update(LEX_ATTRS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
@@ -1,23 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-# These exceptions are used to add NORM values based on a token's ORTH value.
-# Individual languages can also add their own exceptions and overwrite them -
-# for example, British vs. American spelling in English.
-
-# Norms are only set if no alternative is provided in the tokenizer exceptions.
-# Note that this does not change any other token attributes. Its main purpose
-# is to normalise the word representations so that equivalent tokens receive
-# similar representations. For example: $ and € are very different, but they're
-# both currency symbols. By normalising currency symbols to $, all symbols are
-# seen as similar, no matter how common they are in the training data.
-
-
-NORM_EXCEPTIONS = {
-    "R$": "$",  # Real
-    "r$": "$",  # Real
-    "Cz$": "$",  # Cruzado
-    "cz$": "$",  # Cruzado
-    "NCz$": "$",  # Cruzado Novo
-    "ncz$": "$",  # Cruzado Novo
-}
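For context, the effect these exceptions have once they are served from lookups rather than from this deleted module (a sketch assuming a v2.3-era pipeline; without `spacy-lookups-data` installed the norm simply falls back to the lowercased form):

import spacy

nlp = spacy.blank("pt")
lex = nlp.vocab["R$"]
# "$" when the Portuguese lexeme_norm table supplies the exception,
# otherwise the default lowercased form "r$".
print(lex.norm_)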
@@ -3,26 +3,21 @@ from __future__ import unicode_literals, print_function

 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .norm_exceptions import NORM_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .tag_map import TAG_MAP
 from .lemmatizer import RussianLemmatizer

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
-from ...util import update_exc, add_lookups
+from ...util import update_exc
 from ...language import Language
 from ...lookups import Lookups
-from ...attrs import LANG, NORM
+from ...attrs import LANG


 class RussianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: "ru"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
@@ -1,36 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
-_exc = {
-    # Slang
-    "прив": "привет",
-    "дарова": "привет",
-    "дак": "так",
-    "дык": "так",
-    "здарова": "привет",
-    "пакедава": "пока",
-    "пакедаво": "пока",
-    "ща": "сейчас",
-    "спс": "спасибо",
-    "пжлст": "пожалуйста",
-    "плиз": "пожалуйста",
-    "ладненько": "ладно",
-    "лады": "ладно",
-    "лан": "ладно",
-    "ясн": "ясно",
-    "всм": "всмысле",
-    "хош": "хочешь",
-    "хаюшки": "привет",
-    "оч": "очень",
-    "че": "что",
-    "чо": "что",
-    "шо": "что",
-}
-
-
-NORM_EXCEPTIONS = {}
-
-for string, norm in _exc.items():
-    NORM_EXCEPTIONS[string] = norm
-    NORM_EXCEPTIONS[string.title()] = norm
@@ -3,22 +3,17 @@ from __future__ import unicode_literals

 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .norm_exceptions import NORM_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...attrs import LANG
+from ...util import update_exc


 class SerbianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: "sr"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS

@@ -1,26 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
-_exc = {
-    # Slang
-    "ћале": "отац",
-    "кева": "мајка",
-    "смор": "досада",
-    "кец": "јединица",
-    "тебра": "брат",
-    "штребер": "ученик",
-    "факс": "факултет",
-    "профа": "професор",
-    "бус": "аутобус",
-    "пискарало": "службеник",
-    "бакутанер": "бака",
-    "џибер": "простак",
-}
-
-
-NORM_EXCEPTIONS = {}
-
-for string, norm in _exc.items():
-    NORM_EXCEPTIONS[string] = norm
-    NORM_EXCEPTIONS[string.title()] = norm
@ -1,139 +0,0 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
_exc = {
|
||||
# Regional words normal
|
||||
# Sri Lanka - wikipeadia
|
||||
"இங்க": "இங்கே",
|
||||
"வாங்க": "வாருங்கள்",
|
||||
"ஒண்டு": "ஒன்று",
|
||||
"கண்டு": "கன்று",
|
||||
"கொண்டு": "கொன்று",
|
||||
"பண்டி": "பன்றி",
|
||||
"பச்ச": "பச்சை",
|
||||
"அம்பது": "ஐம்பது",
|
||||
"வெச்ச": "வைத்து",
|
||||
"வச்ச": "வைத்து",
|
||||
"வச்சி": "வைத்து",
|
||||
"வாளைப்பழம்": "வாழைப்பழம்",
|
||||
"மண்ணு": "மண்",
|
||||
"பொன்னு": "பொன்",
|
||||
"சாவல்": "சேவல்",
|
||||
"அங்கால": "அங்கு ",
|
||||
"அசுப்பு": "நடமாட்டம்",
|
||||
"எழுவான் கரை": "எழுவான்கரை",
|
||||
"ஓய்யாரம்": "எழில் ",
|
||||
"ஒளும்பு": "எழும்பு",
|
||||
"ஓர்மை": "துணிவு",
|
||||
"கச்சை": "கோவணம்",
|
||||
"கடப்பு": "தெருவாசல்",
|
||||
"சுள்ளி": "காய்ந்த குச்சி",
|
||||
"திறாவுதல்": "தடவுதல்",
|
||||
"நாசமறுப்பு": "தொல்லை",
|
||||
"பரிசாரி": "வைத்தியன்",
|
||||
"பறவாதி": "பேராசைக்காரன்",
|
||||
"பிசினி": "உலோபி ",
|
||||
"விசர்": "பைத்தியம்",
|
||||
"ஏனம்": "பாத்திரம்",
|
||||
"ஏலா": "இயலாது",
|
||||
"ஒசில்": "அழகு",
|
||||
"ஒள்ளுப்பம்": "கொஞ்சம்",
|
||||
# Srilankan and indian
|
||||
"குத்துமதிப்பு": "",
|
||||
"நூனாயம்": "நூல்நயம்",
|
||||
"பைய": "மெதுவாக",
|
||||
"மண்டை": "தலை",
|
||||
"வெள்ளனே": "சீக்கிரம்",
|
||||
"உசுப்பு": "எழுப்பு",
|
||||
"ஆணம்": "குழம்பு",
|
||||
"உறக்கம்": "தூக்கம்",
|
||||
"பஸ்": "பேருந்து",
|
||||
"களவு": "திருட்டு ",
|
||||
# relationship
|
||||
"புருசன்": "கணவன்",
|
||||
"பொஞ்சாதி": "மனைவி",
|
||||
"புள்ள": "பிள்ளை",
|
||||
"பிள்ள": "பிள்ளை",
|
||||
"ஆம்பிளப்புள்ள": "ஆண் பிள்ளை",
|
||||
"பொம்பிளப்புள்ள": "பெண் பிள்ளை",
|
||||
"அண்ணாச்சி": "அண்ணா",
|
||||
"அக்காச்சி": "அக்கா",
|
||||
"தங்கச்சி": "தங்கை",
|
||||
# difference words
|
||||
"பொடியன்": "சிறுவன்",
|
||||
"பொட்டை": "சிறுமி",
|
||||
"பிறகு": "பின்பு",
|
||||
"டக்கென்டு": "விரைவாக",
|
||||
"கெதியா": "விரைவாக",
|
||||
"கிறுகி": "திரும்பி",
|
||||
"போயித்து வாறன்": "போய் வருகிறேன்",
|
||||
"வருவாங்களா": "வருவார்களா",
|
||||
# regular spokens
|
||||
"சொல்லு": "சொல்",
|
||||
"கேளு": "கேள்",
|
||||
"சொல்லுங்க": "சொல்லுங்கள்",
|
||||
"கேளுங்க": "கேளுங்கள்",
|
||||
"நீங்கள்": "நீ",
|
||||
"உன்": "உன்னுடைய",
|
||||
# Portugeese formal words
|
||||
"அலவாங்கு": "கடப்பாரை",
|
||||
"ஆசுப்பத்திரி": "மருத்துவமனை",
|
||||
"உரோதை": "சில்லு",
|
||||
"கடுதாசி": "கடிதம்",
|
||||
"கதிரை": "நாற்காலி",
|
||||
"குசினி": "அடுக்களை",
|
||||
"கோப்பை": "கிண்ணம்",
|
||||
"சப்பாத்து": "காலணி",
|
||||
"தாச்சி": "இரும்புச் சட்டி",
|
||||
"துவாய்": "துவாலை",
|
||||
"தவறணை": "மதுக்கடை",
|
||||
"பீப்பா": "மரத்தாழி",
|
||||
"யன்னல்": "சாளரம்",
|
||||
"வாங்கு": "மரஇருக்கை",
|
||||
# Dutch formal words
|
||||
"இறாக்கை": "பற்சட்டம்",
|
||||
"இலாட்சி": "இழுப்பறை",
|
||||
"கந்தோர்": "பணிமனை",
|
||||
"நொத்தாரிசு": "ஆவண எழுத்துபதிவாளர்",
|
||||
# English formal words
|
||||
"இஞ்சினியர்": "பொறியியலாளர்",
|
||||
"சூப்பு": "ரசம்",
|
||||
"செக்": "காசோலை",
|
||||
"சேட்டு": "மேற்ச்சட்டை",
|
||||
"மார்க்கட்டு": "சந்தை",
|
||||
"விண்ணன்": "கெட்டிக்காரன்",
|
||||
# Arabic formal words
|
||||
"ஈமான்": "நம்பிக்கை",
|
||||
"சுன்னத்து": "விருத்தசேதனம்",
|
||||
"செய்த்தான்": "பிசாசு",
|
||||
"மவுத்து": "இறப்பு",
|
||||
"ஹலால்": "அங்கீகரிக்கப்பட்டது",
|
||||
"கறாம்": "நிராகரிக்கப்பட்டது",
|
||||
# Persian, Hindustanian and hindi formal words
|
||||
"சுமார்": "கிட்டத்தட்ட",
|
||||
"சிப்பாய்": "போர்வீரன்",
|
||||
"சிபார்சு": "சிபாரிசு",
|
||||
"ஜமீன்": "பணக்காரா்",
|
||||
"அசல்": "மெய்யான",
|
||||
"அந்தஸ்து": "கௌரவம்",
|
||||
"ஆஜர்": "சமா்ப்பித்தல்",
|
||||
"உசார்": "எச்சரிக்கை",
|
||||
"அச்சா": "நல்ல",
|
||||
# English words used in text conversations
|
||||
"bcoz": "ஏனெனில்",
|
||||
"bcuz": "ஏனெனில்",
|
||||
"fav": "விருப்பமான",
|
||||
"morning": "காலை வணக்கம்",
|
||||
"gdeveng": "மாலை வணக்கம்",
|
||||
"gdnyt": "இரவு வணக்கம்",
|
||||
"gdnit": "இரவு வணக்கம்",
|
||||
"plz": "தயவு செய்து",
|
||||
"pls": "தயவு செய்து",
|
||||
"thx": "நன்றி",
|
||||
"thanx": "நன்றி",
|
||||
}
|
||||
|
||||
NORM_EXCEPTIONS = {}
|
||||
|
||||
for string, norm in _exc.items():
|
||||
NORM_EXCEPTIONS[string] = norm
|
|
@ -4,14 +4,12 @@ from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .norm_exceptions import NORM_EXCEPTIONS
from .lex_attrs import LEX_ATTRS

from ..norm_exceptions import BASE_NORMS
from ...attrs import LANG, NORM
from ...attrs import LANG
from ...language import Language
from ...tokens import Doc
from ...util import DummyTokenizer, add_lookups
from ...util import DummyTokenizer


class ThaiTokenizer(DummyTokenizer):

@ -37,9 +35,6 @@ class ThaiDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda _text: "th"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
    )
    tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS)
    tag_map = TAG_MAP
    stop_words = STOP_WORDS

@ -1,113 +0,0 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
_exc = {
|
||||
# Conjugation and Diversion invalid to Tonal form (ผันอักษรและเสียงไม่ตรงกับรูปวรรณยุกต์)
|
||||
"สนุ๊กเกอร์": "สนุกเกอร์",
|
||||
"โน้ต": "โน้ต",
|
||||
# Misspelled because of being lazy or hustle (สะกดผิดเพราะขี้เกียจพิมพ์ หรือเร่งรีบ)
|
||||
"โทสับ": "โทรศัพท์",
|
||||
"พุ่งนี้": "พรุ่งนี้",
|
||||
# Strange (ให้ดูแปลกตา)
|
||||
"ชะมะ": "ใช่ไหม",
|
||||
"ชิมิ": "ใช่ไหม",
|
||||
"ชะ": "ใช่ไหม",
|
||||
"ช่ายมะ": "ใช่ไหม",
|
||||
"ป่าว": "เปล่า",
|
||||
"ป่ะ": "เปล่า",
|
||||
"ปล่าว": "เปล่า",
|
||||
"คัย": "ใคร",
|
||||
"ไค": "ใคร",
|
||||
"คราย": "ใคร",
|
||||
"เตง": "ตัวเอง",
|
||||
"ตะเอง": "ตัวเอง",
|
||||
"รึ": "หรือ",
|
||||
"เหรอ": "หรือ",
|
||||
"หรา": "หรือ",
|
||||
"หรอ": "หรือ",
|
||||
"ชั้น": "ฉัน",
|
||||
"ชั้ล": "ฉัน",
|
||||
"ช้าน": "ฉัน",
|
||||
"เทอ": "เธอ",
|
||||
"เทอร์": "เธอ",
|
||||
"เทอว์": "เธอ",
|
||||
"แกร": "แก",
|
||||
"ป๋ม": "ผม",
|
||||
"บ่องตง": "บอกตรงๆ",
|
||||
"ถ่ามตง": "ถามตรงๆ",
|
||||
"ต่อมตง": "ตอบตรงๆ",
|
||||
"เพิ่ล": "เพื่อน",
|
||||
"จอบอ": "จอบอ",
|
||||
"ดั้ย": "ได้",
|
||||
"ขอบคุง": "ขอบคุณ",
|
||||
"ยังงัย": "ยังไง",
|
||||
"Inw": "เทพ",
|
||||
"uou": "นอน",
|
||||
"Lกรีeu": "เกรียน",
|
||||
# Misspelled to express emotions (คำที่สะกดผิดเพื่อแสดงอารมณ์)
|
||||
"เปงราย": "เป็นอะไร",
|
||||
"เปนรัย": "เป็นอะไร",
|
||||
"เปงรัย": "เป็นอะไร",
|
||||
"เป็นอัลไล": "เป็นอะไร",
|
||||
"ทามมาย": "ทำไม",
|
||||
"ทามมัย": "ทำไม",
|
||||
"จังรุย": "จังเลย",
|
||||
"จังเยย": "จังเลย",
|
||||
"จุงเบย": "จังเลย",
|
||||
"ไม่รู้": "มะรุ",
|
||||
"เฮ่ย": "เฮ้ย",
|
||||
"เห้ย": "เฮ้ย",
|
||||
"น่าร็อค": "น่ารัก",
|
||||
"น่าร๊าก": "น่ารัก",
|
||||
"ตั้ลล๊าก": "น่ารัก",
|
||||
"คือร๊ะ": "คืออะไร",
|
||||
"โอป่ะ": "โอเคหรือเปล่า",
|
||||
"น่ามคาน": "น่ารำคาญ",
|
||||
"น่ามสาร": "น่าสงสาร",
|
||||
"วงวาร": "สงสาร",
|
||||
"บับว่า": "แบบว่า",
|
||||
"อัลไล": "อะไร",
|
||||
"อิจ": "อิจฉา",
|
||||
# Reduce rough words or Avoid to software filter (คำที่สะกดผิดเพื่อลดความหยาบของคำ หรืออาจใช้หลีกเลี่ยงการกรองคำหยาบของซอฟต์แวร์)
|
||||
"กรู": "กู",
|
||||
"กุ": "กู",
|
||||
"กรุ": "กู",
|
||||
"ตู": "กู",
|
||||
"ตรู": "กู",
|
||||
"มรึง": "มึง",
|
||||
"เมิง": "มึง",
|
||||
"มืง": "มึง",
|
||||
"มุง": "มึง",
|
||||
"สาด": "สัตว์",
|
||||
"สัส": "สัตว์",
|
||||
"สัก": "สัตว์",
|
||||
"แสรด": "สัตว์",
|
||||
"โคโตะ": "โคตร",
|
||||
"โคด": "โคตร",
|
||||
"โครต": "โคตร",
|
||||
"โคตะระ": "โคตร",
|
||||
"พ่อง": "พ่อมึง",
|
||||
"แม่เมิง": "แม่มึง",
|
||||
"เชี่ย": "เหี้ย",
|
||||
# Imitate words (คำเลียนเสียง โดยส่วนใหญ่จะเพิ่มทัณฑฆาต หรือซ้ำตัวอักษร)
|
||||
"แอร๊ยย": "อ๊าย",
|
||||
"อร๊ายยย": "อ๊าย",
|
||||
"มันส์": "มัน",
|
||||
"วู๊วววววววว์": "วู้",
|
||||
# Acronym (แบบคำย่อ)
|
||||
"หมาลัย": "มหาวิทยาลัย",
|
||||
"วิดวะ": "วิศวะ",
|
||||
"สินสาด ": "ศิลปศาสตร์",
|
||||
"สินกำ ": "ศิลปกรรมศาสตร์",
|
||||
"เสารีย์ ": "อนุเสาวรีย์ชัยสมรภูมิ",
|
||||
"เมกา ": "อเมริกา",
|
||||
"มอไซค์ ": "มอเตอร์ไซค์",
|
||||
}
|
||||
|
||||
|
||||
NORM_EXCEPTIONS = {}
|
||||
|
||||
for string, norm in _exc.items():
|
||||
NORM_EXCEPTIONS[string] = norm
|
||||
NORM_EXCEPTIONS[string.title()] = norm
|
|
@ -28,10 +28,11 @@ from .compat import izip, basestring_, is_python2, class_types
from .gold import GoldParse
from .scorer import Scorer
from ._ml import link_vectors_to_models, create_default_optimizer
from .attrs import IS_STOP, LANG
from .attrs import IS_STOP, LANG, NORM
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES
from .lang.tokenizer_exceptions import TOKEN_MATCH
from .lang.norm_exceptions import BASE_NORMS
from .lang.tag_map import TAG_MAP
from .tokens import Doc
from .lang.lex_attrs import LEX_ATTRS, is_stop

@ -77,6 +78,9 @@ class BaseDefaults(object):
            lemmatizer=lemmatizer,
            lookups=lookups,
        )
        vocab.lex_attr_getters[NORM] = util.add_lookups(
            vocab.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), BASE_NORMS, vocab.lookups.get_table("lexeme_norm")
        )
        for tag_str, exc in cls.morph_rules.items():
            for orth_str, attrs in exc.items():
                vocab.morphology.add_special_case(tag_str, orth_str, attrs)

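The practical effect of this hunk: the per-language NORM_EXCEPTIONS modules are gone, and the NORM getter is composed when the vocab is created, from the base norms plus whatever "lexeme_norm" table the lookups provide (shipped separately in spacy-lookups-data). A rough pure-Python sketch of that composition, not the commit's actual util.add_lookups implementation, with toy data standing in for the real tables:

# Sketch only: illustrates the fallback chain built by the add_lookups call above.
BASE_NORMS = {"'cause": "because"}   # assumed base exception, shared across languages
lexeme_norm = {"daß": "dass"}        # assumed language-specific table from lookups

def make_norm_getter(default, *tables):
    # Tables are consulted in the order they are passed; the default
    # getter is the final fallback.
    def norm(string):
        for table in tables:
            if string in table:
                return table[string]
        return default(string)
    return norm

norm = make_norm_getter(lambda s: s.lower(), BASE_NORMS, lexeme_norm)
print(norm("daß"))      # "dass"
print(norm("'cause"))   # "because"
print(norm("Dogs"))     # "dogs"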
@ -1,8 +1,8 @@
from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t
from .attrs cimport attr_id_t
from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, LANG
from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, LANG

from .structs cimport LexemeC, SerializedLexemeC
from .structs cimport LexemeC
from .strings cimport StringStore
from .vocab cimport Vocab


@ -24,22 +24,6 @@ cdef class Lexeme:
        self.vocab = vocab
        self.orth = lex.orth

    @staticmethod
    cdef inline SerializedLexemeC c_to_bytes(const LexemeC* lex) nogil:
        cdef SerializedLexemeC lex_data
        buff = <const unsigned char*>&lex.flags
        end = <const unsigned char*>&lex.sentiment + sizeof(lex.sentiment)
        for i in range(sizeof(lex_data.data)):
            lex_data.data[i] = buff[i]
        return lex_data

    @staticmethod
    cdef inline void c_from_bytes(LexemeC* lex, SerializedLexemeC lex_data) nogil:
        buff = <unsigned char*>&lex.flags
        end = <unsigned char*>&lex.sentiment + sizeof(lex.sentiment)
        for i in range(sizeof(lex_data.data)):
            buff[i] = lex_data.data[i]

    @staticmethod
    cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil:
        if name < (sizeof(flags_t) * 8):

@ -56,8 +40,6 @@ cdef class Lexeme:
            lex.prefix = value
        elif name == SUFFIX:
            lex.suffix = value
        elif name == CLUSTER:
            lex.cluster = value
        elif name == LANG:
            lex.lang = value

@ -84,8 +66,6 @@ cdef class Lexeme:
            return lex.suffix
        elif feat_name == LENGTH:
            return lex.length
        elif feat_name == CLUSTER:
            return lex.cluster
        elif feat_name == LANG:
            return lex.lang
        else:

@ -17,7 +17,7 @@ from .typedefs cimport attr_t, flags_t
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
from .attrs cimport IS_CURRENCY, IS_OOV, PROB
from .attrs cimport IS_CURRENCY

from .attrs import intify_attrs
from .errors import Errors, Warnings

@ -89,12 +89,11 @@ cdef class Lexeme:
        cdef attr_id_t attr
        attrs = intify_attrs(attrs)
        for attr, value in attrs.items():
            if attr == PROB:
                self.c.prob = value
            elif attr == CLUSTER:
                self.c.cluster = int(value)
            elif isinstance(value, int) or isinstance(value, long):
                Lexeme.set_struct_attr(self.c, attr, value)
            # skip PROB, e.g. from lexemes.jsonl
            if isinstance(value, float):
                continue
            elif isinstance(value, (int, long)):
                Lexeme.set_struct_attr(self.c, attr, value)
            else:
                Lexeme.set_struct_attr(self.c, attr, self.vocab.strings.add(value))

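Lexeme.set_attrs no longer special-cases PROB or CLUSTER; any float value (for example a "prob" field coming from a lexemes.jsonl) is simply skipped, so it is never pushed through the StringStore. A rough pure-Python sketch of that filtering rule, using a made-up attrs dict for illustration:

# Hypothetical illustration of the new rule: floats are dropped, ints are
# stored directly, everything else would be interned as a string first.
attrs = {"orth": "dog", "prob": -8.5, "norm": "dog", "length": 3}

kept = {}
for name, value in attrs.items():
    if isinstance(value, float):
        continue              # e.g. "prob" from lexemes.jsonl is ignored here
    elif isinstance(value, int):
        kept[name] = value    # numeric struct attributes
    else:
        kept[name] = value    # strings would go through the StringStore

print(kept)  # {'orth': 'dog', 'norm': 'dog', 'length': 3}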
@ -137,34 +136,6 @@ cdef class Lexeme:
        xp = get_array_module(vector)
        return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))

    def to_bytes(self):
        lex_data = Lexeme.c_to_bytes(self.c)
        start = <const char*>&self.c.flags
        end = <const char*>&self.c.sentiment + sizeof(self.c.sentiment)
        if (end-start) != sizeof(lex_data.data):
            raise ValueError(Errors.E072.format(length=end-start,
                                                bad_length=sizeof(lex_data.data)))
        byte_string = b"\0" * sizeof(lex_data.data)
        byte_chars = <char*>byte_string
        for i in range(sizeof(lex_data.data)):
            byte_chars[i] = lex_data.data[i]
        if len(byte_string) != sizeof(lex_data.data):
            raise ValueError(Errors.E072.format(length=len(byte_string),
                                                bad_length=sizeof(lex_data.data)))
        return byte_string

    def from_bytes(self, bytes byte_string):
        # This method doesn't really have a use-case --- wrote it for testing.
        # Possibly delete? It puts the Lexeme out of synch with the vocab.
        cdef SerializedLexemeC lex_data
        if len(byte_string) != sizeof(lex_data.data):
            raise ValueError(Errors.E072.format(length=len(byte_string),
                                                bad_length=sizeof(lex_data.data)))
        for i in range(len(byte_string)):
            lex_data.data[i] = byte_string[i]
        Lexeme.c_from_bytes(self.c, lex_data)
        self.orth = self.c.orth

    @property
    def has_vector(self):
        """RETURNS (bool): Whether a word vector is associated with the object.

@ -208,10 +179,14 @@ cdef class Lexeme:
        """RETURNS (float): A scalar value indicating the positivity or
        negativity of the lexeme."""
        def __get__(self):
            return self.c.sentiment
            sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {})
            return sentiment_table.get(self.c.orth, 0.0)

        def __set__(self, float sentiment):
            self.c.sentiment = sentiment
        def __set__(self, float x):
            if "lexeme_sentiment" not in self.vocab.lookups:
                self.vocab.lookups.add_table("lexeme_sentiment")
            sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment")
            sentiment_table[self.c.orth] = x

    @property
    def orth_(self):

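Sentiment is no longer stored on the LexemeC struct; it lives in a "lexeme_sentiment" lookups table keyed by the lexeme's orth. A small usage sketch (not part of the commit), assuming a blank English pipeline built from this branch:

import spacy

nlp = spacy.blank("en")
lex = nlp.vocab["awesome"]

lex.sentiment = 0.9    # creates the "lexeme_sentiment" table on first write
print(lex.sentiment)                             # 0.9, read back from the lookups table
print("lexeme_sentiment" in nlp.vocab.lookups)   # True
print(nlp.vocab["meh"].sentiment)                # 0.0 default for unknown entries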
@ -238,9 +213,13 @@ cdef class Lexeme:
        lexeme text.
        """
        def __get__(self):
            return self.c.norm
            return self.c.norm

        def __set__(self, attr_t x):
            if "lexeme_norm" not in self.vocab.lookups:
                self.vocab.lookups.add_table("lexeme_norm")
            norm_table = self.vocab.lookups.get_table("lexeme_norm")
            norm_table[self.c.orth] = self.vocab.strings[x]
            self.c.norm = x

    property shape:

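NORM is the one attribute that now lives in two places: setting it writes the string into the vocab's "lexeme_norm" table (so it survives serialization via lookups) and also updates the norm field on the struct. A quick sketch, not part of the commit:

import spacy

nlp = spacy.blank("en")
lex = nlp.vocab["daß"]

lex.norm_ = "dass"    # goes through the setter shown above
print(lex.norm_)                                   # "dass"
norm_table = nlp.vocab.lookups.get_table("lexeme_norm")
print(norm_table[lex.orth])                        # "dass", mirrored in the lookups table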
@ -276,10 +255,12 @@ cdef class Lexeme:
    property cluster:
        """RETURNS (int): Brown cluster ID."""
        def __get__(self):
            return self.c.cluster
            cluster_table = self.vocab.load_extra_lookups("lexeme_cluster")
            return cluster_table.get(self.c.orth, 0)

        def __set__(self, attr_t x):
            self.c.cluster = x
        def __set__(self, int x):
            cluster_table = self.vocab.load_extra_lookups("lexeme_cluster")
            cluster_table[self.c.orth] = x

    property lang:
        """RETURNS (uint64): Language of the parent vocabulary."""

@ -293,10 +274,14 @@ cdef class Lexeme:
        """RETURNS (float): Smoothed log probability estimate of the lexeme's
        type."""
        def __get__(self):
            return self.c.prob
            prob_table = self.vocab.load_extra_lookups("lexeme_prob")
            settings_table = self.vocab.load_extra_lookups("lexeme_settings")
            default_oov_prob = settings_table.get("oov_prob", -20.0)
            return prob_table.get(self.c.orth, default_oov_prob)

        def __set__(self, float x):
            self.c.prob = x
            prob_table = self.vocab.load_extra_lookups("lexeme_prob")
            prob_table[self.c.orth] = x

    property lower_:
        """RETURNS (unicode): Lowercase form of the word."""

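cluster and prob are no longer stored on the lexeme at all: both go through Vocab.load_extra_lookups(), which lazily pulls the "lexeme_cluster" / "lexeme_prob" / "lexeme_settings" tables from the lookups registry if a matching entry is registered, and otherwise falls back to an empty table. A hedged sketch of what that means in practice, not part of the commit:

import spacy

nlp = spacy.blank("en")
lex = nlp.vocab["the"]

# Without spacy-lookups-data installed, the extra tables start out empty,
# so the defaults apply: cluster 0 and the OOV probability (-20.0 unless a
# "lexeme_settings" table overrides "oov_prob").
print(lex.cluster)   # 0
print(lex.prob)      # -20.0

# Writing stores the value in the lazily created extra table.
lex.prob = -8.5
print(nlp.vocab.lookups_extra.get_table("lexeme_prob")[lex.orth])  # -8.5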
@ -314,7 +299,7 @@ cdef class Lexeme:
            return self.vocab.strings[self.c.norm]

        def __set__(self, unicode x):
            self.c.norm = self.vocab.strings.add(x)
            self.norm = self.vocab.strings.add(x)

    property shape_:
        """RETURNS (unicode): Transform of the word's string, to show

@ -362,13 +347,10 @@ cdef class Lexeme:
        def __set__(self, flags_t x):
            self.c.flags = x

    property is_oov:
    @property
    def is_oov(self):
        """RETURNS (bool): Whether the lexeme is out-of-vocabulary."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_OOV)

        def __set__(self, attr_t x):
            Lexeme.c_set_flag(self.c, IS_OOV, x)
        return self.orth in self.vocab.vectors

    property is_stop:
        """RETURNS (bool): Whether the lexeme is a stop word."""

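is_oov stops being a settable lexeme flag and becomes read-only: in this revision it simply mirrors whether the lexeme's orth is present in the vocab's vectors table, which is exactly what the test_vector_is_oov test added further down asserts. A small sketch along the lines of that test (not part of the commit); note the polarity matches the new test, where a lexeme with a vector reports is_oov as True:

import numpy
from spacy.vocab import Vocab

vocab = Vocab(vectors_name="demo_vectors")
data = numpy.zeros((2, 3), dtype="f")
vocab.set_vector("cat", data[0])

# After this commit, Lexeme.is_oov just checks membership in vocab.vectors.
print(vocab["cat"].is_oov)      # True  (has a vector entry)
print(vocab["hamster"].is_oov)  # False (no vector entry)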
@ -124,7 +124,7 @@ class Lookups(object):
        self._tables[key].update(value)
        return self

    def to_disk(self, path, **kwargs):
    def to_disk(self, path, filename="lookups.bin", **kwargs):
        """Save the lookups to a directory as lookups.bin. Expects a path to a
        directory, which will be created if it doesn't exist.

@ -136,11 +136,11 @@ class Lookups(object):
        path = ensure_path(path)
        if not path.exists():
            path.mkdir()
        filepath = path / "lookups.bin"
        filepath = path / filename
        with filepath.open("wb") as file_:
            file_.write(self.to_bytes())

    def from_disk(self, path, **kwargs):
    def from_disk(self, path, filename="lookups.bin", **kwargs):
        """Load lookups from a directory containing a lookups.bin. Will skip
        loading if the file doesn't exist.

@ -150,7 +150,7 @@ class Lookups(object):
        DOCS: https://spacy.io/api/lookups#from_disk
        """
        path = ensure_path(path)
        filepath = path / "lookups.bin"
        filepath = path / filename
        if filepath.exists():
            with filepath.open("rb") as file_:
                data = file_.read()

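The new filename parameter exists so the vocab can keep two lookups containers side by side in one model directory: the regular tables in lookups.bin and the optional extra tables in lookups_extra.bin. A small round-trip sketch (not part of the commit) using a temporary directory:

import tempfile
from spacy.lookups import Lookups

lookups = Lookups()
lookups.add_table("lexeme_cluster", {"dog": 5})

with tempfile.TemporaryDirectory() as tmpdir:
    # Written as lookups_extra.bin instead of the default lookups.bin.
    lookups.to_disk(tmpdir, filename="lookups_extra.bin")

    reloaded = Lookups()
    reloaded.from_disk(tmpdir, filename="lookups_extra.bin")
    print(reloaded.get_table("lexeme_cluster")["dog"])  # 5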
@ -23,29 +23,6 @@ cdef struct LexemeC:
    attr_t prefix
    attr_t suffix

    attr_t cluster

    float prob
    float sentiment


cdef struct SerializedLexemeC:
    unsigned char[8 + 8*10 + 4 + 4] data
    # sizeof(flags_t)  # flags
    # + sizeof(attr_t)  # lang
    # + sizeof(attr_t)  # id
    # + sizeof(attr_t)  # length
    # + sizeof(attr_t)  # orth
    # + sizeof(attr_t)  # lower
    # + sizeof(attr_t)  # norm
    # + sizeof(attr_t)  # shape
    # + sizeof(attr_t)  # prefix
    # + sizeof(attr_t)  # suffix
    # + sizeof(attr_t)  # cluster
    # + sizeof(float)  # prob
    # + sizeof(float)  # cluster
    # + sizeof(float)  # l2_norm


cdef struct SpanC:
    hash_t id

@ -12,7 +12,7 @@ cdef enum symbol_t:
    LIKE_NUM
    LIKE_EMAIL
    IS_STOP
    IS_OOV
    IS_OOV_DEPRECATED
    IS_BRACKET
    IS_QUOTE
    IS_LEFT_PUNCT

@ -17,7 +17,7 @@ IDS = {
    "LIKE_NUM": LIKE_NUM,
    "LIKE_EMAIL": LIKE_EMAIL,
    "IS_STOP": IS_STOP,
    "IS_OOV": IS_OOV,
    "IS_OOV_DEPRECATED": IS_OOV_DEPRECATED,
    "IS_BRACKET": IS_BRACKET,
    "IS_QUOTE": IS_QUOTE,
    "IS_LEFT_PUNCT": IS_LEFT_PUNCT,

@ -37,14 +37,6 @@ def test_da_tokenizer_handles_custom_base_exc(da_tokenizer):
|
|||
assert tokens[7].text == "."
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text,norm", [("akvarium", "akvarie"), ("bedstemoder", "bedstemor")]
|
||||
)
|
||||
def test_da_tokenizer_norm_exceptions(da_tokenizer, text, norm):
|
||||
tokens = da_tokenizer(text)
|
||||
assert tokens[0].norm_ == norm
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text,n_tokens",
|
||||
[
|
||||
|
|
|
@ -22,17 +22,3 @@ def test_de_tokenizer_handles_exc_in_text(de_tokenizer):
|
|||
assert len(tokens) == 6
|
||||
assert tokens[2].text == "z.Zt."
|
||||
assert tokens[2].lemma_ == "zur Zeit"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text,norms", [("vor'm", ["vor", "dem"]), ("du's", ["du", "es"])]
|
||||
)
|
||||
def test_de_tokenizer_norm_exceptions(de_tokenizer, text, norms):
|
||||
tokens = de_tokenizer(text)
|
||||
assert [token.norm_ for token in tokens] == norms
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text,norm", [("daß", "dass")])
|
||||
def test_de_lex_attrs_norm_exceptions(de_tokenizer, text, norm):
|
||||
tokens = de_tokenizer(text)
|
||||
assert tokens[0].norm_ == norm
|
||||
|
|
|
@ -118,6 +118,7 @@ def test_en_tokenizer_norm_exceptions(en_tokenizer, text, norms):
|
|||
assert [token.norm_ for token in tokens] == norms
|
||||
|
||||
|
||||
@pytest.mark.skip
|
||||
@pytest.mark.parametrize(
|
||||
"text,norm", [("radicalised", "radicalized"), ("cuz", "because")]
|
||||
)
|
||||
|
|
|
@ -22,9 +22,3 @@ def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer):
|
|||
assert len(tokens) == 9
|
||||
assert tokens[1].text == "'t"
|
||||
assert tokens[1].lemma_ == "et"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text,norm", [("dass", "datt"), ("viläicht", "vläicht")])
|
||||
def test_lb_norm_exceptions(lb_tokenizer, text, norm):
|
||||
tokens = lb_tokenizer(text)
|
||||
assert tokens[0].norm_ == norm
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
import pickle
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.strings import StringStore
|
||||
|
||||
|
@ -36,8 +37,8 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
|
|||
assert vocab1.to_bytes() == vocab1_b
|
||||
new_vocab1 = Vocab().from_bytes(vocab1_b)
|
||||
assert new_vocab1.to_bytes() == vocab1_b
|
||||
assert len(new_vocab1) == len(strings1)
|
||||
assert sorted([lex.text for lex in new_vocab1]) == sorted(strings1)
|
||||
assert len(new_vocab1.strings) == len(strings1) + 1 # adds _SP
|
||||
assert sorted([s for s in new_vocab1.strings]) == sorted(strings1 + ["_SP"])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("strings1,strings2", test_strings)
|
||||
|
@ -51,12 +52,13 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2):
|
|||
vocab2.to_disk(file_path2)
|
||||
vocab1_d = Vocab().from_disk(file_path1)
|
||||
vocab2_d = Vocab().from_disk(file_path2)
|
||||
assert list(vocab1_d) == list(vocab1)
|
||||
assert list(vocab2_d) == list(vocab2)
|
||||
# check strings rather than lexemes, which are only reloaded on demand
|
||||
assert strings1 == [s for s in vocab1_d.strings if s != "_SP"]
|
||||
assert strings2 == [s for s in vocab2_d.strings if s != "_SP"]
|
||||
if strings1 == strings2:
|
||||
assert list(vocab1_d) == list(vocab2_d)
|
||||
assert [s for s in vocab1_d.strings if s != "_SP"] == [s for s in vocab2_d.strings if s != "_SP"]
|
||||
else:
|
||||
assert list(vocab1_d) != list(vocab2_d)
|
||||
assert [s for s in vocab1_d.strings if s != "_SP"] != [s for s in vocab2_d.strings if s != "_SP"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
|
||||
|
@ -76,7 +78,7 @@ def test_deserialize_vocab_seen_entries(strings, lex_attr):
|
|||
vocab = Vocab(strings=strings)
|
||||
length = len(vocab)
|
||||
vocab.from_bytes(vocab.to_bytes())
|
||||
assert len(vocab) == length
|
||||
assert len(vocab.strings) == len(strings) + 1 # adds _SP
|
||||
|
||||
|
||||
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
|
||||
|
@ -127,3 +129,11 @@ def test_serialize_stringstore_roundtrip_disk(strings1, strings2):
|
|||
assert list(sstore1_d) == list(sstore2_d)
|
||||
else:
|
||||
assert list(sstore1_d) != list(sstore2_d)
|
||||
|
||||
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
|
||||
def test_pickle_vocab(strings, lex_attr):
|
||||
vocab = Vocab(strings=strings)
|
||||
vocab[strings[0]].norm_ = lex_attr
|
||||
vocab_pickled = pickle.dumps(vocab)
|
||||
vocab_unpickled = pickle.loads(vocab_pickled)
|
||||
assert vocab.to_bytes() == vocab_unpickled.to_bytes()
|
||||
|
|
|
@ -26,7 +26,7 @@ def test_lemmatizer_reflects_lookups_changes():
|
|||
nlp_bytes = nlp.to_bytes()
|
||||
new_nlp.from_bytes(nlp_bytes)
|
||||
# Make sure we have the previously saved lookup table
|
||||
assert len(new_nlp.vocab.lookups) == 1
|
||||
assert "lemma_lookup" in new_nlp.vocab.lookups
|
||||
assert len(new_nlp.vocab.lookups.get_table("lemma_lookup")) == 2
|
||||
assert new_nlp.vocab.lookups.get_table("lemma_lookup")["hello"] == "world"
|
||||
assert Doc(new_nlp.vocab, words=["foo"])[0].lemma_ == "bar"
|
||||
|
|
|
@ -60,19 +60,6 @@ def test_vocab_lexeme_add_flag_provided_id(en_vocab):
|
|||
assert en_vocab["dogs"].check_flag(is_len4) is True
|
||||
|
||||
|
||||
def test_lexeme_bytes_roundtrip(en_vocab):
|
||||
one = en_vocab["one"]
|
||||
alpha = en_vocab["alpha"]
|
||||
assert one.orth != alpha.orth
|
||||
assert one.lower != alpha.lower
|
||||
alpha.from_bytes(one.to_bytes())
|
||||
|
||||
assert one.orth_ == alpha.orth_
|
||||
assert one.orth == alpha.orth
|
||||
assert one.lower == alpha.lower
|
||||
assert one.lower_ == alpha.lower_
|
||||
|
||||
|
||||
def test_vocab_lexeme_oov_rank(en_vocab):
|
||||
"""Test that default rank is OOV_RANK."""
|
||||
lex = en_vocab["word"]
|
||||
|
|
|
@ -119,12 +119,11 @@ def test_lookups_to_from_bytes_via_vocab():
|
|||
table_name = "test"
|
||||
vocab = Vocab()
|
||||
vocab.lookups.add_table(table_name, {"foo": "bar", "hello": "world"})
|
||||
assert len(vocab.lookups) == 1
|
||||
assert table_name in vocab.lookups
|
||||
vocab_bytes = vocab.to_bytes()
|
||||
new_vocab = Vocab()
|
||||
new_vocab.from_bytes(vocab_bytes)
|
||||
assert len(new_vocab.lookups) == 1
|
||||
assert len(new_vocab.lookups) == len(vocab.lookups)
|
||||
assert table_name in new_vocab.lookups
|
||||
table = new_vocab.lookups.get_table(table_name)
|
||||
assert len(table) == 2
|
||||
|
@ -137,13 +136,12 @@ def test_lookups_to_from_disk_via_vocab():
|
|||
table_name = "test"
|
||||
vocab = Vocab()
|
||||
vocab.lookups.add_table(table_name, {"foo": "bar", "hello": "world"})
|
||||
assert len(vocab.lookups) == 1
|
||||
assert table_name in vocab.lookups
|
||||
with make_tempdir() as tmpdir:
|
||||
vocab.to_disk(tmpdir)
|
||||
new_vocab = Vocab()
|
||||
new_vocab.from_disk(tmpdir)
|
||||
assert len(new_vocab.lookups) == 1
|
||||
assert len(new_vocab.lookups) == len(vocab.lookups)
|
||||
assert table_name in new_vocab.lookups
|
||||
table = new_vocab.lookups.get_table(table_name)
|
||||
assert len(table) == 2
|
||||
|
|
|
@ -329,3 +329,15 @@ def test_vocab_prune_vectors():
|
|||
neighbour, similarity = list(remap.values())[0]
|
||||
assert neighbour == "cat", remap
|
||||
assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-4, rtol=1e-3)
|
||||
|
||||
|
||||
def test_vector_is_oov():
|
||||
vocab = Vocab(vectors_name="test_vocab_is_oov")
|
||||
data = numpy.ndarray((5, 3), dtype="f")
|
||||
data[0] = 1.0
|
||||
data[1] = 2.0
|
||||
vocab.set_vector("cat", data[0])
|
||||
vocab.set_vector("dog", data[1])
|
||||
assert vocab["cat"].is_oov is True
|
||||
assert vocab["dog"].is_oov is True
|
||||
assert vocab["hamster"].is_oov is False
|
||||
|
|
|
@ -17,7 +17,7 @@ from ..typedefs cimport hash_t
from ..lexeme cimport Lexeme
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM, LIKE_EMAIL
from ..attrs cimport IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM, LIKE_EMAIL
from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX
from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP
from ..symbols cimport conj

@ -259,7 +259,7 @@ cdef class Token:
    @property
    def prob(self):
        """RETURNS (float): Smoothed log probability estimate of token type."""
        return self.c.lex.prob
        return self.vocab[self.c.lex.orth].prob

    @property
    def sentiment(self):

@ -267,7 +267,7 @@ cdef class Token:
        negativity of the token."""
        if "sentiment" in self.doc.user_token_hooks:
            return self.doc.user_token_hooks["sentiment"](self)
        return self.c.lex.sentiment
        return self.vocab[self.c.lex.orth].sentiment

    @property
    def lang(self):

@ -286,7 +286,7 @@ cdef class Token:
    @property
    def cluster(self):
        """RETURNS (int): Brown cluster ID."""
        return self.c.lex.cluster
        return self.vocab[self.c.lex.orth].cluster

    @property
    def orth(self):

@ -923,7 +923,7 @@ cdef class Token:
    @property
    def is_oov(self):
        """RETURNS (bool): Whether the token is out-of-vocabulary."""
        return Lexeme.c_check_flag(self.c.lex, IS_OOV)
        return self.c.lex.orth in self.vocab.vectors

    @property
    def is_stop(self):

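Because the struct no longer carries prob, cluster or sentiment, the corresponding Token properties delegate to the parent vocab's Lexeme (and therefore to the lookups tables). A quick sketch, not part of the commit, showing that a change made via the vocab is visible on tokens:

import spacy

nlp = spacy.blank("en")
doc = nlp("the cat sat")

nlp.vocab["cat"].cluster = 123     # stored in the "lexeme_cluster" extra table
nlp.vocab["cat"].sentiment = 0.5   # stored in the "lexeme_sentiment" table

token = doc[1]
print(token.cluster)    # 123, delegated to vocab["cat"].cluster
print(token.sentiment)  # 0.5, delegated to vocab["cat"].sentiment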
@ -30,6 +30,7 @@ cdef class Vocab:
    cpdef public Morphology morphology
    cpdef public object vectors
    cpdef public object lookups
    cpdef public object lookups_extra
    cdef readonly int length
    cdef public object data_dir
    cdef public object lex_attr_getters

spacy/vocab.pyx
@ -11,8 +11,7 @@ from .lexeme cimport EMPTY_LEXEME, OOV_RANK
from .lexeme cimport Lexeme
from .typedefs cimport attr_t
from .tokens.token cimport Token
from .attrs cimport PROB, LANG, ORTH, TAG, POS
from .structs cimport SerializedLexemeC
from .attrs cimport LANG, ORTH, TAG, POS

from .compat import copy_reg, basestring_
from .errors import Errors

@ -22,6 +21,8 @@ from .vectors import Vectors
from ._ml import link_vectors_to_models
from .lookups import Lookups
from . import util
from .lang.norm_exceptions import BASE_NORMS
from .lang.lex_attrs import LEX_ATTRS


cdef class Vocab:

@ -32,8 +33,8 @@ cdef class Vocab:
    DOCS: https://spacy.io/api/vocab
    """
    def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
                 strings=tuple(), lookups=None, oov_prob=-20., vectors_name=None,
                 **deprecated_kwargs):
                 strings=tuple(), lookups=None, lookups_extra=None,
                 oov_prob=-20., vectors_name=None, **deprecated_kwargs):
        """Create the vocabulary.

        lex_attr_getters (dict): A dictionary mapping attribute IDs to

@ -44,6 +45,7 @@ cdef class Vocab:
        strings (StringStore): StringStore that maps strings to integers, and
            vice versa.
        lookups (Lookups): Container for large lookup tables and dictionaries.
        lookups_extra (Lookups): Container for optional lookup tables and dictionaries.
        name (unicode): Optional name to identify the vectors table.
        RETURNS (Vocab): The newly constructed object.
        """

@ -51,8 +53,12 @@ cdef class Vocab:
        tag_map = tag_map if tag_map is not None else {}
        if lookups in (None, True, False):
            lookups = Lookups()
        if "lexeme_norm" not in lookups:
            lookups.add_table("lexeme_norm")
        if lemmatizer in (None, True, False):
            lemmatizer = Lemmatizer(lookups)
        if lookups_extra in (None, True, False):
            lookups_extra = Lookups()
        self.cfg = {'oov_prob': oov_prob}
        self.mem = Pool()
        self._by_orth = PreshMap()

@ -65,6 +71,7 @@ cdef class Vocab:
        self.morphology = Morphology(self.strings, tag_map, lemmatizer)
        self.vectors = Vectors(name=vectors_name)
        self.lookups = lookups
        self.lookups_extra = lookups_extra

    @property
    def lang(self):

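Vocab now always owns a "lexeme_norm" table and an extra Lookups container, even when nothing is passed in, so downstream code can rely on both existing. A small sketch of the constructor behaviour (not part of the commit):

from spacy.vocab import Vocab
from spacy.lookups import Lookups

# Default construction: lookups and lookups_extra are created automatically,
# and the "lexeme_norm" table is guaranteed to exist.
vocab = Vocab()
print("lexeme_norm" in vocab.lookups)   # True
print(len(vocab.lookups_extra))         # 0, empty container

# Passing pre-built containers is also supported.
extra = Lookups()
extra.add_table("lexeme_cluster", {"dog": 5})
vocab = Vocab(lookups_extra=extra)
print("lexeme_cluster" in vocab.lookups_extra)  # True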
@ -173,9 +180,7 @@ cdef class Vocab:
                value = func(string)
                if isinstance(value, unicode):
                    value = self.strings.add(value)
                if attr == PROB:
                    lex.prob = value
                elif value is not None:
                if value is not None:
                    Lexeme.set_struct_attr(lex, attr, value)
        if not is_oov:
            self._add_lex_to_vocab(lex.orth, lex)

@ -435,17 +440,16 @@ cdef class Vocab:
        path = util.ensure_path(path)
        if not path.exists():
            path.mkdir()
        setters = ["strings", "lexemes", "vectors"]
        setters = ["strings", "vectors"]
        exclude = util.get_serialization_exclude(setters, exclude, kwargs)
        if "strings" not in exclude:
            self.strings.to_disk(path / "strings.json")
        if "lexemes" not in exclude:
            with (path / "lexemes.bin").open("wb") as file_:
                file_.write(self.lexemes_to_bytes())
        if "vectors" not in "exclude" and self.vectors is not None:
            self.vectors.to_disk(path)
        if "lookups" not in "exclude" and self.lookups is not None:
            self.lookups.to_disk(path)
        if "lookups_extra" not in "exclude" and self.lookups_extra is not None:
            self.lookups_extra.to_disk(path, filename="lookups_extra.bin")

    def from_disk(self, path, exclude=tuple(), **kwargs):
        """Loads state from a directory. Modifies the object in place and

@ -458,13 +462,10 @@ cdef class Vocab:
        DOCS: https://spacy.io/api/vocab#to_disk
        """
        path = util.ensure_path(path)
        getters = ["strings", "lexemes", "vectors"]
        getters = ["strings", "vectors"]
        exclude = util.get_serialization_exclude(getters, exclude, kwargs)
        if "strings" not in exclude:
            self.strings.from_disk(path / "strings.json")  # TODO: add exclude?
        if "lexemes" not in exclude:
            with (path / "lexemes.bin").open("rb") as file_:
                self.lexemes_from_bytes(file_.read())
        if "vectors" not in exclude:
            if self.vectors is not None:
                self.vectors.from_disk(path, exclude=["strings"])

@ -472,6 +473,14 @@ cdef class Vocab:
            link_vectors_to_models(self)
        if "lookups" not in exclude:
            self.lookups.from_disk(path)
        if "lookups_extra" not in exclude:
            self.lookups_extra.from_disk(path, filename="lookups_extra.bin")
        if "lexeme_norm" in self.lookups:
            self.lex_attr_getters[NORM] = util.add_lookups(
                self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), self.lookups.get_table("lexeme_norm")
            )
        self.length = 0
        self._by_orth = PreshMap()
        return self

    def to_bytes(self, exclude=tuple(), **kwargs):

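The visible effect of these two hunks: a saved model directory no longer contains lexemes.bin, and loading no longer rebuilds the whole lexeme cache; the NORM getter is rewired from the deserialized "lexeme_norm" table, the cached lexemes are dropped, and entries are recreated on demand. A round-trip sketch (not part of the commit, behaviour as I read the code above):

import tempfile
from spacy.vocab import Vocab

vocab = Vocab()
vocab["hello"].norm_ = "hi"          # populates the "lexeme_norm" lookups table

with tempfile.TemporaryDirectory() as tmpdir:
    vocab.to_disk(tmpdir)            # writes strings.json, vectors, lookups.bin, ...
    reloaded = Vocab().from_disk(tmpdir)

# The lexeme cache starts out empty and is filled lazily when entries are touched.
print(len(reloaded))            # 0: no lexemes.bin to restore
print(reloaded["hello"].norm_)  # "hi", served from the reloaded "lexeme_norm" table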
@ -490,9 +499,9 @@ cdef class Vocab:

        getters = OrderedDict((
            ("strings", lambda: self.strings.to_bytes()),
            ("lexemes", lambda: self.lexemes_to_bytes()),
            ("vectors", deserialize_vectors),
            ("lookups", lambda: self.lookups.to_bytes())
            ("lookups", lambda: self.lookups.to_bytes()),
            ("lookups_extra", lambda: self.lookups_extra.to_bytes())
        ))
        exclude = util.get_serialization_exclude(getters, exclude, kwargs)
        return util.to_bytes(getters, exclude)

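The byte-level protocol changes the same way: the "lexemes" section disappears and "lookups" / "lookups_extra" sections are added, so a bytes round trip preserves the norm exceptions without carrying any lexeme structs. A short sketch, not part of the commit:

from spacy.vocab import Vocab

vocab = Vocab()
vocab["daß"].norm_ = "dass"

vocab_bytes = vocab.to_bytes()            # strings + vectors + lookups + lookups_extra
restored = Vocab().from_bytes(vocab_bytes)

print("lexeme_norm" in restored.lookups)  # True
print(restored["daß"].norm_)              # "dass", via the re-applied NORM getter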
@ -514,99 +523,62 @@ cdef class Vocab:

        setters = OrderedDict((
            ("strings", lambda b: self.strings.from_bytes(b)),
            ("lexemes", lambda b: self.lexemes_from_bytes(b)),
            ("vectors", lambda b: serialize_vectors(b)),
            ("lookups", lambda b: self.lookups.from_bytes(b))
            ("lookups", lambda b: self.lookups.from_bytes(b)),
            ("lookups_extra", lambda b: self.lookups_extra.from_bytes(b))
        ))
        exclude = util.get_serialization_exclude(setters, exclude, kwargs)
        util.from_bytes(bytes_data, setters, exclude)
        if "lexeme_norm" in self.lookups:
            self.lex_attr_getters[NORM] = util.add_lookups(
                self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), self.lookups.get_table("lexeme_norm")
            )
        self.length = 0
        self._by_orth = PreshMap()
        if self.vectors.name is not None:
            link_vectors_to_models(self)
        return self

    def lexemes_to_bytes(self):
        cdef hash_t key
        cdef size_t addr
        cdef LexemeC* lexeme = NULL
        cdef SerializedLexemeC lex_data
        cdef int size = 0
        for key, addr in self._by_orth.items():
            if addr == 0:
                continue
            size += sizeof(lex_data.data)
        byte_string = b"\0" * size
        byte_ptr = <unsigned char*>byte_string
        cdef int j
        cdef int i = 0
        for key, addr in self._by_orth.items():
            if addr == 0:
                continue
            lexeme = <LexemeC*>addr
            lex_data = Lexeme.c_to_bytes(lexeme)
            for j in range(sizeof(lex_data.data)):
                byte_ptr[i] = lex_data.data[j]
                i += 1
        return byte_string

    def lexemes_from_bytes(self, bytes bytes_data):
        """Load the binary vocabulary data from the given string."""
        cdef LexemeC* lexeme
        cdef hash_t key
        cdef unicode py_str
        cdef int i = 0
        cdef int j = 0
        cdef SerializedLexemeC lex_data
        chunk_size = sizeof(lex_data.data)
        cdef void* ptr
        cdef unsigned char* bytes_ptr = bytes_data
        for i in range(0, len(bytes_data), chunk_size):
            lexeme = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
            for j in range(sizeof(lex_data.data)):
                lex_data.data[j] = bytes_ptr[i+j]
            Lexeme.c_from_bytes(lexeme, lex_data)
            prev_entry = self._by_orth.get(lexeme.orth)
            if prev_entry != NULL:
                memcpy(prev_entry, lexeme, sizeof(LexemeC))
                continue
            ptr = self.strings._map.get(lexeme.orth)
            if ptr == NULL:
                continue
            py_str = self.strings[lexeme.orth]
            if self.strings[py_str] != lexeme.orth:
                raise ValueError(Errors.E086.format(string=py_str,
                                                    orth_id=lexeme.orth,
                                                    hash_id=self.strings[py_str]))
            self._by_orth.set(lexeme.orth, lexeme)
            self.length += 1

    def _reset_cache(self, keys, strings):
        # I'm not sure this made sense. Disable it for now.
        raise NotImplementedError

    def load_extra_lookups(self, table_name):
        if table_name not in self.lookups_extra:
            if self.lang + "_extra" in util.registry.lookups:
                tables = util.registry.lookups.get(self.lang + "_extra")
                for name, filename in tables.items():
                    if table_name == name:
                        data = util.load_language_data(filename)
                        self.lookups_extra.add_table(name, data)
            if table_name not in self.lookups_extra:
                self.lookups_extra.add_table(table_name)
        return self.lookups_extra.get_table(table_name)

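load_extra_lookups is the lazy-loading hook for the data that was dropped from the stored lexemes: on first access it checks the lookups registry for a "<lang>_extra" entry (as registered by spacy-lookups-data), loads the matching table if one is registered, and otherwise falls back to an empty table so callers never have to special-case missing data. A usage sketch, not part of the commit:

from spacy.vocab import Vocab

vocab = Vocab()

# First access: nothing registered for this vocab's language, so an empty
# "lexeme_cluster" table is created in vocab.lookups_extra and returned.
cluster_table = vocab.load_extra_lookups("lexeme_cluster")
print(len(cluster_table))                       # 0
print("lexeme_cluster" in vocab.lookups_extra)  # True

# Subsequent property access reuses the same table.
vocab["dog"].cluster = 5
print(vocab["dog"].cluster)                     # 5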

def pickle_vocab(vocab):
    sstore = vocab.strings
    vectors = vocab.vectors
    morph = vocab.morphology
    length = vocab.length
    data_dir = vocab.data_dir
    lex_attr_getters = srsly.pickle_dumps(vocab.lex_attr_getters)
    lexemes_data = vocab.lexemes_to_bytes()
    lookups = vocab.lookups
    lookups_extra = vocab.lookups_extra
    return (unpickle_vocab,
            (sstore, vectors, morph, data_dir, lex_attr_getters, lexemes_data, length))
            (sstore, vectors, morph, data_dir, lex_attr_getters, lookups, lookups_extra))


def unpickle_vocab(sstore, vectors, morphology, data_dir,
                   lex_attr_getters, bytes lexemes_data, int length):
                   lex_attr_getters, lookups, lookups_extra):
    cdef Vocab vocab = Vocab()
    vocab.length = length
    vocab.vectors = vectors
    vocab.strings = sstore
    vocab.morphology = morphology
    vocab.data_dir = data_dir
    vocab.lex_attr_getters = srsly.pickle_loads(lex_attr_getters)
    vocab.lexemes_from_bytes(lexemes_data)
    vocab.length = length
    vocab.lookups = lookups
    vocab.lookups_extra = lookups_extra
    return vocab

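Pickling follows suit: instead of shipping a serialized lexemes blob plus a length, the reduce tuple now carries the lookups and lookups_extra containers directly. A sketch of the round trip (not part of the commit; the equality check mirrors the test_pickle_vocab test added earlier in this commit):

import pickle
from spacy.vocab import Vocab

vocab = Vocab()
vocab["cuz"].norm_ = "because"     # written into the "lexeme_norm" lookups table

restored = pickle.loads(pickle.dumps(vocab))
print("lexeme_norm" in restored.lookups)                   # True
print("cuz" in restored.lookups.get_table("lexeme_norm"))  # True: the entry travelled
print(restored.to_bytes() == vocab.to_bytes())             # True, as test_pickle_vocab asserts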