
Reduce size of language data ()

* Move Turkish lemmas to a json file

Rather than a large dict in Python source, the data is now a big json
file. This includes a method for loading the json file, falling back to
a compressed file, and an update to MANIFEST.in that excludes json in
the spacy/lang directory.

This focuses on Turkish specifically because it has the most language
data in core.
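As a reference for the loading behaviour described above, here is a minimal sketch of the json-with-gzip-fallback pattern (names and error handling are illustrative; the actual helper is `load_language_data` in `spacy/util.py`, shown in the diff further down):

```python
from pathlib import Path

import srsly


def load_language_data(path):
    """Load JSON language data, falling back to a gzipped copy.

    Sketch of the fallback pattern; the real helper lives in spacy/util.py
    and its exact error handling may differ.
    """
    path = Path(path)
    if path.exists():
        return srsly.read_json(path)
    gz_path = path.with_suffix(path.suffix + ".gz")
    if gz_path.exists():
        return srsly.read_gzip_json(gz_path)
    raise IOError("Can't find language data file: {}".format(path))
```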

* Transition all lemmatizer.py files to json

This covers all lemmatizer.py files of a significant size (>500k or so).
Small files were left alone.

None of the affected files have logic, so this was pretty
straightforward.

One unusual thing is that the lemma data for Urdu doesn't seem to be
used anywhere. That may require further investigation.

* Move large lang data to json for fr/nb/nl/sv

These are the languages that use a lemmatizer directory (rather than a
single file) and are larger than English.

For most of these languages there were many language data files, in
which case only the large ones (>500k or so) were converted to json. It
may or may not be a good idea to migrate the remaining Python files to
json in the future.

* Fix id lemmas.json

The contents of this file were originally just copied from the Python
source, but that used single quotes, so it had to be properly converted
to json first.
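One way to do that conversion (a hypothetical one-off, not part of this commit, assuming the dict literal is the last thing in the source file) is to parse the Python literal and re-serialize it as JSON:

```python
import ast
import json

# Hypothetical one-off conversion: read the old lemmatizer source, keep
# everything from the first "{" (the dict literal assigned to LOOKUP),
# parse it safely, and re-serialize it as valid JSON.
with open("lemmatizer.py", encoding="utf8") as f:
    text = f.read()
data = ast.literal_eval(text[text.index("{"):])

with open("lemmas.json", "w", encoding="utf8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)
```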

* Add .json.gz to gitignore

This covers the json.gz files built as part of distribution.

* Add language data gzip to build process

Currently this gzips the data on every build; it works, but it should be
changed to only gzip when the source file has been updated.

* Remove Danish lemmatizer.py

Missed this when I added the json.

* Update to match latest srsly

The way gzipped json is loaded/saved in srsly changed a bit.
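For context, the srsly calls involved look roughly like this (a sketch; see the setup.py and spacy/util.py hunks below for the actual usage):

```python
import srsly

data = {"ababa": "abar"}

# Write a lookup table as gzipped JSON and read it back.
srsly.write_gzip_json("lemmas.json.gz", data)
assert srsly.read_gzip_json("lemmas.json.gz") == data
```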

* Only compress language data if necessary

If a .json.gz file exists and is newer than the corresponding json file,
it's not recompressed.
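A minimal sketch of that staleness check (this version compares modification times; the build script in setup.py below uses its own variant):

```python
from pathlib import Path

import srsly


def gzip_if_stale(json_path):
    """Compress a JSON file only if no up-to-date .gz copy exists."""
    json_path = Path(json_path)
    gz_path = json_path.with_suffix(json_path.suffix + ".gz")
    if gz_path.is_file() and gz_path.stat().st_mtime > json_path.stat().st_mtime:
        return  # the gzipped copy is newer, nothing to do
    srsly.write_gzip_json(gz_path, srsly.read_json(json_path))
```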

* Move en/el language data to json

This only affected files >500kb, which were the noun data for both languages
and the generic lookup table for English.

* Remove empty files in Norwegian tokenizer

It's unclear why, but the Norwegian (nb) tokenizer had empty files for
adj/adv/noun/verb lemmas. This may have been a result of copying the
structure of the English lemmatizer.

This removes the files but still creates the empty sets in the
lemmatizer. That may not actually be necessary.

* Remove dubious entries in English lookup.json

" furtherst" and " skilled" - both prefixed with a space - were in the
English lookup table. That seems obviously wrong so I have removed them.

* Fix small issues with en/fr lemmatizers

The en lemmatizer was still including the removed _nouns.py file, so that
import has been removed.

The fr language data is unusual in that it has a lemmatizer directory with
both __init__.py and lemmatizer.py. lemmatizer.py had not been converted
to load the json language data, so that has been fixed.

* Auto-format

* Auto-format

* Update srsly pin

* Consistently use pathlib paths
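As a generic illustration of the pathlib-vs-string distinction this refers to (the example path is made up, not taken from the diff):

```python
from pathlib import Path

path = Path("spacy/lang/tr/lemmas.json")

# path + ".gz" raises TypeError on a Path object; deriving the sibling via
# with_suffix keeps everything a Path (the same pattern the setup.py hunk
# below uses for the compressed language data).
gz_path = path.with_suffix(path.suffix + ".gz")
print(gz_path)  # spacy/lang/tr/lemmas.json.gz
```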
Paul O'Leary McCann 2019-08-20 21:54:11 +09:00 committed by Matthew Honnibal
parent 072860fcd0
commit 756b66b7c0
59 changed files with 1519384 additions and 1401653 deletions

View File

@ -3,3 +3,5 @@ include LICENSE
include README.md
include pyproject.toml
include bin/spacy
recursive-exclude spacy/lang *.json
recursive-include spacy/lang *.json.gz

View File

@ -5,7 +5,7 @@ thinc>=7.0.8,<7.1.0
blis>=0.2.2,<0.3.0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.2.0,<1.1.0
srsly>=0.0.6,<1.1.0
srsly>=0.1.0,<1.1.0
# Third party dependencies
numpy>=1.15.0
requests>=2.13.0,<3.0.0

View File

@ -132,6 +132,23 @@ def generate_cython(root, source):
raise RuntimeError("Running cythonize failed")
def gzip_language_data(root, source):
print("Compressing language data")
import srsly
from pathlib import Path
base = Path(root) / source
for jsonfile in base.glob("**/*.json"):
outfile = jsonfile.with_suffix(jsonfile.suffix + ".gz")
if outfile.is_file() and outfile.stat().st_ctime > jsonfile.stat().st_ctime:
# If the gz is newer it doesn't need updating
print("Skipping {}, already compressed".format(jsonfile))
continue
data = srsly.read_json(jsonfile)
srsly.write_gzip_json(outfile, data)
print("Compressed {}".format(jsonfile))
def is_source_release(path):
return os.path.exists(os.path.join(path, "PKG-INFO"))
@ -207,6 +224,7 @@ def setup_package():
if not is_source_release(root):
generate_cython(root, "spacy")
gzip_language_data(root, "spacy/lang")
setup(
name="spacy",
@ -233,7 +251,7 @@ def setup_package():
"plac<1.0.0,>=0.9.6",
"requests>=2.13.0,<3.0.0",
"wasabi>=0.2.0,<1.1.0",
"srsly>=0.0.6,<1.1.0",
"srsly>=0.1.0,<1.1.0",
'pathlib==1.0.1; python_version < "3.4"',
],
setup_requires=["wheel"],

View File

@ -1,16 +1,17 @@
# coding: utf8
from __future__ import unicode_literals
from pathlib import Path
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import LOOKUP
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, add_lookups, load_language_data
from .punctuation import TOKENIZER_INFIXES
@ -24,7 +25,8 @@ class CatalanDefaults(Language.Defaults):
lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
lemma_lookup = LOOKUP
lemma_path = Path(__file__).parent / "lemmas.json"
lemma_lookup = load_language_data(lemma_path)
infixes = TOKENIZER_INFIXES

View File

@ -1,8 +1,4 @@
# coding: utf8
from __future__ import unicode_literals
LOOKUP = {
{
"as": "a",
"àbacs": "àbac",
"abacàs": "abacà",

View File

@ -1,6 +1,8 @@
# coding: utf8
from __future__ import unicode_literals
from pathlib import Path
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .norm_exceptions import NORM_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
@ -8,13 +10,12 @@ from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .morph_rules import MORPH_RULES
from ..tag_map import TAG_MAP
from .lemmatizer import LOOKUP
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, add_lookups, load_language_data
class DanishDefaults(Language.Defaults):
@ -30,7 +31,8 @@ class DanishDefaults(Language.Defaults):
suffixes = TOKENIZER_SUFFIXES
tag_map = TAG_MAP
stop_words = STOP_WORDS
lemma_lookup = LOOKUP
lemma_path = Path(__file__).parent / "lemmas.json"
lemma_lookup = load_language_data(lemma_path)
class Danish(Language):

View File

@ -1,7 +1,4 @@
# coding: utf8
from __future__ import unicode_literals
LOOKUP = {
{
"3d-billede": "3d-billede",
"3d-billeder": "3d-billede",
"3d-billederne": "3d-billede",

View File

@ -1,19 +1,20 @@
# coding: utf8
from __future__ import unicode_literals
from pathlib import Path
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .norm_exceptions import NORM_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .lemmatizer import LOOKUP
from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, add_lookups, load_language_data
class GermanDefaults(Language.Defaults):
@ -27,7 +28,8 @@ class GermanDefaults(Language.Defaults):
tag_map = TAG_MAP
stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS
lemma_lookup = LOOKUP
lemma_path = Path(__file__).parent / "lemmas.json"
lemma_lookup = load_language_data(lemma_path)
class German(Language):

View File

@ -1,7 +1,4 @@
# coding: utf8
from __future__ import unicode_literals
LOOKUP = {
{
"Ab": "ab",
"Aber": "aber",
"ABMs": "ABM",

View File

@ -1,16 +1,20 @@
# coding: utf8
from __future__ import unicode_literals
from pathlib import Path
from ._adjectives import ADJECTIVES
from ._adjectives_irreg import ADJECTIVES_IRREG
from ._adverbs import ADVERBS
from ._nouns_irreg import NOUNS_IRREG
from ._dets_irreg import DETS_IRREG
from ._verbs_irreg import VERBS_IRREG
from ._nouns import NOUNS
from ._verbs import VERBS
from ._lemma_rules import ADJECTIVE_RULES, NOUN_RULES, VERB_RULES, PUNCT_RULES
from ....util import load_language_data
NOUNS = load_language_data(Path(__file__).parent / "_nouns.json")
LEMMA_INDEX = {"adj": ADJECTIVES, "adv": ADVERBS, "noun": NOUNS, "verb": VERBS}

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

View File

@ -1,17 +1,21 @@
# coding: utf8
from __future__ import unicode_literals
from .lookup import LOOKUP # noqa: F401
from pathlib import Path
from ._adjectives import ADJECTIVES
from ._adjectives_irreg import ADJECTIVES_IRREG
from ._adverbs import ADVERBS
from ._adverbs_irreg import ADVERBS_IRREG
from ._nouns import NOUNS
from ._nouns_irreg import NOUNS_IRREG
from ._verbs import VERBS
from ._verbs_irreg import VERBS_IRREG
from ._lemma_rules import ADJECTIVE_RULES, NOUN_RULES, VERB_RULES, PUNCT_RULES
from ....util import load_language_data
LOOKUP = load_language_data(Path(__file__).parent / "lookup.json")
NOUNS = load_language_data(Path(__file__).parent / "_nouns.json")
LEMMA_INDEX = {"adj": ADJECTIVES, "adv": ADVERBS, "noun": NOUNS, "verb": VERBS}

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

View File

@ -1,9 +1,4 @@
# coding: utf8
from __future__ import unicode_literals
LOOKUP = {
" furtherst": "further",
" skilled": "skill",
{
"'cause": "because",
"'d": "would",
"'em": "them",

View File

@ -1,18 +1,19 @@
# coding: utf8
from __future__ import unicode_literals
from pathlib import Path
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import LOOKUP
from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, add_lookups, load_language_data
class SpanishDefaults(Language.Defaults):
@ -26,7 +27,8 @@ class SpanishDefaults(Language.Defaults):
tag_map = TAG_MAP
stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS
lemma_lookup = LOOKUP
lemma_path = Path(__file__).parent / "lemmas.json"
lemma_lookup = load_language_data(lemma_path)
class Spanish(Language):

View File

@ -1,8 +1,4 @@
# coding: utf8
from __future__ import unicode_literals
LOOKUP = {
{
"aba": "abar",
"ababa": "abar",
"ababais": "abar",

View File

@ -1,9 +1,9 @@
# coding: utf8
from __future__ import unicode_literals
from .lookup import LOOKUP
from pathlib import Path
from ._adjectives import ADJECTIVES
from ._adjectives_irreg import ADJECTIVES_IRREG
from ._adp_irreg import ADP_IRREG
from ._adverbs import ADVERBS
from ._auxiliary_verbs_irreg import AUXILIARY_VERBS_IRREG
@ -15,8 +15,14 @@ from ._nouns_irreg import NOUNS_IRREG
from ._pronouns_irreg import PRONOUNS_IRREG
from ._sconj_irreg import SCONJ_IRREG
from ._verbs import VERBS
from ._verbs_irreg import VERBS_IRREG
from ....util import load_language_data
BASE_PATH = Path(__file__).parent
LOOKUP = load_language_data(BASE_PATH / 'lookup.json')
VERBS_IRREG = load_language_data(BASE_PATH / '_verbs_irreg.json')
ADJECTIVES_IRREG = load_language_data(BASE_PATH / '_adjectives_irreg.json')
LEMMA_INDEX = {'adj': ADJECTIVES, 'adv': ADVERBS, 'noun': NOUNS, 'verb': VERBS}

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

View File

@ -1,9 +1,13 @@
# coding: utf8
from __future__ import unicode_literals
from pathlib import Path
from ....symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP, SCONJ, CCONJ
from ....symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
from .lookup import LOOKUP
from ....util import load_language_data
LOOKUP = load_language_data(Path(__file__).parent / 'lookup.json')
'''
French language lemmatizer applies the default rule based lemmatization

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

View File

@ -1,16 +1,17 @@
# coding: utf8
from __future__ import unicode_literals
from pathlib import Path
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .lemmatizer import LOOKUP
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, add_lookups, load_language_data
class HungarianDefaults(Language.Defaults):
@ -25,7 +26,8 @@ class HungarianDefaults(Language.Defaults):
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
token_match = TOKEN_MATCH
lemma_lookup = LOOKUP
lemma_path = Path(__file__).parent / "lemmas.json"
lemma_lookup = load_language_data(lemma_path)
class Hungarian(Language):

View File

@ -1,8 +1,4 @@
# coding: utf8
from __future__ import unicode_literals
LOOKUP = {
{
"Aaronsonnak": "Aaronson",
"Aaronsont": "Aaronson",
"Abbáziában": "Abbázia",

View File

@ -1,11 +1,12 @@
# coding: utf8
from __future__ import unicode_literals
from pathlib import Path
from .stop_words import STOP_WORDS
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .norm_exceptions import NORM_EXCEPTIONS
from .lemmatizer import LOOKUP
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from .tag_map import TAG_MAP
@ -14,7 +15,7 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, add_lookups, load_language_data
class IndonesianDefaults(Language.Defaults):
@ -30,7 +31,8 @@ class IndonesianDefaults(Language.Defaults):
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
syntax_iterators = SYNTAX_ITERATORS
lemma_lookup = LOOKUP
lemma_path = Path(__file__).parent / "lemmas.json"
lemma_lookup = load_language_data(lemma_path)
tag_map = TAG_MAP

spacy/lang/id/lemmas.json (new file, 36874 additions)

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -1,8 +1,9 @@
# coding: utf8
from __future__ import unicode_literals
from pathlib import Path
from .stop_words import STOP_WORDS
from .lemmatizer import LOOKUP
from .tag_map import TAG_MAP
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@ -10,7 +11,7 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, add_lookups, load_language_data
from .punctuation import TOKENIZER_INFIXES
@ -23,7 +24,8 @@ class ItalianDefaults(Language.Defaults):
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
lemma_lookup = LOOKUP
lemma_path = Path(__file__).parent / "lemmas.json"
lemma_lookup = load_language_data(lemma_path)
tag_map = TAG_MAP
infixes = TOKENIZER_INFIXES

View File

@ -1,8 +1,4 @@
# coding: utf8
from __future__ import unicode_literals
LOOKUP = {
{
"'ndranghete": "'ndrangheta",
"'ndrine": "'ndrina",
"a-storica": "a-storico",

View File

@ -1,18 +1,19 @@
# coding: utf8
from __future__ import unicode_literals
from pathlib import Path
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .tag_map import TAG_MAP
from .lemmatizer import LOOKUP
from .morph_rules import MORPH_RULES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, add_lookups, load_language_data
def _return_lt(_):
@ -31,7 +32,8 @@ class LithuanianDefaults(Language.Defaults):
stop_words = STOP_WORDS
tag_map = TAG_MAP
morph_rules = MORPH_RULES
lemma_lookup = LOOKUP
lemma_path = Path(__file__).parent / "lemmas.json"
lemma_lookup = load_language_data(lemma_path)
class Lithuanian(Language):

View File

@ -1,8 +1,4 @@
# coding: utf8
from __future__ import unicode_literals
LOOKUP = {
{
"Abchazas": "abchazas",
"Abchazų": "abchazas",
"abchazų": "abchazas",

View File

@ -3,24 +3,27 @@
# structure copied from the English lemmatizer
from __future__ import unicode_literals
from pathlib import Path
from .lookup import LOOKUP
from ._adjectives_wordforms import ADJECTIVES_WORDFORMS
from ._adverbs_wordforms import ADVERBS_WORDFORMS
from ._nouns_wordforms import NOUNS_WORDFORMS
from ._verbs_wordforms import VERBS_WORDFORMS
from ._lemma_rules import ADJECTIVE_RULES, NOUN_RULES, VERB_RULES, PUNCT_RULES
from ._verbs import VERBS
from ._nouns import NOUNS
from ._adjectives import ADJECTIVES
from ._adverbs import ADVERBS
from ....util import load_language_data
ADJECTIVES = set()
ADVERBS = set()
NOUNS = set()
VERBS = set()
LEMMA_INDEX = {"adj": ADJECTIVES, "adv": ADVERBS, "noun": NOUNS, "verb": VERBS}
BASE_PATH = Path(__file__).parent
LEMMA_EXC = {
"adj": ADJECTIVES_WORDFORMS,
"adj": load_language_data(BASE_PATH / '_adjectives_wordforms.json'),
"adv": ADVERBS_WORDFORMS,
"noun": NOUNS_WORDFORMS,
"verb": VERBS_WORDFORMS,
"noun": load_language_data(BASE_PATH / '_nouns_wordforms.json'),
"verb": load_language_data(BASE_PATH / '_verbs_wordforms.json'),
}
LEMMA_RULES = {
@ -29,3 +32,12 @@ LEMMA_RULES = {
"verb": VERB_RULES,
"punct": PUNCT_RULES,
}
# Note on noun wordforms / lemmas:
# All wordforms are extracted from Norsk Ordbank in Norwegian Bokmål 2005, updated 20180627
# (CLARINO NB - Språkbanken), Nasjonalbiblioteket, Norway:
# https://www.nb.no/sprakbanken/show?serial=oai%3Anb.no%3Asbr-5&lang=en
# License:
# Creative_Commons-BY (CC-BY) (https://creativecommons.org/licenses/by/4.0/)

View File

@ -1,8 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
ADJECTIVES = set(
"""
""".split()
)

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

View File

@ -1,8 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
ADVERBS = set(
"""
""".split()
)

View File

@ -1,8 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
NOUNS = set(
"""
""".split()
)

View File

@ -1,8 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
VERBS = set(
"""
""".split()
)

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

View File

@ -1,6 +1,8 @@
# coding: utf8
from __future__ import unicode_literals
from pathlib import Path
from ._verbs_irreg import VERBS_IRREG
from ._nouns_irreg import NOUNS_IRREG
from ._adjectives_irreg import ADJECTIVES_IRREG
@ -11,16 +13,20 @@ from ._determiners_irreg import DETERMINERS_IRREG
from ._pronouns_irreg import PRONOUNS_IRREG
from ._verbs import VERBS
from ._nouns import NOUNS
from ._adjectives import ADJECTIVES
from ._adpositions import ADPOSITIONS
from ._determiners import DETERMINERS
from .lookup import LOOKUP
from ._lemma_rules import RULES
from .lemmatizer import DutchLemmatizer
from ....util import load_language_data
BASE_PATH = Path(__file__).parent
LOOKUP = load_language_data(BASE_PATH / "lookup.json")
NOUNS = load_language_data(BASE_PATH / "_nouns.json")
LEMMA_INDEX = {
"adj": ADJECTIVES,

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

View File

@ -1,10 +1,11 @@
# coding: utf8
from __future__ import unicode_literals
from pathlib import Path
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import LOOKUP
from .tag_map import TAG_MAP
from .norm_exceptions import NORM_EXCEPTIONS
@ -13,7 +14,7 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, add_lookups, load_language_data
class PortugueseDefaults(Language.Defaults):
@ -25,7 +26,8 @@ class PortugueseDefaults(Language.Defaults):
lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
lemma_lookup = LOOKUP
lemma_path = Path(__file__).parent / "lemmas.json"
lemma_lookup = load_language_data(lemma_path)
tag_map = TAG_MAP
infixes = TOKENIZER_INFIXES
prefixes = TOKENIZER_PREFIXES

View File

@ -1,8 +1,4 @@
# coding: utf8
from __future__ import unicode_literals
LOOKUP = {
{
"Abris": "abril",
"Agostos": "agosto",
"Cérberos": "cérbero",

View File

@ -1,15 +1,20 @@
# coding: utf8
from __future__ import unicode_literals
from pathlib import Path
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lemmatizer import LOOKUP
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, add_lookups, load_language_data
# Lemma data note:
# Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/
# Replaced characters using cedillas with the correct ones (ș and ț)
class RomanianDefaults(Language.Defaults):
@ -20,7 +25,8 @@ class RomanianDefaults(Language.Defaults):
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
lemma_lookup = LOOKUP
lemma_path = Path(__file__).parent / "lemmas.json"
lemma_lookup = load_language_data(lemma_path)
class Romanian(Language):

View File

@ -1,10 +1,4 @@
# coding: utf8
from __future__ import unicode_literals
# Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/
# Replaced characters using cedillas with the correct ones (ș and ț)
LOOKUP = {
{
"abale": "aba",
"abalei": "aba",
"abalele": "aba",

View File

@ -1,8 +1,12 @@
# coding: utf8
from __future__ import unicode_literals
from .lookup import LOOKUP # noqa: F401
from pathlib import Path
from ....util import load_language_data
lemma_path = Path(__file__).parent / "lookup.json"
LOOKUP = load_language_data(lemma_path)
LEMMA_RULES = {
"noun": [

View File

@ -1,8 +1,4 @@
# coding: utf8
from __future__ import unicode_literals
LOOKUP = {
{
"A-bombens": "A-bomb",
"A-bomberna": "A-bomb",
"A-bombers": "A-bomb",

View File

@ -1,15 +1,19 @@
# coding: utf8
from __future__ import unicode_literals
from pathlib import Path
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lemmatizer import LOOKUP
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, add_lookups, load_language_data
# Lemma data source:
# http://st2.zargan.com/duyuru/Zargan_Linguistic_Resources_for_Turkish.html - Bilgin, O. (2016). Biçimbilimsel Bakımdan Karmaşık Türkçe Kelimelerin İşlenmesinde Frekans Etkileri (yayınlanmamış yüksek lisans tezi). Boğaziçi Üniversitesi, İstanbul. Erişim: http://st2.zargan.com/public/resources/turkish/frequency_effects_in_turkish.pdf
class TurkishDefaults(Language.Defaults):
@ -20,7 +24,8 @@ class TurkishDefaults(Language.Defaults):
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
lemma_lookup = LOOKUP
lemma_path = Path(__file__).parent / "lemmas.json"
lemma_lookup = load_language_data(lemma_path)
class Turkish(Language):

View File

@ -1,10 +1,4 @@
# coding: utf8
from __future__ import unicode_literals
# Source: http://st2.zargan.com/duyuru/Zargan_Linguistic_Resources_for_Turkish.html - Bilgin, O. (2016). Biçimbilimsel Bakımdan Karmaşık Türkçe Kelimelerin İşlenmesinde Frekans Etkileri (yayınlanmamış yüksek lisans tezi). Boğaziçi Üniversitesi, İstanbul. Erişim: http://st2.zargan.com/public/resources/turkish/frequency_effects_in_turkish.pdf
LOOKUP = {
{
"çöğürü": "çöğür",
"çöğüründen": "çöğür",
"çöğür": "çöğür",

spacy/lang/ur/lemmas.json (new file, 29103 additions)

File diff suppressed because it is too large

View File

@ -117,6 +117,19 @@ def ensure_path(path):
return path
def load_language_data(path):
"""Load JSON language data using the given path as a base.
If the provided path isn't present, will attempt to load a gzipped version
before giving up.
"""
try:
return srsly.read_json(path)
except FileNotFoundError:
return srsly.read_gzip_json(path + ".gz")
def load_model(name, **overrides):
"""Load a model from a shortcut link, package or data path.