Reduce size of language data (#4141)
* Move Turkish lemmas to a json file

Rather than a large dict in Python source, the data is now a big json file. This includes a method for loading the json file, falling back to a compressed file, and an update to MANIFEST.in that excludes json in the spacy/lang directory. This focuses on Turkish specifically because it has the most language data in core.

* Transition all lemmatizer.py files to json

This covers all lemmatizer.py files of a significant size (>500k or so). Small files were left alone. None of the affected files have logic, so this was pretty straightforward. One unusual thing is that the lemma data for Urdu doesn't seem to be used anywhere. That may require further investigation.

* Move large lang data to json for fr/nb/nl/sv

These are the languages that use a lemmatizer directory (rather than a single file) and are larger than English. For most of these languages there were many language data files, in which case only the large ones (>500k or so) were converted to json. It may or may not be a good idea to migrate the remaining Python files to json in the future.

* Fix id lemmas.json

The contents of this file were originally just copied from the Python source, but that used single quotes, so it had to be properly converted to json first.

* Add .json.gz to gitignore

This covers the json.gz files built as part of distribution.

* Add language data gzip to build process

Currently this gzips the data on every build; it works, but it should be changed to only gzip when the source file has been updated.

* Remove Danish lemmatizer.py

Missed this when I added the json.

* Update to match latest explosion/srsly#9

The way gzipped json is loaded/saved in srsly changed a bit.

* Only compress language data if necessary

If a .json.gz file exists and is newer than the corresponding json file, it's not recompressed.

* Move en/el language data to json

This only affected files >500kb, which was nouns for both languages and the generic lookup table for English.

* Remove empty files in Norwegian tokenizer

It's unclear why, but the Norwegian (nb) tokenizer had empty files for adj/adv/noun/verb lemmas. This may have been a result of copying the structure of the English lemmatizer. This removes the files but still creates the empty sets in the lemmatizer. That may not actually be necessary.

* Remove dubious entries in English lookup.json

" furthest" and " skilled" - both prefixed with a space - were in the English lookup table. That seems obviously wrong, so I have removed them.

* Fix small issues with en/fr lemmatizers

The en tokenizer was including the removed _nouns.py file, so that's removed. The fr tokenizer is unusual in that it has a lemmatizer directory with both __init__.py and lemmatizer.py. lemmatizer.py had not been converted to load the json language data, so that was fixed.

* Auto-format

* Auto-format

* Update srsly pin

* Consistently use pathlib paths
Parent commit: 072860fcd0
This commit: 756b66b7c0
Changed files:
- MANIFEST.in
- requirements.txt
- setup.py
- spacy/util.py
- spacy/lang: ca, da, de, el/lemmatizer, en/lemmatizer, es, fr/lemmatizer, hu, id, it, lt, nb/lemmatizer, nl/lemmatizer, pt, ro, sv/lemmatizer, tr, ur
  - fr/lemmatizer: __init__.py, _adjectives_irreg.json, _adjectives_irreg.py, _verbs_irreg.json, _verbs_irreg.py, lemmatizer.py, lookup.json, lookup.py
  - nb/lemmatizer: __init__.py, _adjectives.py, _adjectives_wordforms.json, _adjectives_wordforms.py, _adverbs.py, _nouns.py, _nouns_wordforms.json, _verbs.py, _verbs_wordforms.json, _verbs_wordforms.py
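The change repeated across the language packages below boils down to one pattern: keep the lemma table as JSON next to the package, ship only a gzipped copy in the source release, and load whichever of the two is present. The sketch below illustrates that fallback idea using only the standard library; it is not spaCy's actual helper (that is load_language_data in spacy/util.py at the end of this diff, which uses srsly), and the file name is just an example.

# Standalone sketch of the JSON-or-gzip fallback idea used in this commit.
# spaCy's real helper (load_language_data in spacy/util.py below) uses srsly.
import gzip
import json
from pathlib import Path


def load_json_with_gzip_fallback(path):
    # Prefer the plain .json file, which is present in a source checkout...
    path = Path(path)
    if path.is_file():
        return json.loads(path.read_text(encoding="utf8"))
    # ...otherwise fall back to the .json.gz copy shipped in the sdist.
    gz_path = path.with_suffix(path.suffix + ".gz")
    with gzip.open(gz_path, "rt", encoding="utf8") as f:
        return json.load(f)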
MANIFEST.in
@@ -3,3 +3,5 @@ include LICENSE
 include README.md
 include pyproject.toml
 include bin/spacy
+recursive-exclude spacy/lang *.json
+recursive-include spacy/lang *.json.gz
requirements.txt
@@ -5,7 +5,7 @@ thinc>=7.0.8,<7.1.0
 blis>=0.2.2,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.2.0,<1.1.0
-srsly>=0.0.6,<1.1.0
+srsly>=0.1.0,<1.1.0
 # Third party dependencies
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
setup.py (20 changed lines)
@@ -132,6 +132,23 @@ def generate_cython(root, source):
         raise RuntimeError("Running cythonize failed")


+def gzip_language_data(root, source):
+    print("Compressing language data")
+    import srsly
+    from pathlib import Path
+
+    base = Path(root) / source
+    for jsonfile in base.glob("**/*.json"):
+        outfile = jsonfile.with_suffix(jsonfile.suffix + ".gz")
+        if outfile.is_file() and outfile.stat().st_ctime > jsonfile.stat().st_ctime:
+            # If the gz is newer it doesn't need updating
+            print("Skipping {}, already compressed".format(jsonfile))
+            continue
+        data = srsly.read_json(jsonfile)
+        srsly.write_gzip_json(outfile, data)
+        print("Compressed {}".format(jsonfile))
+
+
 def is_source_release(path):
     return os.path.exists(os.path.join(path, "PKG-INFO"))

@@ -207,6 +224,7 @@ def setup_package():

     if not is_source_release(root):
         generate_cython(root, "spacy")
+        gzip_language_data(root, "spacy/lang")

     setup(
         name="spacy",

@@ -233,7 +251,7 @@ def setup_package():
            "plac<1.0.0,>=0.9.6",
            "requests>=2.13.0,<3.0.0",
            "wasabi>=0.2.0,<1.1.0",
-           "srsly>=0.0.6,<1.1.0",
+           "srsly>=0.1.0,<1.1.0",
            'pathlib==1.0.1; python_version < "3.4"',
        ],
        setup_requires=["wheel"],
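The build step above only needs to produce one .json.gz per .json; at load time the helper added in spacy/util.py (end of this diff) reads either form back. A small round-trip sketch, assuming srsly>=0.1.0 as pinned in this commit; the file names and the toy lemma table are made up for illustration:

# Round-trip sketch for the compression step above (assumes srsly>=0.1.0).
from pathlib import Path

import srsly

data = {"àbacs": "àbac", "as": "a"}      # toy lemma table

json_path = Path("lemmas.json")
gz_path = Path("lemmas.json.gz")

srsly.write_json(json_path, data)        # the file kept in the repo
srsly.write_gzip_json(gz_path, data)     # the file setup.py ships instead

assert srsly.read_json(json_path) == srsly.read_gzip_json(gz_path)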
spacy/lang/ca/__init__.py
@@ -1,16 +1,17 @@
 # coding: utf8
 from __future__ import unicode_literals

+from pathlib import Path
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from .lemmatizer import LOOKUP

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, add_lookups, load_language_data

 from .punctuation import TOKENIZER_INFIXES

@@ -24,7 +25,8 @@ class CatalanDefaults(Language.Defaults):
     lex_attr_getters.update(LEX_ATTRS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
-    lemma_lookup = LOOKUP
+    lemma_path = Path(__file__).parent / "lemmas.json"
+    lemma_lookup = load_language_data(lemma_path)
     infixes = TOKENIZER_INFIXES

spacy/lang/ca/lemmatizer.py → spacy/lang/ca/lemmas.json
@@ -1,8 +1,4 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
-LOOKUP = {
+{
     "as": "a",
     "àbacs": "àbac",
     "abacàs": "abacà",
spacy/lang/da/__init__.py
@@ -1,6 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals

+from pathlib import Path
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .norm_exceptions import NORM_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES

@@ -8,13 +10,12 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .morph_rules import MORPH_RULES
 from ..tag_map import TAG_MAP
-from .lemmatizer import LOOKUP

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, add_lookups, load_language_data


 class DanishDefaults(Language.Defaults):

@@ -30,7 +31,8 @@ class DanishDefaults(Language.Defaults):
     suffixes = TOKENIZER_SUFFIXES
     tag_map = TAG_MAP
     stop_words = STOP_WORDS
-    lemma_lookup = LOOKUP
+    lemma_path = Path(__file__).parent / "lemmas.json"
+    lemma_lookup = load_language_data(lemma_path)


 class Danish(Language):

spacy/lang/da/lemmatizer.py → spacy/lang/da/lemmas.json
@@ -1,7 +1,4 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-LOOKUP = {
+{
     "3d-billede": "3d-billede",
     "3d-billeder": "3d-billede",
     "3d-billederne": "3d-billede",
spacy/lang/de/__init__.py
@@ -1,19 +1,20 @@
 # coding: utf8
 from __future__ import unicode_literals

+from pathlib import Path
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .norm_exceptions import NORM_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
-from .lemmatizer import LOOKUP
 from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, add_lookups, load_language_data


 class GermanDefaults(Language.Defaults):

@@ -27,7 +28,8 @@ class GermanDefaults(Language.Defaults):
     tag_map = TAG_MAP
     stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS
-    lemma_lookup = LOOKUP
+    lemma_path = Path(__file__).parent / "lemmas.json"
+    lemma_lookup = load_language_data(lemma_path)


 class German(Language):

spacy/lang/de/lemmatizer.py → spacy/lang/de/lemmas.json
@@ -1,7 +1,4 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-LOOKUP = {
+{
     "Ab": "ab",
     "Aber": "aber",
     "ABMs": "ABM",
spacy/lang/el/lemmatizer/__init__.py
@@ -1,16 +1,20 @@
 # coding: utf8
 from __future__ import unicode_literals

+from pathlib import Path
+
 from ._adjectives import ADJECTIVES
 from ._adjectives_irreg import ADJECTIVES_IRREG
 from ._adverbs import ADVERBS
 from ._nouns_irreg import NOUNS_IRREG
 from ._dets_irreg import DETS_IRREG
 from ._verbs_irreg import VERBS_IRREG
-from ._nouns import NOUNS
 from ._verbs import VERBS
 from ._lemma_rules import ADJECTIVE_RULES, NOUN_RULES, VERB_RULES, PUNCT_RULES

+from ....util import load_language_data
+
+NOUNS = load_language_data(Path(__file__).parent / "_nouns.json")
+
 LEMMA_INDEX = {"adj": ADJECTIVES, "adv": ADVERBS, "noun": NOUNS, "verb": VERBS}

spacy/lang/el/lemmatizer/_nouns.json (new file, 1 line): diff suppressed because the file is too large.
spacy/lang/en/lemmatizer/__init__.py
@@ -1,17 +1,21 @@
 # coding: utf8
 from __future__ import unicode_literals

-from .lookup import LOOKUP  # noqa: F401
+from pathlib import Path
+
 from ._adjectives import ADJECTIVES
 from ._adjectives_irreg import ADJECTIVES_IRREG
 from ._adverbs import ADVERBS
 from ._adverbs_irreg import ADVERBS_IRREG
-from ._nouns import NOUNS
 from ._nouns_irreg import NOUNS_IRREG
 from ._verbs import VERBS
 from ._verbs_irreg import VERBS_IRREG
 from ._lemma_rules import ADJECTIVE_RULES, NOUN_RULES, VERB_RULES, PUNCT_RULES

+from ....util import load_language_data
+
+LOOKUP = load_language_data(Path(__file__).parent / "lookup.json")
+NOUNS = load_language_data(Path(__file__).parent / "_nouns.json")
+
 LEMMA_INDEX = {"adj": ADJECTIVES, "adv": ADVERBS, "noun": NOUNS, "verb": VERBS}

spacy/lang/en/lemmatizer/_nouns.json (new file, 1 line): diff suppressed because the file is too large.

spacy/lang/en/lemmatizer/lookup.py → spacy/lang/en/lemmatizer/lookup.json
@@ -1,9 +1,4 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-LOOKUP = {
-    " furtherst": "further",
-    " skilled": "skill",
+{
     "'cause": "because",
     "'d": "would",
     "'em": "them",
spacy/lang/es/__init__.py
@@ -1,18 +1,19 @@
 # coding: utf8
 from __future__ import unicode_literals

+from pathlib import Path
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from .lemmatizer import LOOKUP
 from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, add_lookups, load_language_data


 class SpanishDefaults(Language.Defaults):

@@ -26,7 +27,8 @@ class SpanishDefaults(Language.Defaults):
     tag_map = TAG_MAP
     stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS
-    lemma_lookup = LOOKUP
+    lemma_path = Path(__file__).parent / "lemmas.json"
+    lemma_lookup = load_language_data(lemma_path)


 class Spanish(Language):

spacy/lang/es/lemmatizer.py → spacy/lang/es/lemmas.json
@@ -1,8 +1,4 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
-LOOKUP = {
+{
     "aba": "abar",
     "ababa": "abar",
     "ababais": "abar",
spacy/lang/fr/lemmatizer/__init__.py
@@ -1,9 +1,9 @@
 # coding: utf8
 from __future__ import unicode_literals

-from .lookup import LOOKUP
+from pathlib import Path

 from ._adjectives import ADJECTIVES
 from ._adjectives_irreg import ADJECTIVES_IRREG
 from ._adp_irreg import ADP_IRREG
 from ._adverbs import ADVERBS
 from ._auxiliary_verbs_irreg import AUXILIARY_VERBS_IRREG

@@ -15,8 +15,14 @@ from ._nouns_irreg import NOUNS_IRREG
 from ._pronouns_irreg import PRONOUNS_IRREG
 from ._sconj_irreg import SCONJ_IRREG
 from ._verbs import VERBS
-from ._verbs_irreg import VERBS_IRREG

+from ....util import load_language_data
+
+BASE_PATH = Path(__file__).parent
+
+LOOKUP = load_language_data(BASE_PATH / 'lookup.json')
+VERBS_IRREG = load_language_data(BASE_PATH / '_verbs_irreg.json')
+ADJECTIVES_IRREG = load_language_data(BASE_PATH / '_adjectives_irreg.json')

 LEMMA_INDEX = {'adj': ADJECTIVES, 'adv': ADVERBS, 'noun': NOUNS, 'verb': VERBS}

spacy/lang/fr/lemmatizer/_adjectives_irreg.json (new file, 1 line): diff suppressed because the file is too large.

spacy/lang/fr/lemmatizer/_verbs_irreg.json (new file, 1 line): diff suppressed because the file is too large.

spacy/lang/fr/lemmatizer/lemmatizer.py
@@ -1,9 +1,13 @@
 # coding: utf8
 from __future__ import unicode_literals

+from pathlib import Path
+
 from ....symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP, SCONJ, CCONJ
 from ....symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
-from .lookup import LOOKUP
+from ....util import load_language_data
+
+LOOKUP = load_language_data(Path(__file__).parent / 'lookup.json')

 '''
 French language lemmatizer applies the default rule based lemmatization

File diff suppressed because it is too large.
spacy/lang/hu/__init__.py
@@ -1,16 +1,17 @@
 # coding: utf8
 from __future__ import unicode_literals

+from pathlib import Path
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
-from .lemmatizer import LOOKUP

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, add_lookups, load_language_data


 class HungarianDefaults(Language.Defaults):

@@ -25,7 +26,8 @@ class HungarianDefaults(Language.Defaults):
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
     token_match = TOKEN_MATCH
-    lemma_lookup = LOOKUP
+    lemma_path = Path(__file__).parent / "lemmas.json"
+    lemma_lookup = load_language_data(lemma_path)


 class Hungarian(Language):

spacy/lang/hu/lemmatizer.py → spacy/lang/hu/lemmas.json
@@ -1,8 +1,4 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
-LOOKUP = {
+{
     "Aaronsonnak": "Aaronson",
     "Aaronsont": "Aaronson",
     "Abbáziában": "Abbázia",
spacy/lang/id/__init__.py
@@ -1,11 +1,12 @@
 # coding: utf8
 from __future__ import unicode_literals

+from pathlib import Path
+
 from .stop_words import STOP_WORDS
 from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .norm_exceptions import NORM_EXCEPTIONS
-from .lemmatizer import LOOKUP
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .tag_map import TAG_MAP

@@ -14,7 +15,7 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, add_lookups, load_language_data


 class IndonesianDefaults(Language.Defaults):

@@ -30,7 +31,8 @@ class IndonesianDefaults(Language.Defaults):
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
     syntax_iterators = SYNTAX_ITERATORS
-    lemma_lookup = LOOKUP
+    lemma_path = Path(__file__).parent / "lemmas.json"
+    lemma_lookup = load_language_data(lemma_path)
     tag_map = TAG_MAP

spacy/lang/id/lemmas.json (new file, 36874 lines): diff suppressed because it is too large.
spacy/lang/it/__init__.py
@@ -1,8 +1,9 @@
 # coding: utf8
 from __future__ import unicode_literals

+from pathlib import Path
+
 from .stop_words import STOP_WORDS
-from .lemmatizer import LOOKUP
 from .tag_map import TAG_MAP
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS

@@ -10,7 +11,7 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, add_lookups, load_language_data

 from .punctuation import TOKENIZER_INFIXES

@@ -23,7 +24,8 @@ class ItalianDefaults(Language.Defaults):
     )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
-    lemma_lookup = LOOKUP
+    lemma_path = Path(__file__).parent / "lemmas.json"
+    lemma_lookup = load_language_data(lemma_path)
     tag_map = TAG_MAP
     infixes = TOKENIZER_INFIXES

spacy/lang/it/lemmatizer.py → spacy/lang/it/lemmas.json
@@ -1,8 +1,4 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
-LOOKUP = {
+{
     "'ndranghete": "'ndrangheta",
     "'ndrine": "'ndrina",
     "a-storica": "a-storico",
spacy/lang/lt/__init__.py
@@ -1,18 +1,19 @@
 # coding: utf8
 from __future__ import unicode_literals

+from pathlib import Path
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .tag_map import TAG_MAP
-from .lemmatizer import LOOKUP
 from .morph_rules import MORPH_RULES

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, add_lookups, load_language_data


 def _return_lt(_):

@@ -31,7 +32,8 @@ class LithuanianDefaults(Language.Defaults):
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
     morph_rules = MORPH_RULES
-    lemma_lookup = LOOKUP
+    lemma_path = Path(__file__).parent / "lemmas.json"
+    lemma_lookup = load_language_data(lemma_path)


 class Lithuanian(Language):

spacy/lang/lt/lemmatizer.py → spacy/lang/lt/lemmas.json
@@ -1,8 +1,4 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
-LOOKUP = {
+{
     "Abchazas": "abchazas",
     "Abchazų": "abchazas",
     "abchazų": "abchazas",
spacy/lang/nb/lemmatizer/__init__.py
@@ -3,24 +3,27 @@
 # structure copied from the English lemmatizer
 from __future__ import unicode_literals

+from pathlib import Path
+
 from .lookup import LOOKUP
-from ._adjectives_wordforms import ADJECTIVES_WORDFORMS
 from ._adverbs_wordforms import ADVERBS_WORDFORMS
-from ._nouns_wordforms import NOUNS_WORDFORMS
-from ._verbs_wordforms import VERBS_WORDFORMS
 from ._lemma_rules import ADJECTIVE_RULES, NOUN_RULES, VERB_RULES, PUNCT_RULES
-from ._verbs import VERBS
-from ._nouns import NOUNS
-from ._adjectives import ADJECTIVES
-from ._adverbs import ADVERBS
+from ....util import load_language_data

+ADJECTIVES = set()
+ADVERBS = set()
+NOUNS = set()
+VERBS = set()

 LEMMA_INDEX = {"adj": ADJECTIVES, "adv": ADVERBS, "noun": NOUNS, "verb": VERBS}

+BASE_PATH = Path(__file__).parent
+
 LEMMA_EXC = {
-    "adj": ADJECTIVES_WORDFORMS,
+    "adj": load_language_data(BASE_PATH / '_adjectives_wordforms.json'),
     "adv": ADVERBS_WORDFORMS,
-    "noun": NOUNS_WORDFORMS,
-    "verb": VERBS_WORDFORMS,
+    "noun": load_language_data(BASE_PATH / '_nouns_wordforms.json'),
+    "verb": load_language_data(BASE_PATH / '_verbs_wordforms.json'),
 }

 LEMMA_RULES = {

@@ -29,3 +32,12 @@ LEMMA_RULES = {
     "verb": VERB_RULES,
     "punct": PUNCT_RULES,
 }
+
+# Note on noun wordforms / lemmas:
+# All wordforms are extracted from Norsk Ordbank in Norwegian Bokmål 2005, updated 20180627
+# (CLARINO NB - Språkbanken), Nasjonalbiblioteket, Norway:
+# https://www.nb.no/sprakbanken/show?serial=oai%3Anb.no%3Asbr-5&lang=en
+# License:
+# Creative_Commons-BY (CC-BY) (https://creativecommons.org/licenses/by/4.0/)

spacy/lang/nb/lemmatizer/_adjectives.py (deleted)
@@ -1,8 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
-ADJECTIVES = set(
-    """
-""".split()
-)

spacy/lang/nb/lemmatizer/_adjectives_wordforms.json (new file, 1 line): diff suppressed because the file is too large.

spacy/lang/nb/lemmatizer/_adverbs.py (deleted)
@@ -1,8 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
-ADVERBS = set(
-    """
-""".split()
-)

spacy/lang/nb/lemmatizer/_nouns.py (deleted)
@@ -1,8 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
-NOUNS = set(
-    """
-""".split()
-)

File diff suppressed because it is too large.

spacy/lang/nb/lemmatizer/_verbs.py (deleted)
@@ -1,8 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
-VERBS = set(
-    """
-""".split()
-)

spacy/lang/nb/lemmatizer/_verbs_wordforms.json (new file, 1 line): diff suppressed because the file is too large.
spacy/lang/nl/lemmatizer/__init__.py
@@ -1,6 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals

+from pathlib import Path
+
 from ._verbs_irreg import VERBS_IRREG
 from ._nouns_irreg import NOUNS_IRREG
 from ._adjectives_irreg import ADJECTIVES_IRREG

@@ -11,16 +13,20 @@ from ._determiners_irreg import DETERMINERS_IRREG
 from ._pronouns_irreg import PRONOUNS_IRREG

 from ._verbs import VERBS
-from ._nouns import NOUNS
 from ._adjectives import ADJECTIVES

 from ._adpositions import ADPOSITIONS
 from ._determiners import DETERMINERS

-from .lookup import LOOKUP
 from ._lemma_rules import RULES
 from .lemmatizer import DutchLemmatizer

+from ....util import load_language_data
+
+BASE_PATH = Path(__file__).parent
+
+LOOKUP = load_language_data(BASE_PATH / "lookup.json")
+NOUNS = load_language_data(BASE_PATH / "_nouns.json")
+
 LEMMA_INDEX = {
     "adj": ADJECTIVES,

spacy/lang/nl/lemmatizer/_nouns.json (new file, 1 line): diff suppressed because the file is too large.

spacy/lang/nl/lemmatizer/lookup.json (new file, 1 line): diff suppressed because one or more lines are too long.
spacy/lang/pt/__init__.py
@@ -1,10 +1,11 @@
 # coding: utf8
 from __future__ import unicode_literals

+from pathlib import Path
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from .lemmatizer import LOOKUP
 from .tag_map import TAG_MAP
 from .norm_exceptions import NORM_EXCEPTIONS

@@ -13,7 +14,7 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, add_lookups, load_language_data


 class PortugueseDefaults(Language.Defaults):

@@ -25,7 +26,8 @@ class PortugueseDefaults(Language.Defaults):
     lex_attr_getters.update(LEX_ATTRS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
-    lemma_lookup = LOOKUP
+    lemma_path = Path(__file__).parent / "lemmas.json"
+    lemma_lookup = load_language_data(lemma_path)
     tag_map = TAG_MAP
     infixes = TOKENIZER_INFIXES
     prefixes = TOKENIZER_PREFIXES

spacy/lang/pt/lemmatizer.py → spacy/lang/pt/lemmas.json
@@ -1,8 +1,4 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
-LOOKUP = {
+{
     "Abris": "abril",
     "Agostos": "agosto",
     "Cérberos": "cérbero",
spacy/lang/ro/__init__.py
@@ -1,15 +1,20 @@
 # coding: utf8
 from __future__ import unicode_literals

+from pathlib import Path
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
-from .lemmatizer import LOOKUP

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, add_lookups, load_language_data

+# Lemma data note:
+# Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/
+# Replaced characters using cedillas with the correct ones (ș and ț)


 class RomanianDefaults(Language.Defaults):

@@ -20,7 +25,8 @@ class RomanianDefaults(Language.Defaults):
     )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
-    lemma_lookup = LOOKUP
+    lemma_path = Path(__file__).parent / "lemmas.json"
+    lemma_lookup = load_language_data(lemma_path)


 class Romanian(Language):

spacy/lang/ro/lemmatizer.py → spacy/lang/ro/lemmas.json
@@ -1,10 +1,4 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-# Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/
-# Replaced characters using cedillas with the correct ones (ș and ț)
-
-LOOKUP = {
+{
     "abale": "aba",
     "abalei": "aba",
     "abalele": "aba",
spacy/lang/sv/lemmatizer/__init__.py
@@ -1,8 +1,12 @@
 # coding: utf8
 from __future__ import unicode_literals

-from .lookup import LOOKUP  # noqa: F401
+from pathlib import Path
+
+from ....util import load_language_data
+
+lemma_path = Path(__file__).parent / "lookup.json"
+LOOKUP = load_language_data(lemma_path)

 LEMMA_RULES = {
     "noun": [

spacy/lang/sv/lemmatizer/lookup.py → spacy/lang/sv/lemmatizer/lookup.json
@@ -1,8 +1,4 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
-LOOKUP = {
+{
     "A-bombens": "A-bomb",
     "A-bomberna": "A-bomb",
     "A-bombers": "A-bomb",
@ -1,15 +1,19 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .lemmatizer import LOOKUP
|
||||
from .stop_words import STOP_WORDS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
from ...util import update_exc, add_lookups, load_language_data
|
||||
|
||||
# Lemma data source:
|
||||
# http://st2.zargan.com/duyuru/Zargan_Linguistic_Resources_for_Turkish.html - Bilgin, O. (2016). Biçimbilimsel Bakımdan Karmaşık Türkçe Kelimelerin İşlenmesinde Frekans Etkileri (yayınlanmamış yüksek lisans tezi). Boğaziçi Üniversitesi, İstanbul. Erişim: http://st2.zargan.com/public/resources/turkish/frequency_effects_in_turkish.pdf
|
||||
|
||||
|
||||
class TurkishDefaults(Language.Defaults):
|
||||
|
@ -20,7 +24,8 @@ class TurkishDefaults(Language.Defaults):
|
|||
)
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = STOP_WORDS
|
||||
lemma_lookup = LOOKUP
|
||||
lemma_path = Path(__file__).parent / "lemmas.json"
|
||||
lemma_lookup = load_language_data(lemma_path)
|
||||
|
||||
|
||||
class Turkish(Language):
|
||||
|
|
|
@ -1,10 +1,4 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
# Source: http://st2.zargan.com/duyuru/Zargan_Linguistic_Resources_for_Turkish.html - Bilgin, O. (2016). Biçimbilimsel Bakımdan Karmaşık Türkçe Kelimelerin İşlenmesinde Frekans Etkileri (yayınlanmamış yüksek lisans tezi). Boğaziçi Üniversitesi, İstanbul. Erişim: http://st2.zargan.com/public/resources/turkish/frequency_effects_in_turkish.pdf
|
||||
|
||||
|
||||
LOOKUP = {
|
||||
{
|
||||
"çöğürü": "çöğür",
|
||||
"çöğüründen": "çöğür",
|
||||
"çöğür": "çöğür",
|
spacy/lang/ur/lemmas.json (new file, 29103 lines): diff suppressed because it is too large.
spacy/util.py
@@ -117,6 +117,19 @@ def ensure_path(path):
     return path


+def load_language_data(path):
+    """Load JSON language data using the given path as a base.
+
+    If the provided path isn't present, will attempt to load a gzipped version
+    before giving up.
+    """
+    try:
+        return srsly.read_json(path)
+    except FileNotFoundError:
+        return srsly.read_gzip_json(path + ".gz")
+
+
 def load_model(name, **overrides):
     """Load a model from a shortcut link, package or data path.
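For reference, a minimal usage sketch of the new helper. The path is hypothetical and assumes a source checkout of spaCy where the uncompressed lemmas.json exists; per the docstring above, the helper otherwise tries the .json.gz copy.

# Hypothetical usage of load_language_data added above; assumes a source
# checkout of spaCy so that spacy/lang/tr/lemmas.json is present on disk.
from pathlib import Path

from spacy.util import load_language_data

lemma_path = Path("spacy") / "lang" / "tr" / "lemmas.json"
lemmas = load_language_data(lemma_path)
print(len(lemmas))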