Move lookup tables out of the core library (#4346)

* Add default to util.get_entry_point

* Tidy up entry points

* Read lookups from entry points

* Remove lookup tables and related tests

* Add lookups install option

* Remove lemmatizer tests

* Remove logic to process language data files

* Update setup.cfg
Ines Montani 2019-10-01 00:01:27 +02:00 committed by Matthew Honnibal
parent ed620daa5c
commit e0cf4796a5
91 changed files with 32 additions and 8664173 deletions

View File

@@ -46,6 +46,8 @@ install_requires =
     pathlib==1.0.1; python_version < "3.4"
 
 [options.extras_require]
+lookups =
+    spacy_lookups_data>=0.0.4,<0.2.0
 cuda =
     thinc_gpu_ops>=0.0.1,<0.1.0
     cupy>=5.0.0b4
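With the tables moved into their own package, the new `lookups` extra makes that data an opt-in dependency. Assuming `spacy_lookups_data` is published as this PR describes, installing it is a one-liner:

    pip install spacy[lookups]

which pulls in a compatible `spacy_lookups_data` and, through the entry points added below, makes its tables visible to spaCy at runtime.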

View File

@@ -115,23 +115,6 @@ def generate_cython(root, source):
         raise RuntimeError("Running cythonize failed")
 
 
-def gzip_language_data(root, source):
-    print("Compressing language data")
-    import srsly
-    from pathlib import Path
-
-    base = Path(root) / source
-    for jsonfile in base.glob("**/*.json"):
-        outfile = jsonfile.with_suffix(jsonfile.suffix + ".gz")
-        if outfile.is_file() and outfile.stat().st_mtime > jsonfile.stat().st_mtime:
-            # If the gz is newer it doesn't need updating
-            print("Skipping {}, already compressed".format(jsonfile))
-            continue
-        data = srsly.read_json(jsonfile)
-        srsly.write_gzip_json(outfile, data)
-        print("Compressed {}".format(jsonfile))
-
-
 def is_source_release(path):
     return os.path.exists(os.path.join(path, "PKG-INFO"))
 
@@ -203,7 +186,6 @@ def setup_package():
     if not is_source_release(root):
         generate_cython(root, "spacy")
-        gzip_language_data(root, "spacy/lang")
 
     setup(
         name="spacy",

View File

@@ -5,7 +5,7 @@ import uuid
 
 from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS, TPL_ENTS
 from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE
-from ..util import minify_html, escape_html, get_entry_points
+from ..util import minify_html, escape_html, get_entry_points, ENTRY_POINTS
 from ..errors import Errors
@@ -242,7 +242,7 @@ class EntityRenderer(object):
             "CARDINAL": "#e4e7d2",
             "PERCENT": "#e4e7d2",
         }
-        user_colors = get_entry_points("spacy_displacy_colors")
+        user_colors = get_entry_points(ENTRY_POINTS.displacy_colors)
         for user_color in user_colors.values():
             colors.update(user_color)
         colors.update(options.get("colors", {}))
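For context, `get_entry_points` collects everything registered under the given group, so any installed package can extend the default palette. A minimal sketch of the producing side, with hypothetical package and variable names:

    # setup.py of a hypothetical add-on package
    from setuptools import setup

    setup(
        name="my-displacy-colors",
        packages=["my_displacy_colors"],
        entry_points={
            "spacy_displacy_colors": [
                "my_colors = my_displacy_colors:DISPLACY_COLORS"
            ]
        },
    )

    # my_displacy_colors/__init__.py
    # Maps entity labels to background colors; merged into the defaults by the loop above.
    DISPLACY_COLORS = {"MY_LABEL": "#d3d3d3"}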

View File

@@ -21,8 +21,6 @@ class BengaliDefaults(Language.Defaults):
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
-    # Lemma rules: উচ্চতর বাংলা ব্যাকরণ ও রচনা - অধ্যাপক নিরঞ্জন অধিকারী ও অধ্যাপক ড. সফিউদ্দিন আহমদ
-    resources = {"lemma_rules": "lemma_rules.json"}
 
 
 class Bengali(Language):

View File

@@ -1,57 +0,0 @@
{
"noun": [
["টা", ""],
["টি", ""],
["খান", ""],
["খানা", ""],
["খানি", ""],
["গাছা", ""],
["গাছি", ""],
["ছড়া", ""],
["কে", ""],
["ে", ""],
["তে", ""],
["র", ""],
["রা", ""],
["রে", ""],
["ের", ""],
["েরা", ""],
["দের", ""],
["দেরকে", ""],
["গুলা", ""],
["গুলো", ""],
["গুলি", ""],
["কুল", ""],
["গণ", ""],
["দল", ""],
["পাল", ""],
["পুঞ্জ", ""],
["মণ্ডলী", ""],
["মালা", ""],
["রাজি", ""],
["বৃন্দ", ""],
["বর্গ", ""],
["শ্রেণী", ""],
["শ্রেনি", ""],
["রাশি", ""],
["সকল", ""],
["মহল", ""],
["াবলি", ""],
["", "0"],
["১", "1"],
["২", "2"],
["৩", "3"],
["", "4"],
["৫", "5"],
["৬", "6"],
["", "7"],
["৮", "8"],
["৯", "9"]
],
"punct": [
["“", "\""],
["”", "\""],
["", "'"],
["", "'"]
]
}

View File

@@ -24,7 +24,6 @@ class CatalanDefaults(Language.Defaults):
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
     infixes = TOKENIZER_INFIXES
-    resources = {"lemma_lookup": "lemma_lookup.json"}
 
 
 class Catalan(Language):

File diff suppressed because it is too large

View File

@@ -29,7 +29,6 @@ class DanishDefaults(Language.Defaults):
     suffixes = TOKENIZER_SUFFIXES
     tag_map = TAG_MAP
     stop_words = STOP_WORDS
-    resources = {"lemma_lookup": "lemma_lookup.json"}
 
 
 class Danish(Language):

File diff suppressed because it is too large

View File

@@ -26,7 +26,6 @@ class GermanDefaults(Language.Defaults):
     tag_map = TAG_MAP
     stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS
-    resources = {"lemma_lookup": "lemma_lookup.json"}
     single_orth_variants = [
         {"tags": ["$("], "variants": ["…", "..."]},
         {"tags": ["$("], "variants": ["-", "–", "—", "--", "---", "——"]},

File diff suppressed because it is too large

View File

@@ -31,11 +31,6 @@ class GreekDefaults(Language.Defaults):
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
     syntax_iterators = SYNTAX_ITERATORS
-    resources = {
-        "lemma_index": "lemmatizer/lemma_index.json",
-        "lemma_exc": "lemmatizer/lemma_exc.json",
-        "lemma_rules": "lemmatizer/lemma_rules.json",
-    }
 
     @classmethod
     def create_lemmatizer(cls, nlp=None, lookups=None):

View File

@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ....symbols import NOUN, VERB, ADJ, PUNCT
+from ...symbols import NOUN, VERB, ADJ, PUNCT
 
 
 class GreekLemmatizer(object):

View File

@@ -1,236 +0,0 @@
{
"adj": {
"χειρότερος": ["κακός"],
"χειρότερη": ["κακός"],
"χειρότερης": ["κακός"],
"χειρότερο": ["κακός"],
"χειρότεροι": ["κακός"],
"χειρότερων": ["κακός"],
"χειρότερου": ["κακός"],
"βέλτιστος": ["καλός"],
"βέλτιστη": ["καλός"],
"βέλτιστης": ["καλός"],
"βέλτιστο": ["καλός"],
"βέλτιστοι": ["καλός"],
"βέλτιστων": ["καλός"],
"βέλτιστου": ["καλός"],
"ελάχιστος": ["λίγος"],
"ελάχιστα": ["λίγος"],
"ελάχιστοι": ["λίγος"],
"ελάχιστων": ["λίγος"],
"ελάχιστη": ["λίγος"],
"ελάχιστης": ["λίγος"],
"ελάχιστο": ["λίγος"],
"ελάχιστου": ["λίγος"],
"πλείστος": ["πολύς"],
"πλείστου": ["πολύς"],
"πλείστων": ["πολύς"],
"πολλή": ["πολύ"],
"πολύς": ["πολύ"],
"πολλύ": ["πολύ"],
"πολλύς": ["πολύ"]
},
"noun": {
"λευτεριά": ["ελευθερία"],
"καφέδες": ["καφές"],
"ποιήματα": ["ποίημα"]
},
"det": {
"του": ["το"],
"των": ["το"],
"τους": ["το"],
"τις": ["τη"],
"τα": ["το"],
"οι": ["ο", "η"]
},
"verb": {
"είσαι": ["είμαι"],
"είναι": ["είμαι"],
"είμαστε": ["είμαι"],
"είστε": ["είμαι"],
"είσαστε": ["είμαι"],
"ήμουν": ["είμαι"],
"ήσουν": ["είμαι"],
"ήταν": ["είμαι"],
"ήμαστε": ["είμαι"],
"ήμασταν": ["είμαι"],
"είπα": ["λέω"],
"είπες": ["λέω"],
"είπε": ["λέω"],
"είπαμε": ["λέω"],
"είπατε": ["λέω"],
"είπαν": ["λέω"],
"είπανε": ["λέω"],
"πει": ["λέω"],
"πω": ["λέω"],
"πάω": ["πηγαίνω"],
"πάς": ["πηγαίνω"],
"πας": ["πηγαίνω"],
"πάει": ["πηγαίνω"],
"πάμε": ["πηγαίνω"],
"πάτε": ["πηγαίνω"],
"πάνε": ["πηγαίνω"],
"πήγα": ["πηγαίνω"],
"πήγες": ["πηγαίνω"],
"πήγε": ["πηγαίνω"],
"πήγαμε": ["πηγαίνω"],
"πήγατε": ["πηγαίνω"],
"πήγαν": ["πηγαίνω"],
"πήγανε": ["πηγαίνω"],
"έπαιζα": ["παίζω"],
"έπαιζες": ["παίζω"],
"έπαιζε": ["παίζω"],
"έπαιζαν": ["παίζω,"],
"έπαιξα": ["παίζω"],
"έπαιξες": ["παίζω"],
"έπαιξε": ["παίζω"],
"έτρωγα": ["τρώω"],
"έτρωγε": ["τρώω"],
"είχα": ["έχω"],
"είχες": ["έχω"],
"είχε": ["έχω"],
"είχαμε": ["έχω"],
"είχατε": ["έχω"],
"είχαν": ["έχω"],
"είχανε": ["έχω"],
"έπαιρνα": ["παίρνω"],
"έπαιρνες": ["παίρνω"],
"έπαιρνε": ["παίρνω"],
"έπαιρναν": ["παίρνω"],
"εδίνα": ["δίνω"],
"εδίνες": ["δίνω"],
"εδίνε": ["δίνω"],
"εδίναν": ["δίνω"],
"έκανα": ["κάνω"],
"έκανες": ["κάνω"],
"έκανε": ["κάνω"],
"έκαναν": ["κάνω"],
"ήθελα": ["θέλω"],
"ήθελες": ["θέλω"],
"ήθελε": ["θέλω"],
"ήθελαν": ["θέλω"],
"έβλεπα": ["βλέπω"],
"έβλεπες": ["βλέπω"],
"έβλεπε": ["βλέπω"],
"έβλεπαν": ["βλέπω"],
"είδα": ["βλέπω"],
"είδες": ["βλέπω"],
"είδε": ["βλέπω"],
"είδαμε": ["βλέπω"],
"είδατε": ["βλέπω"],
"είδαν": ["βλέπω"],
"έφερνα": ["φέρνω"],
"έφερνες": ["φέρνω"],
"έφερνε": ["φέρνω"],
"έφερναν": ["φέρνω"],
"έφερα": ["φέρω"],
"έφερες": ["φέρω"],
"έφερε": ["φέρω"],
"έφεραν": ["φέρω"],
"έλαβα": ["λαμβάνω"],
"έλαβες": ["λαμβάνω"],
"έλαβε": ["λαμβάνω"],
"έλαβαν": ["λαμβάνω"],
"έβρισκα": ["βρίσκω"],
"έβρισκες": ["βρίσκω"],
"έβρισκε": ["βρίσκω"],
"έβρισκαν": ["βρίσκω"],
"ήξερα": ["ξέρω"],
"ήξερες": ["ξέρω"],
"ήξερε": ["ξέρω"],
"ήξεραν": ["ξέρω"],
"ανέφερα": ["αναφέρω"],
"ανέφερες": ["αναφέρω"],
"ανέφερε": ["αναφέρω"],
"ανέφεραν": ["αναφέρω"],
"έβαζα": ["βάζω"],
"έβαζες": ["βάζω"],
"έβαζε": ["βάζω"],
"έβαζαν": ["βάζω"],
"έμεινα": ["μένω"],
"έμεινες": ["μένω"],
"έμεινε": ["μένω"],
"έμειναν": ["μένω"],
"έβγαζα": ["βγάζω"],
"έβγαζες": ["βγάζω"],
"έβγαζε": ["βγάζω"],
"έβγαζαν": ["βγάζω"],
"έμπαινα": ["μπαίνω"],
"έμπαινες": ["μπαίνω"],
"έμπαινε": ["μπαίνω"],
"έμπαιναν": ["μπαίνω"],
"βγήκα": ["βγαίνω"],
"βγήκες": ["βγαίνω"],
"βγήκε": ["βγαίνω"],
"βγήκαμε": ["βγαίνω"],
"βγήκατε": ["βγαίνω"],
"βγήκαν": ["βγαίνω"],
"έπεφτα": ["πέφτω"],
"έπεφτες": ["πέφτω"],
"έπεφτε": ["πέφτω"],
"έπεφταν": ["πέφτω"],
"έπεσα": ["πέφτω"],
"έπεσες": ["πέφτω"],
"έπεσε": ["πέφτω"],
"έπεσαν": ["πέφτω"],
"έστειλα": ["στέλνω"],
"έστειλες": ["στέλνω"],
"έστειλε": ["στέλνω"],
"έστειλαν": ["στέλνω"],
"έφυγα": ["φεύγω"],
"έφυγες": ["φεύγω"],
"έφυγαν": ["φεύγω"],
"έμαθα": ["μαθαίνω"],
"έμαθες": ["μαθαίνω"],
"έμαθε": ["μαθαίνω"],
"έμαθαν": ["μαθαίνω"],
"υπέβαλλα": ["υποβάλλω"],
"υπέβαλλες": ["υποβάλλω"],
"υπέβαλλε": ["υποβάλλω"],
"υπέβαλλαν": ["υποβάλλω"],
"έπινα": ["πίνω"],
"έπινες": ["πίνω"],
"έπινε": ["πίνω"],
"έπιναν": ["πίνω"],
"ήπια": ["πίνω"],
"ήπιες": ["πίνω"],
"ήπιε": ["πίνω"],
"ήπιαμε": ["πίνω"],
"ήπιατε": ["πίνω"],
"ήπιαν": ["πίνω"],
"ετύχα": ["τυχαίνω"],
"ετύχες": ["τυχαίνω"],
"ετύχε": ["τυχαίνω"],
"ετύχαν": ["τυχαίνω"],
"φάω": ["τρώω"],
"φάς": ["τρώω"],
"φάει": ["τρώω"],
"φάμε": ["τρώω"],
"φάτε": ["τρώω"],
"φάνε": ["τρώω"],
"φάν": ["τρώω"],
"έτρωγες": ["τρώω"],
"τρώγαμε": ["τρώω"],
"τρώγατε": ["τρώω"],
"τρώγανε": ["τρώω"],
"τρώγαν": ["τρώω"],
"πέρασα": ["περνώ"],
"πέρασες": ["περνώ"],
"πέρασε": ["περνώ"],
"πέρασαμε": ["περνώ"],
"πέρασατε": ["περνώ"],
"πέρασαν": ["περνώ"],
"έγδαρα": ["γδάρω"],
"έγδαρες": ["γδάρω"],
"έγδαρε": ["γδάρω"],
"έγδαραν": ["γδάρω"],
"έβγαλα": ["βγάλω"],
"έβγαλες": ["βγάλω"],
"έβγαλε": ["βγάλω"],
"έβγαλαν": ["βγάλω"],
"έφθασα": ["φτάνω"],
"έφθασες": ["φτάνω"],
"έφθασε": ["φτάνω"],
"έφθασαν": ["φτάνω"]
}
}

File diff suppressed because one or more lines are too long

View File

@@ -1,139 +0,0 @@
{
"adj": [
["οί", "ός"],
["ών", "ός"],
["ού", "ός"],
["ή", "ός"],
["ής", "ός"],
["ές", "ός"],
["οι", "ος"],
["ων", "ος"],
["ου", "ος"],
["ο", "ος"],
["α", "ος"],
["ώδη", "ώδες"],
["ύτερη", "ός"],
["ύτερης", "ός"],
["ύτερων", "ός"],
["ύτερος", "ός"],
["ύτερου", "ός"]
],
"noun": [
["ιού", "ί"],
["ιά", "ί"],
["ιών", "ί"],
["ηριού", "ήρι"],
["ια", "ι"],
["ηριών", "ήρι"],
["ας", "α"],
["ες", "α"],
["ων", "α"],
["άς", "ά"],
["ές", "ά"],
["ών", "ά"],
["ής", "ή"],
["ές", "ή"],
["ών", "ή"],
["ές", "ής"],
["ών", "ής"],
["ου", "ο"],
["α", "ο"],
["ων", "ο"],
["ητήματος", "ήτημα"],
["ητήματα", "ήτημα"],
["ητημάτων", "ήτημα"],
["τος", ""],
["τα", "α"],
["ομάτων", "όμα"],
["ού", "ός"],
["οί", "ός"],
["ών", "ός"],
["ς", ""],
["ες", "α"],
["ιών", "ία"],
["α", "ας"],
["δων", ""]
],
"verb": [
["εις", "ω"],
["ει", "ω"],
["ουμε", "ω"],
["ετε", "ω"],
["ουνε", "ω"],
["ουν", "ω"],
["είς", "ώ"],
["εί", "ώ"],
["ούν", "ώ"],
["εσαι", "ομαι"],
["εται", "ομαι"],
["ανόμαστε", "άνομαι"],
["εστε", "ομαι"],
["ονται", "ομαι"],
["άς", "ώ"],
["άει", "ώ"],
["άμε", "ώ"],
["άτε", "ώ"],
["άνε", "ώ"],
["άν", "ώ"],
["άω", "ώ"],
["ώ", "άω"],
["ιζόμουν", "ίζομαι"],
["ιζόσουν", "ίζομαι"],
["ιζόταν", "ίζομαι"],
["ιζόμασταν", "ίζομαι"],
["ιζόσασταν", "ίζομαι"],
["ονταν", "ομαι"],
["όμουν", "άμαι"],
["όσουν", "άμαι"],
["όταν", "άμαι"],
["όμασταν", "άμαι"],
["όσασταν", "άμαι"],
["όντουσταν", "άμαι"],
["ούσα", "ώ"],
["ούσες", "ώ"],
["ούσε", "ώ"],
["ούσαμε", "ώ"],
["ούσατε", "ώ"],
["ούσαν", "ώ"],
["ούσανε", "ώ"],
["λαμε", "ζω"],
["λατε", "ζω"],
["ήρα", "άρω"],
["ήρες", "άρω"],
["ήρε", "άρω"],
["ήραμε", "άρω"],
["ήρατε", "άρω"],
["ήρα", "άρω"],
["ένησα", "ενώ"],
["ένησες", "ενώ"],
["ένησε", "ενώ"],
["ενήσαμε", "ενώ"],
["ένησατε", "ενώ"],
["ένησαν", "ενώ"],
["όνεσα", "ονώ"],
["όνεσες", "ονώ"],
["όνεσε", "ονώ"],
["έσαμε", "ώ"],
["έσατε", "ώ"],
["ισα", "ομαι"],
["ισες", "ομαι"],
["ισε", "ομαι"],
["αθίσαμε", "άθομαι"],
["αθίσατε", "άθομαι"],
["ισαν", "ομαι"],
["άπα", "απώ"],
["ά", "ώ"],
["οντας", "ω"],
["ξω", "ζω"],
["ξεις", "ζω"],
["ξουμε", "ζω"],
["ξετε", "ζω"],
["ξουν", "ζω"]
],
"punct": [
["“", "\""],
["”", "\""],
["", "'"],
["", "'"]
]
}

View File

@@ -32,12 +32,6 @@ class EnglishDefaults(Language.Defaults):
     stop_words = STOP_WORDS
     morph_rules = MORPH_RULES
     syntax_iterators = SYNTAX_ITERATORS
-    resources = {
-        "lemma_lookup": "lemmatizer/lemma_lookup.json",
-        "lemma_rules": "lemmatizer/lemma_rules.json",
-        "lemma_index": "lemmatizer/lemma_index.json",
-        "lemma_exc": "lemmatizer/lemma_exc.json",
-    }
     single_orth_variants = [
         {"tags": ["NFP"], "variants": ["…", "..."]},
         {"tags": [":"], "variants": ["-", "–", "—", "--", "---", "——"]},

View File

@@ -1,31 +0,0 @@
WordNet Release 3.0
This software and database is being provided to you, the LICENSEE, by
Princeton University under the following license. By obtaining, using
and/or copying this software and database, you agree that you have
read, understood, and will comply with these terms and conditions.:
Permission to use, copy, modify and distribute this software and
database and its documentation for any purpose and without fee or
royalty is hereby granted, provided that you agree to comply with
the following copyright notice and statements, including the disclaimer,
and that the same appear on ALL copies of the software, database and
documentation, including modifications that you make for internal
use or for distribution.
WordNet 3.0 Copyright 2006 by Princeton University. All rights reserved.
THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON
UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PRINCETON
UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANT-
ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE
OF THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT
INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR
OTHER RIGHTS.
The name of Princeton University or Princeton may not be used in
advertising or publicity pertaining to distribution of the software
and/or database. Title to copyright in this software, database and
any associated documentation shall at all times remain with
Princeton University and LICENSEE agrees to preserve same.

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

View File

@@ -1,35 +0,0 @@
{
"adj": [
["er", ""],
["est", ""],
["er", "e"],
["est", "e"]
],
"noun": [
["s", ""],
["ses", "s"],
["ves", "f"],
["xes", "x"],
["zes", "z"],
["ches", "ch"],
["shes", "sh"],
["men", "man"],
["ies", "y"]
],
"verb": [
["s", ""],
["ies", "y"],
["es", "e"],
["es", ""],
["ed", "e"],
["ed", ""],
["ing", "e"],
["ing", ""]
],
"punct": [
["“", "\""],
["”", "\""],
["", "'"],
["", "'"]
]
}
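The deleted rule files above all share one shape: per-POS lists of [suffix, replacement] pairs tried against the end of a word form, while the lemma_exc and lemma_lookup files are plain string-to-lemma maps. As an illustration of the data format only, not spaCy's actual lemmatizer code, a pair like ["ies", "y"] applies as follows:

    def apply_rules(form, rules):
        # Collect candidate lemmas from suffix-rewrite rules,
        # e.g. ["ies", "y"] turns "studies" into "study".
        candidates = []
        for old, new in rules:
            if form.endswith(old):
                candidates.append(form[: len(form) - len(old)] + new)
        return candidates

    assert "study" in apply_rules("studies", [["ies", "y"], ["es", "e"]])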

View File

@@ -25,7 +25,6 @@ class SpanishDefaults(Language.Defaults):
     tag_map = TAG_MAP
     stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS
-    resources = {"lemma_lookup": "lemma_lookup.json"}
 
 
 class Spanish(Language):

File diff suppressed because it is too large

View File

@@ -24,12 +24,6 @@ class PersianDefaults(Language.Defaults):
     tag_map = TAG_MAP
     suffixes = TOKENIZER_SUFFIXES
     writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
-    # extracted from Mojgan Seraji's Persian Universal Dependencies Corpus
-    resources = {
-        "lemma_rules": "lemmatizer/lemma_rules.json",
-        "lemma_index": "lemmatizer/lemma_index.json",
-        "lemma_exc": "lemmatizer/lemma_exc.json",
-    }
 
 
 class Persian(Language):

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

View File

@@ -1,41 +0,0 @@
{
"adj": [
["ین", ""],
["‌ترین", ""],
["ترین", ""],
["‌تر", ""],
["تر", ""],
["‌ای", ""]
],
"noun": [
["ایان", "ا"],
["ویان", "و"],
["ایانی", "ا"],
["ویانی", "و"],
["گان", "ه"],
["گانی", "ه"],
["گان", ""],
["گانی", ""],
["ان", ""],
["انی", ""],
["ات", ""],
["ات", "ه"],
["ات", "ت"],
["اتی", ""],
["اتی", "ه"],
["اتی", "ت"],
["ها", ""],
["ها", ""],
["‌های", ""],
["های", ""],
["‌هایی", ""],
["هایی", ""]
],
"verb": [],
"punct": [
["“", "\""],
["”", "\""],
["", "'"],
["", "'"]
]
}

View File

@@ -30,12 +30,6 @@ class FrenchDefaults(Language.Defaults):
     suffixes = TOKENIZER_SUFFIXES
     token_match = TOKEN_MATCH
     syntax_iterators = SYNTAX_ITERATORS
-    resources = {
-        "lemma_rules": "lemmatizer/lemma_rules.json",
-        "lemma_index": "lemmatizer/lemma_index.json",
-        "lemma_exc": "lemmatizer/lemma_exc.json",
-        "lemma_lookup": "lemmatizer/lemma_lookup.json",
-    }
 
     @classmethod
     def create_lemmatizer(cls, nlp=None, lookups=None):

View File

@@ -1,9 +1,9 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ....symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
-from ....symbols import SCONJ, CCONJ
-from ....symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
+from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
+from ...symbols import SCONJ, CCONJ
+from ...symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
 
 
 class FrenchLemmatizer(object):

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

View File

@@ -1,126 +0,0 @@
{
"adj": [
["a", "a"],
["aux", "al"],
["c", "c"],
["d", "d"],
["e", ""],
["é", "é"],
["eux", "eux"],
["f", "f"],
["i", "i"],
["ï", "ï"],
["l", "l"],
["m", "m"],
["n", "n"],
["o", "o"],
["p", "p"],
["r", "r"],
["s", ""],
["t", "t"],
["u", "u"],
["y", "y"]
],
"noun": [
["a", "a"],
["à", "à"],
["â", "â"],
["b", "b"],
["c", "c"],
["ç", "ç"],
["d", "d"],
["e", "e"],
["é", "é"],
["è", "è"],
["ê", "ê"],
["ë", "ë"],
["f", "f"],
["g", "g"],
["h", "h"],
["i", "i"],
["î", "î"],
["ï", "ï"],
["j", "j"],
["k", "k"],
["l", "l"],
["m", "m"],
["n", "n"],
["o", "o"],
["ô", "ö"],
["ö", "ö"],
["p", "p"],
["q", "q"],
["r", "r"],
["t", "t"],
["u", "u"],
["û", "û"],
["v", "v"],
["w", "w"],
["y", "y"],
["z", "z"],
["s", ""],
["x", ""],
["nt(e", "nt"],
["nt(e)", "nt"],
["al(e", "ale"],
["é(", "é"],
["é(e", "é"],
["é.e", "é"],
["el(le", "el"],
["eurs(rices", "eur"],
["eur(rice", "eur"],
["eux(se", "eux"],
["ial(e", "ial"],
["er(ère", "er"],
["eur(se", "eur"],
["teur(trice", "teur"],
["teurs(trices", "teur"]
],
"verb": [
["é", "er"],
["és", "er"],
["ée", "er"],
["ées", "er"],
["é", "er"],
["es", "er"],
["ons", "er"],
["ez", "er"],
["ent", "er"],
["ais", "er"],
["ait", "er"],
["ions", "er"],
["iez", "er"],
["aient", "er"],
["ai", "er"],
["as", "er"],
["a", "er"],
["âmes", "er"],
["âtes", "er"],
["èrent", "er"],
["erai", "er"],
["eras", "er"],
["era", "er"],
["erons", "er"],
["erez", "er"],
["eront", "er"],
["erais", "er"],
["erait", "er"],
["erions", "er"],
["eriez", "er"],
["eraient", "er"],
["asse", "er"],
["asses", "er"],
["ât", "er"],
["assions", "er"],
["assiez", "er"],
["assent", "er"],
["ant", "er"],
["ante", "er"],
["ants", "er"],
["antes", "er"],
["u(er", "u"],
["és(ées", "er"],
["é()e", "er"],
["é()", "er"]
]
}

View File

@@ -18,7 +18,6 @@ class CroatianDefaults(Language.Defaults):
     )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
     stop_words = STOP_WORDS
-    resources = {"lemma_lookup": "lemma_lookup.json"}
 
 
 class Croatian(Language):

File diff suppressed because it is too large

View File

@@ -24,7 +24,6 @@ class HungarianDefaults(Language.Defaults):
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
     token_match = TOKEN_MATCH
-    resources = {"lemma_lookup": "lemma_lookup.json"}
 
 
 class Hungarian(Language):

File diff suppressed because it is too large

View File

@@ -30,7 +30,6 @@ class IndonesianDefaults(Language.Defaults):
     infixes = TOKENIZER_INFIXES
     syntax_iterators = SYNTAX_ITERATORS
     tag_map = TAG_MAP
-    resources = {"lemma_lookup": "lemma_lookup.json"}
 
 
 class Indonesian(Language):

File diff suppressed because it is too large

View File

@@ -23,7 +23,6 @@ class ItalianDefaults(Language.Defaults):
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
     infixes = TOKENIZER_INFIXES
-    resources = {"lemma_lookup": "lemma_lookup.json"}
 
 
 class Italian(Language):

File diff suppressed because it is too large

View File

@@ -30,7 +30,6 @@ class LithuanianDefaults(Language.Defaults):
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
     morph_rules = MORPH_RULES
-    resources = {"lemma_lookup": "lemma_lookup.json"}
 
 
 class Lithuanian(Language):

File diff suppressed because it is too large

View File

@@ -25,11 +25,6 @@ class NorwegianDefaults(Language.Defaults):
     morph_rules = MORPH_RULES
     tag_map = TAG_MAP
     syntax_iterators = SYNTAX_ITERATORS
-    resources = {
-        "lemma_lookup": "lemmatizer/lemma_lookup.json",
-        "lemma_exc": "lemmatizer/lemma_exc.json",
-        "lemma_rules": "lemmatizer/lemma_rules.json",
-    }
 
 
 class Norwegian(Language):

View File

@@ -1,7 +0,0 @@
Note on noun wordforms / lemmas:
All wordforms are extracted from Norsk Ordbank in Norwegian Bokmål 2005, updated 20180627
(CLARINO NB - Språkbanken), Nasjonalbiblioteket, Norway:
https://www.nb.no/sprakbanken/show?serial=oai%3Anb.no%3Asbr-5&lang=en
License:
Creative_Commons-BY (CC-BY) (https://creativecommons.org/licenses/by/4.0/)

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -1,24 +0,0 @@
{
"adj": [
["e", ""],
["ere", ""],
["est", ""],
["este", ""]
],
"noun": [
["en", "e"],
["a", "e"],
["et", ""],
["er", "e"],
["ene", "e"]
],
"verb": [
["er", "e"],
["et", "e"],
["a", "e"],
["es", "e"],
["te", "e"],
["år", "å"]
],
"punct": []
}

View File

@@ -26,12 +26,6 @@ class DutchDefaults(Language.Defaults):
     tag_map = TAG_MAP
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
-    resources = {
-        "lemma_rules": "lemmatizer/lemma_rules.json",
-        "lemma_index": "lemmatizer/lemma_index.json",
-        "lemma_exc": "lemmatizer/lemma_exc.json",
-        "lemma_lookup": "lemmatizer/lemma_lookup.json",
-    }
 
     @classmethod
     def create_lemmatizer(cls, nlp=None, lookups=None):

View File

@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ....symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV
+from ...symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV
 
 
 class DutchLemmatizer(object):

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

View File

@@ -1,55 +0,0 @@
{
"adj": [
["sten", ""],
["ende", "end"],
["ste", ""],
["st", ""],
["er", ""],
["en", ""],
["e", ""]
],
"noun": [
["heden", "heid"],
["elen", "eel"],
["ezen", "ees"],
["even", "eef"],
["ssen", "s"],
["rren", "r"],
["kken", "k"],
["bben", "b"],
["'er", ""],
["tje", ""],
["kje", ""],
["ici", "icus"],
["en", ""],
["ën", ""],
["'s", ""],
["s", ""]
],
"verb": [
["dden", "den"],
["tten", "ten"],
["dde", "den"],
["tte", "ten"],
["end", "en"],
["dt", "den"],
["de", "en"],
["te", "en"]
],
"num": [
["sten", ""],
["tjes", ""],
["ste", ""],
["ën", ""],
["en", ""],
["de", ""],
["er", ""],
["ër", ""]
],
"punct": [
["“", "\""],
["”", "\""],
["", "'"],
["", "'"]
]
}

View File

@@ -27,7 +27,6 @@ class PortugueseDefaults(Language.Defaults):
     tag_map = TAG_MAP
     infixes = TOKENIZER_INFIXES
     prefixes = TOKENIZER_PREFIXES
-    resources = {"lemma_lookup": "lemma_lookup.json"}
 
 
 class Portuguese(Language):

File diff suppressed because it is too large

View File

@@ -24,7 +24,6 @@ class RomanianDefaults(Language.Defaults):
     )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
-    resources = {"lemma_lookup": "lemma_lookup.json"}
     tag_map = TAG_MAP

File diff suppressed because it is too large

View File

@@ -21,7 +21,6 @@ class SerbianDefaults(Language.Defaults):
     )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
-    resources = {"lemma_lookup": "lemma_lookup.json"}
 
 
 class Serbian(Language):

File diff suppressed because it is too large

View File

@@ -29,10 +29,6 @@ class SwedishDefaults(Language.Defaults):
     suffixes = TOKENIZER_SUFFIXES
     stop_words = STOP_WORDS
     morph_rules = MORPH_RULES
-    resources = {
-        "lemma_lookup": "lemmatizer/lemma_lookup.json",
-        "lemma_rules": "lemmatizer/lemma_rules.json",
-    }
 
 
 class Swedish(Language):

File diff suppressed because it is too large

View File

@@ -1,103 +0,0 @@
{
"noun": [
["t", ""],
["n", ""],
["na", ""],
["na", "e"],
["or", "a"],
["orna", "a"],
["et", ""],
["en", ""],
["en", "e"],
["er", ""],
["erna", ""],
["ar", "e"],
["ar", ""],
["lar", "el"],
["arna", "e"],
["arna", ""],
["larna", "el"]
],
"verb": [
["r", ""],
["de", ""],
["t", ""],
["er", ""],
["te", ""],
["a", ""],
["e", ""],
["t", "d"],
["tt", "d"],
["tt", ""],
["ev", "iv"],
["ack", "ick"],
["ög", "yg"],
["it", ""],
["uckit", "ick"],
["ugit", "yg"],
["it", "et"],
["id", "ed"],
["ip", "ep"],
["iv", "ev"],
["in", "en"],
["ik", "ek"],
["ig", "eg"],
["ind", ""],
["inn", "ann"],
["nder", "nd"],
["inner", "inn"],
["and", "ind"],
["ann", "inn"],
["s", ""],
["anns", "inn"],
["undit", "ind"],
["unnit", "inn"],
["unnits", "inn"],
["uppit", "ipp"],
["ungit", "ing"],
["öd", "ud"],
["öt", "jut"],
["öt", "ut"],
["ög", "ug"],
["ögg", "ugg"],
["öng", "ung"],
["önk", "unk"],
["öt", "yt"],
["utit", "yt"],
["ös", "ys"],
["öv", "yv"],
["uvit", "yv"],
["öp", "yp"],
["upit", "yp"],
["ök", "yk"],
["ukit", "yk"],
["or", "ar"],
["öll", "all"],
["ät", "åt"],
["öll", "åll"],
["or", "är"],
["urit", "är"],
["åt", "ät"],
["ar", "är"],
["alt", "ält"],
["ultit", "ält"]
],
"adj": [
["are", ""],
["ast", ""],
["re", ""],
["st", ""],
["ägre", "åg"],
["ägst", "åg"],
["ängre", "ång"],
["ängst", "ång"],
["örre", "or"],
["örst", "or"]
],
"punct": [
["“", "\""],
["”", "\""],
["", "'"],
["", "'"]
]
}

View File

@@ -24,7 +24,6 @@ class TagalogDefaults(Language.Defaults):
     lex_attr_getters.update(LEX_ATTRS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
-    resources = {"lemma_lookup": "lemma_lookup.json"}
 
 
 class Tagalog(Language):

View File

@@ -1,9 +0,0 @@
{
"kaugnayan": "ugnay",
"sangkatauhan": "tao",
"kanayunan": "nayon",
"pandaigdigan": "daigdig",
"kasaysayan": "saysay",
"kabayanihan": "bayani",
"karuwagan": "duwag"
}

View File

@@ -10,9 +10,6 @@ from ...language import Language
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups
 
-# Lemma data source:
-# http://st2.zargan.com/duyuru/Zargan_Linguistic_Resources_for_Turkish.html - Bilgin, O. (2016). Biçimbilimsel Bakımdan Karmaşık Türkçe Kelimelerin İşlenmesinde Frekans Etkileri (yayınlanmamış yüksek lisans tezi). Boğaziçi Üniversitesi, İstanbul. Erişim: http://st2.zargan.com/public/resources/turkish/frequency_effects_in_turkish.pdf
-
 
 class TurkishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
@@ -22,7 +19,6 @@ class TurkishDefaults(Language.Defaults):
     )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
-    resources = {"lemma_lookup": "lemma_lookup.json"}
 
 
 class Turkish(Language):

File diff suppressed because it is too large

View File

@@ -21,7 +21,6 @@ class UrduDefaults(Language.Defaults):
     stop_words = STOP_WORDS
     suffixes = TOKENIZER_SUFFIXES
     writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
-    resources = {"lemma_lookup": "lemma_lookup.json"}
 
 
 class Urdu(Language):

File diff suppressed because it is too large

View File

@@ -25,7 +25,7 @@ from .compat import izip, basestring_
 from .gold import GoldParse
 from .scorer import Scorer
 from ._ml import link_vectors_to_models, create_default_optimizer
-from .attrs import IS_STOP
+from .attrs import IS_STOP, LANG
 from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .lang.punctuation import TOKENIZER_INFIXES
 from .lang.tokenizer_exceptions import TOKEN_MATCH
@@ -46,10 +46,15 @@ class BaseDefaults(object):
     @classmethod
     def create_lookups(cls, nlp=None):
-        root_path = util.get_module_path(cls)
+        root = util.get_module_path(cls)
+        filenames = {name: root / filename for name, filename in cls.resources}
+        if LANG in cls.lex_attr_getters:
+            lang = cls.lex_attr_getters[LANG](None)
+            user_lookups = util.get_entry_point(util.ENTRY_POINTS.lookups, lang, {})
+            filenames.update(user_lookups)
         lookups = Lookups()
-        for name, filename in cls.resources.items():
-            data = util.load_language_data(root_path / filename)
+        for name, filename in filenames.items():
+            data = util.load_language_data(filename)
             lookups.add_table(name, data)
         return lookups
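Tables are now assembled from two sources: the filenames still declared on `cls.resources`, plus whatever an installed package registers under the `spacy_lookups` entry point for this language code. Judging by the `filenames.update(user_lookups)` call above, the entry point value is expected to be a dict of table name to file path. A sketch of the registering side, with hypothetical names; the real provider is the spacy_lookups_data package:

    # setup.py of a hypothetical lookups package
    entry_points={"spacy_lookups": ["en = my_lookups:EN_TABLES"]}

    # my_lookups/__init__.py: table name -> path to a JSON file,
    # merged into `filenames` by create_lookups() above.
    from pathlib import Path

    EN_TABLES = {"lemma_lookup": Path(__file__).parent / "en_lemma_lookup.json"}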
@@ -168,7 +173,7 @@ class Language(object):
             100,000 characters in one text.
         RETURNS (Language): The newly constructed object.
         """
-        user_factories = util.get_entry_points("spacy_factories")
+        user_factories = util.get_entry_points(util.ENTRY_POINTS.factories)
         self.factories.update(user_factories)
         self._meta = dict(meta)
         self._path = None

View File

@@ -140,13 +140,6 @@ def lt_tokenizer():
     return get_lang_class("lt").Defaults.create_tokenizer()
 
 
-@pytest.fixture(scope="session")
-def lt_lemmatizer():
-    lang_cls = get_lang_class("lt")
-    lookups = lang_cls.Defaults.create_lookups()
-    return lang_cls.Defaults.create_lemmatizer(lookups=lookups)
-
-
 @pytest.fixture(scope="session")
 def nb_tokenizer():
     return get_lang_class("nb").Defaults.create_tokenizer()
@@ -157,13 +150,6 @@ def nl_tokenizer():
     return get_lang_class("nl").Defaults.create_tokenizer()
 
 
-@pytest.fixture
-def nl_lemmatizer(scope="session"):
-    lang_cls = get_lang_class("nl")
-    lookups = lang_cls.Defaults.create_lookups()
-    return lang_cls.Defaults.create_lemmatizer(lookups=lookups)
-
-
 @pytest.fixture(scope="session")
 def pl_tokenizer():
     return get_lang_class("pl").Defaults.create_tokenizer()

View File

@@ -1,18 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize(
"string,lemma",
[
("affaldsgruppernes", "affaldsgruppe"),
("detailhandelsstrukturernes", "detailhandelsstruktur"),
("kolesterols", "kolesterol"),
("åsyns", "åsyn"),
],
)
def test_da_lemmatizer_lookup_assigns(da_tokenizer, string, lemma):
tokens = da_tokenizer(string)
assert tokens[0].lemma_ == lemma

View File

@@ -1,20 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize(
"string,lemma",
[
("Abgehängten", "Abgehängte"),
("engagierte", "engagieren"),
("schließt", "schließen"),
("vorgebenden", "vorgebend"),
("die", "der"),
("Die", "der"),
],
)
def test_de_lemmatizer_lookup_assigns(de_tokenizer, string, lemma):
tokens = de_tokenizer(string)
assert tokens[0].lemma_ == lemma

View File

@@ -124,9 +124,3 @@ def test_en_tokenizer_norm_exceptions(en_tokenizer, text, norms):
 def test_en_lex_attrs_norm_exceptions(en_tokenizer, text, norm):
     tokens = en_tokenizer(text)
     assert tokens[0].norm_ == norm
-
-
-@pytest.mark.parametrize("text", ["faster", "fastest", "better", "best"])
-def test_en_lemmatizer_handles_irreg_adverbs(en_tokenizer, text):
-    tokens = en_tokenizer(text)
-    assert tokens[0].lemma_ in ["fast", "well"]

View File

@@ -1,30 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
def test_fr_lemmatizer_verb(fr_tokenizer):
tokens = fr_tokenizer("Qu'est-ce que tu fais?")
assert tokens[0].lemma_ == "que"
assert tokens[1].lemma_ == "être"
assert tokens[5].lemma_ == "faire"
def test_fr_lemmatizer_noun_verb_2(fr_tokenizer):
tokens = fr_tokenizer("Les abaissements de température sont gênants.")
assert tokens[4].lemma_ == "être"
@pytest.mark.xfail(
reason="Costaricienne TAG is PROPN instead of NOUN and spacy don't lemmatize PROPN"
)
def test_fr_lemmatizer_noun(fr_tokenizer):
tokens = fr_tokenizer("il y a des Costaricienne.")
assert tokens[4].lemma_ == "Costaricain"
def test_fr_lemmatizer_noun_2(fr_tokenizer):
tokens = fr_tokenizer("Les abaissements de température sont gênants.")
assert tokens[1].lemma_ == "abaissement"
assert tokens[5].lemma_ == "gênant"

View File

@@ -1,20 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize(
"string,lemma",
[
("trčao", "trčati"),
("adekvatnim", "adekvatan"),
("dekontaminacijama", "dekontaminacija"),
("filologovih", "filologov"),
("je", "biti"),
("se", "sebe"),
],
)
def test_hr_lemmatizer_lookup_assigns(hr_tokenizer, string, lemma):
tokens = hr_tokenizer(string)
assert tokens[0].lemma_ == lemma

View File

@@ -1,20 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
# fmt: off
TEST_CASES = [
(["Galime", "vadinti", "gerovės", "valstybe", ",", "turime", "išvystytą", "socialinę", "apsaugą", ",",
"sveikatos", "apsaugą", "ir", "prieinamą", "švietimą", "."],
["galėti", "vadintas", "gerovė", "valstybė", ",", "turėti", "išvystytas", "socialinis",
"apsauga", ",", "sveikata", "apsauga", "ir", "prieinamas", "švietimas", "."]),
(["taip", ",", "uoliai", "tyrinėjau", "ir", "pasirinkau", "geriausią", "variantą", "."],
["taip", ",", "uolus", "tyrinėti", "ir", "pasirinkti", "geras", "variantas", "."])
]
# fmt: on
@pytest.mark.parametrize("tokens,lemmas", TEST_CASES)
def test_lt_lemmatizer(lt_lemmatizer, tokens, lemmas):
assert lemmas == [lt_lemmatizer.lookup_table.get(token, token) for token in tokens]

View File

@@ -1,143 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
# Calling the Lemmatizer directly
# Imitates behavior of:
# Tagger.set_annotations()
# -> vocab.morphology.assign_tag_id()
# -> vocab.morphology.assign_tag_id()
# -> Token.tag.__set__
# -> vocab.morphology.assign_tag(...)
# -> ... -> Morphology.assign_tag(...)
# -> self.lemmatize(analysis.tag.pos, token.lex.orth,
noun_irreg_lemmatization_cases = [
("volkeren", "volk"),
("vaatje", "vat"),
("verboden", "verbod"),
("ijsje", "ijsje"),
("slagen", "slag"),
("verdragen", "verdrag"),
("verloven", "verlof"),
("gebeden", "gebed"),
("gaten", "gat"),
("staven", "staf"),
("aquariums", "aquarium"),
("podia", "podium"),
("holen", "hol"),
("lammeren", "lam"),
("bevelen", "bevel"),
("wegen", "weg"),
("moeilijkheden", "moeilijkheid"),
("aanwezigheden", "aanwezigheid"),
("goden", "god"),
("loten", "lot"),
("kaarsen", "kaars"),
("leden", "lid"),
("glaasje", "glas"),
("eieren", "ei"),
("vatten", "vat"),
("kalveren", "kalf"),
("padden", "pad"),
("smeden", "smid"),
("genen", "gen"),
("beenderen", "been"),
]
verb_irreg_lemmatization_cases = [
("liep", "lopen"),
("hief", "heffen"),
("begon", "beginnen"),
("sla", "slaan"),
("aangekomen", "aankomen"),
("sproot", "spruiten"),
("waart", "zijn"),
("snoof", "snuiven"),
("spoot", "spuiten"),
("ontbeet", "ontbijten"),
("gehouwen", "houwen"),
("afgewassen", "afwassen"),
("deed", "doen"),
("schoven", "schuiven"),
("gelogen", "liegen"),
("woog", "wegen"),
("gebraden", "braden"),
("smolten", "smelten"),
("riep", "roepen"),
("aangedaan", "aandoen"),
("vermeden", "vermijden"),
("stootten", "stoten"),
("ging", "gaan"),
("geschoren", "scheren"),
("gesponnen", "spinnen"),
("reden", "rijden"),
("zochten", "zoeken"),
("leed", "lijden"),
("verzonnen", "verzinnen"),
]
@pytest.mark.parametrize("text,lemma", noun_irreg_lemmatization_cases)
def test_nl_lemmatizer_noun_lemmas_irreg(nl_lemmatizer, text, lemma):
pos = "noun"
lemmas_pred = nl_lemmatizer(text, pos)
assert lemma == sorted(lemmas_pred)[0]
@pytest.mark.parametrize("text,lemma", verb_irreg_lemmatization_cases)
def test_nl_lemmatizer_verb_lemmas_irreg(nl_lemmatizer, text, lemma):
pos = "verb"
lemmas_pred = nl_lemmatizer(text, pos)
assert lemma == sorted(lemmas_pred)[0]
@pytest.mark.skip
@pytest.mark.parametrize("text,lemma", [])
def test_nl_lemmatizer_verb_lemmas_reg(nl_lemmatizer, text, lemma):
# TODO: add test
pass
@pytest.mark.skip
@pytest.mark.parametrize("text,lemma", [])
def test_nl_lemmatizer_adjective_lemmas(nl_lemmatizer, text, lemma):
# TODO: add test
pass
@pytest.mark.skip
@pytest.mark.parametrize("text,lemma", [])
def test_nl_lemmatizer_determiner_lemmas(nl_lemmatizer, text, lemma):
# TODO: add test
pass
@pytest.mark.skip
@pytest.mark.parametrize("text,lemma", [])
def test_nl_lemmatizer_adverb_lemmas(nl_lemmatizer, text, lemma):
# TODO: add test
pass
@pytest.mark.parametrize("text,lemma", [])
def test_nl_lemmatizer_pronoun_lemmas(nl_lemmatizer, text, lemma):
# TODO: add test
pass
# Using the lemma lookup table only
@pytest.mark.parametrize("text,lemma", noun_irreg_lemmatization_cases)
def test_nl_lemmatizer_lookup_noun(nl_lemmatizer, text, lemma):
lemma_pred = nl_lemmatizer.lookup(text)
assert lemma_pred in (lemma, text)
@pytest.mark.parametrize("text,lemma", verb_irreg_lemmatization_cases)
def test_nl_lemmatizer_lookup_verb(nl_lemmatizer, text, lemma):
lemma_pred = nl_lemmatizer.lookup(text)
assert lemma_pred in (lemma, text)

View File

@@ -1,18 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize(
"string,lemma",
[
("câini", "câine"),
("expedițiilor", "expediție"),
("pensete", "pensetă"),
("erau", "fi"),
],
)
def test_ro_lemmatizer_lookup_assigns(ro_tokenizer, string, lemma):
tokens = ro_tokenizer(string)
assert tokens[0].lemma_ == lemma

View File

@@ -1,20 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize(
"string,lemma",
[
("најадекватнији", "адекватан"),
("матурирао", "матурирати"),
("планираћемо", "планирати"),
("певају", "певати"),
("нама", "ми"),
("се", "себе"),
],
)
def test_sr_lemmatizer_lookup_assigns(sr_tokenizer, string, lemma):
tokens = sr_tokenizer(string)
assert tokens[0].lemma_ == lemma

View File

@@ -1,20 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize(
"string,lemma",
[
("DNA-profilernas", "DNA-profil"),
("Elfenbenskustens", "Elfenbenskusten"),
("abortmotståndarens", "abortmotståndare"),
("kolesterols", "kolesterol"),
("portionssnusernas", "portionssnus"),
("åsyns", "åsyn"),
],
)
def test_lemmatizer_lookup_assigns(sv_tokenizer, string, lemma):
tokens = sv_tokenizer(string)
assert tokens[0].lemma_ == lemma

View File

@@ -1,21 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize(
"string,lemma",
[
("evlerimizdeki", "ev"),
("işlerimizi", ""),
("biran", "biran"),
("bitirmeliyiz", "bitir"),
("isteklerimizi", "istek"),
("karşılaştırmamızın", "karşılaştır"),
("çoğulculuktan", "çoğulcu"),
],
)
def test_tr_lemmatizer_lookup_assigns(tr_tokenizer, string, lemma):
tokens = tr_tokenizer(string)
assert tokens[0].lemma_ == lemma

View File

@@ -1,15 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
from ..util import get_doc
def test_issue4104(en_vocab):
"""Test that English lookup lemmatization of spun & dry are correct
expected mapping = {'dry': 'dry', 'spun': 'spin', 'spun-dry': 'spin-dry'}
"""
text = "dry spun spun-dry"
doc = get_doc(en_vocab, [t for t in text.split(" ")])
# using a simple list to preserve order
expected = ["dry", "spin", "spin-dry"]
assert [token.lemma_ for token in doc] == expected

View File

@@ -37,6 +37,15 @@ _data_path = Path(__file__).parent / "data"
 _PRINT_ENV = False
 
 
+class ENTRY_POINTS(object):
+    """Available entry points to register extensions."""
+
+    factories = "spacy_factories"
+    languages = "spacy_languages"
+    displacy_colors = "spacy_displacy_colors"
+    lookups = "spacy_lookups"
+
+
 def set_env_log(value):
     global _PRINT_ENV
     _PRINT_ENV = value
@@ -62,7 +71,7 @@ def get_lang_class(lang):
     """
     global LANGUAGES
     # Check if an entry point is exposed for the language code
-    entry_point = get_entry_point("spacy_languages", lang)
+    entry_point = get_entry_point(ENTRY_POINTS.languages, lang)
     if entry_point is not None:
         LANGUAGES[lang] = entry_point
         return entry_point
@@ -278,17 +287,19 @@ def get_entry_points(key):
     return result
 
 
-def get_entry_point(key, value):
+def get_entry_point(key, value, default=None):
     """Check if registered entry point is available for a given name and
     load it. Otherwise, return None.
 
     key (unicode): Entry point name.
     value (unicode): Name of entry point to load.
+    default: Optional default value to return.
     RETURNS: The loaded entry point or None.
     """
     for entry_point in pkg_resources.iter_entry_points(key):
         if entry_point.name == value:
             return entry_point.load()
+    return default
 
 
 def is_in_jupyter():
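Together, the `ENTRY_POINTS` constants and the new `default` argument keep call sites short. A usage sketch of the updated helper (the "xx" and "en" values are illustrations only):

    from spacy import util

    # Returns the loaded entry point, or None if no package registers one:
    lang_cls = util.get_entry_point(util.ENTRY_POINTS.languages, "xx")

    # With the new default, missing lookups fall back to an empty dict,
    # matching the call in BaseDefaults.create_lookups():
    user_lookups = util.get_entry_point(util.ENTRY_POINTS.lookups, "en", {})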