Tidy up and move noun_chunks, token_match, url_match

Ines Montani 2020-07-22 22:18:46 +02:00
parent 7fc4dadd22
commit b507f61629
82 changed files with 373 additions and 4899 deletions
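
A rough sketch of the pattern the per-language diffs below repeat, written for a hypothetical language code "zz" (assumes this development branch; the "zz" names are made up for illustration — url_match isn't repeated here because the base config in the first hunk already points it at the shared "spacy.xx.url_match" from the new spacy/lang/defaults.py). The noun chunker and token_match become registered language-data functions referenced from the default config, and the merge with BASE_EXCEPTIONS moves from each language's Defaults into its tokenizer_exceptions module:

from typing import Callable, Pattern
import re

from spacy.util import registry, update_exc
from spacy.lang.tokenizer_exceptions import BASE_EXCEPTIONS
from spacy.symbols import ORTH


def noun_chunks(doclike):
    # The language's syntax iterator, as in spacy/lang/zz/syntax_iterators.py
    ...


# The BASE_EXCEPTIONS merge now happens in the language module itself
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, {"zz.": [{ORTH: "zz."}]})

# Hypothetical language-specific pattern kept as a single token
TOKEN_MATCH = re.compile(r"^\d+-\d+$").match


@registry.language_data("spacy.zz.get_noun_chunks")
def get_noun_chunks() -> Callable:
    return noun_chunks


@registry.language_data("spacy.zz.token_match")
def token_match() -> Pattern:
    return TOKEN_MATCH


DEFAULT_CONFIG = """
[nlp]
lang = "zz"
get_noun_chunks = {"@language_data": "spacy.zz.get_noun_chunks"}

[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"
token_match = {"@language_data": "spacy.zz.token_match"}
"""

Language.from_config and Vocab.from_config (last hunks of this diff) resolve these references with registry.make_from_config, so the callables end up on the shared vocab and the tokenizer instead of on the Defaults class.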

View File

@ -3,10 +3,13 @@ lang = null
stop_words = []
lex_attr_getters = {}
vocab_data = {}
get_noun_chunks = null
pipeline = []
[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"
token_match = null
url_match = {"@language_data": "spacy.xx.url_match"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

View File

@ -4,11 +4,9 @@ from thinc.api import Config
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry
from ...util import registry
DEFAULT_CONFIG = """
@ -35,7 +33,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
class ArabicDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
suffixes = TOKENIZER_SUFFIXES

View File

@ -1,4 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA
from ...util import update_exc
_exc = {}
@ -43,4 +45,4 @@ for exc_data in [
for exc_data in [{LEMMA: "تلفون", ORTH: "ت."}, {LEMMA: "صندوق بريد", ORTH: "ص.ب"}]:
_exc[exc_data[ORTH]] = [exc_data]
TOKENIZER_EXCEPTIONS = _exc
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -4,9 +4,8 @@ from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry
from ...util import registry
DEFAULT_CONFIG = """
@ -30,7 +29,7 @@ def stop_words() -> Set[str]:
class BengaliDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES

View File

@ -1,4 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA
from ...util import update_exc
_exc = {}
@ -21,4 +23,4 @@ for exc_data in [
_exc[exc_data[ORTH]] = [exc_data]
TOKENIZER_EXCEPTIONS = _exc
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -4,9 +4,8 @@ from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry
from ...util import registry
from .punctuation import TOKENIZER_INFIXES
@ -37,7 +36,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
class CatalanDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES

View File

@ -1,4 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA
from ...util import update_exc
_exc = {}
@ -35,4 +37,4 @@ for h in range(1, 12 + 1):
_exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "p.m."}]
TOKENIZER_EXCEPTIONS = _exc
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -5,9 +5,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry
from ...util import registry
DEFAULT_CONFIG = """
@ -42,7 +41,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
class DanishDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES

View File

@ -2,7 +2,9 @@
Tokenizer Exceptions.
Source: https://forkortelse.dk/ and various others.
"""
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, NORM
from ...util import update_exc
_exc = {}
@ -576,4 +578,4 @@ for h in range(1, 31 + 1):
_custom_base_exc = {"i.": [{ORTH: "i", LEMMA: "i", NORM: "i"}, {ORTH: "."}]}
_exc.update(_custom_base_exc)
TOKENIZER_EXCEPTIONS = _exc
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -1,20 +1,20 @@
from typing import Set
from typing import Set, Callable
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .syntax_iterators import noun_chunks
from ...language import Language
from ...util import update_exc, registry
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "de"
stop_words = {"@language_data": "spacy.de.stop_words"}
get_noun_chunks = {"@language_data": "spacy.de.get_noun_chunks"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
@ -36,12 +36,16 @@ def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.de.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
class GermanDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
syntax_iterators = SYNTAX_ITERATORS
class German(Language):

View File

@ -1,39 +1,26 @@
from typing import Union, Iterator
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
from ...tokens import Doc, Span
def noun_chunks(doclike):
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
# this iterator extracts spans headed by NOUNs starting from the left-most
# syntactic dependent until the NOUN itself for close apposition and
# measurement construction, the span is sometimes extended to the right of
# the NOUN. Example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee"
# and not just "eine Tasse", same for "das Thema Familie".
labels = [
"sb",
"oa",
"da",
"nk",
"mo",
"ag",
"ROOT",
"root",
"cj",
"pd",
"og",
"app",
]
# fmt: off
labels = ["sb", "oa", "da", "nk", "mo", "ag", "ROOT", "root", "cj", "pd", "og", "app"]
# fmt: on
doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed:
raise ValueError(Errors.E029)
np_label = doc.vocab.strings.add("NP")
np_deps = set(doc.vocab.strings.add(label) for label in labels)
close_app = doc.vocab.strings.add("nk")
rbracket = 0
for i, word in enumerate(doclike):
if i < rbracket:

View File

@ -1,4 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
from ...util import update_exc
_exc = {
@ -254,4 +256,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

spacy/lang/defaults.py (new file, 9 lines)
View File

@ -0,0 +1,9 @@
from typing import Pattern
from .tokenizer_exceptions import URL_MATCH
from ..util import registry
@registry.language_data("spacy.xx.url_match")
def url_match() -> Pattern:
return URL_MATCH
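
The base config in the first hunk points [nlp.tokenizer] url_match at this shared function, so every language that doesn't override it picks up the same URL matcher. A hedged sketch of how such a reference resolves, using the same registry.make_from_config call the Vocab hunk at the end of this diff relies on (assumes this branch):

from spacy.util import registry
from spacy.lang import defaults  # noqa: F401 - importing registers "spacy.xx.url_match"

cfg = {"url_match": {"@language_data": "spacy.xx.url_match"}}
url_match = registry.make_from_config(cfg)["url_match"]
# url_match should now be spacy.lang.tokenizer_exceptions.URL_MATCH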

View File

@ -5,11 +5,10 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import GreekLemmatizer
from .syntax_iterators import SYNTAX_ITERATORS
from .syntax_iterators import noun_chunks
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry
from ...util import registry
DEFAULT_CONFIG = """
@ -17,6 +16,7 @@ DEFAULT_CONFIG = """
lang = "el"
stop_words = {"@language_data": "spacy.el.stop_words"}
lex_attr_getters = {"@language_data": "spacy.el.lex_attr_getters"}
get_noun_chunks = {"@language_data": "spacy.el.get_noun_chunks"}
[nlp.lemmatizer]
@lemmatizers = "spacy.GreekLemmatizer.v1"
@ -38,6 +38,11 @@ def create_greek_lemmatizer(data: Dict[str, dict] = {}) -> GreekLemmatizer:
return GreekLemmatizer(data=data)
@registry.language_data("spacy.el.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
@registry.language_data("spacy.el.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@ -49,11 +54,10 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
class GreekDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
syntax_iterators = SYNTAX_ITERATORS
class Greek(Language):

View File

@ -1,21 +1,20 @@
from typing import Union, Iterator
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
from ...tokens import Doc, Span
def noun_chunks(doclike):
"""
Detect base noun phrases. Works on both Doc and Span.
"""
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
# It follows the logic of the noun chunks finder of English language,
# adjusted to some Greek language special characteristics.
# obj tag corrects some DEP tagger mistakes.
# Further improvement of the models will eliminate the need for this tag.
labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"]
doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed:
raise ValueError(Errors.E029)
np_deps = [doc.vocab.strings.add(label) for label in labels]
conj = doc.vocab.strings.add("conj")
nmod = doc.vocab.strings.add("nmod")

File diff suppressed because it is too large.

View File

@ -1,5 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, NORM
from ...util import update_exc
_exc = {}
@ -392,4 +393,4 @@ for orth in [
]:
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -4,13 +4,12 @@ from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from .syntax_iterators import noun_chunks
from .lemmatizer import is_base_form
from .punctuation import TOKENIZER_INFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...lemmatizer import Lemmatizer
from ...util import update_exc, registry
from ...util import registry
DEFAULT_CONFIG = """
@ -18,6 +17,7 @@ DEFAULT_CONFIG = """
lang = "en"
stop_words = {"@language_data": "spacy.en.stop_words"}
lex_attr_getters = {"@language_data": "spacy.en.lex_attr_getters"}
get_noun_chunks = {"@language_data": "spacy.en.get_noun_chunks"}
[nlp.lemmatizer]
@lemmatizers = "spacy.EnglishLemmatizer.v1"
@ -49,9 +49,13 @@ def create_lemmatizer(data: Dict[str, dict] = {}) -> "Lemmatizer":
return Lemmatizer(data=data, is_base_form=is_base_form)
@registry.language_data("spacy.en.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
class EnglishDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
syntax_iterators = SYNTAX_ITERATORS
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
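
End to end for English, the noun chunker now reaches a Doc through the config and the vocab rather than through EnglishDefaults.syntax_iterators. A hedged usage sketch on this branch (no parser is added, so the chunks themselves aren't iterated; doing that on an unparsed Doc still raises Errors.E029 per the syntax_iterators hunks):

from spacy.lang.en import English

nlp = English()
doc = nlp("This is a sentence.")
# Vocab.from_config (last hunk of this diff) resolves "spacy.en.get_noun_chunks" and
# stores it on the vocab; Doc.__init__ then copies it to doc.noun_chunks_iterator
# instead of looking up the language class.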

View File

@ -1,27 +1,18 @@
from typing import Union, Iterator
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
from ...tokens import Doc, Span
def noun_chunks(doclike):
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
labels = [
"nsubj",
"dobj",
"nsubjpass",
"pcomp",
"pobj",
"dative",
"appos",
"attr",
"ROOT",
]
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
# fmt: off
labels = ["nsubj", "dobj", "nsubjpass", "pcomp", "pobj", "dative", "appos", "attr", "ROOT"]
# fmt: on
doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed:
raise ValueError(Errors.E029)
np_deps = [doc.vocab.strings.add(label) for label in labels]
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")

View File

@ -1,4 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
from ...util import update_exc
_exc = {}
@ -555,4 +557,4 @@ for string in _exclude:
_exc.pop(string)
TOKENIZER_EXCEPTIONS = _exc
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -4,11 +4,10 @@ from thinc.config import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from .syntax_iterators import noun_chunks
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry
from ...util import registry
DEFAULT_CONFIG = """
@ -16,6 +15,7 @@ DEFAULT_CONFIG = """
lang = "es"
stop_words = {"@language_data": "spacy.es.stop_words"}
lex_attr_getters = {"@language_data": "spacy.es.lex_attr_getters"}
get_noun_chunks = {"@language_data": "spacy.es.get_noun_chunks"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
@ -32,6 +32,11 @@ tables = ["lexeme_cluster", "lexeme_prob", "lexeme_settings"]
"""
@registry.language_data("spacy.es.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
@registry.language_data("spacy.es.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@ -43,10 +48,9 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
class SpanishDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
syntax_iterators = SYNTAX_ITERATORS
class Spanish(Language):

View File

@ -1,13 +1,15 @@
from typing import Union, Iterator, Optional, List, Tuple
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
from ...errors import Errors
from ...tokens import Doc, Span, Token
def noun_chunks(doclike):
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
doc = doclike.doc
if not doc.is_parsed:
raise ValueError(Errors.E029)
if not len(doc):
return
np_label = doc.vocab.strings.add("NP")
@ -28,18 +30,24 @@ def noun_chunks(doclike):
token = next_token(token)
def is_verb_token(token):
def is_verb_token(token: Token) -> bool:
return token.pos in [VERB, AUX]
def next_token(token):
def next_token(token: Token) -> Optional[Token]:
try:
return token.nbor()
except IndexError:
return None
def noun_bounds(doc, root, np_left_deps, np_right_deps, stop_deps):
def noun_bounds(
doc: Doc,
root: Token,
np_left_deps: List[str],
np_right_deps: List[str],
stop_deps: List[str],
) -> Tuple[Token, Token]:
left_bound = root
for token in reversed(list(root.lefts)):
if token.dep in np_left_deps:
@ -50,12 +58,8 @@ def noun_bounds(doc, root, np_left_deps, np_right_deps, stop_deps):
left, right = noun_bounds(
doc, token, np_left_deps, np_right_deps, stop_deps
)
if list(
filter(
lambda t: is_verb_token(t) or t.dep in stop_deps,
doc[left_bound.i : right.i],
)
):
filter_func = lambda t: is_verb_token(t) or t.dep in stop_deps
if list(filter(filter_func, doc[left_bound.i : right.i],)):
break
else:
right_bound = right

View File

@ -1,4 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, NORM, PRON_LEMMA
from ...util import update_exc
_exc = {
@ -73,4 +75,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -2,12 +2,12 @@ from typing import Set, Dict, Callable, Any
from thinc.api import Config
from ...language import Language
from ...util import update_exc, registry
from ...util import registry
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_SUFFIXES
from .syntax_iterators import SYNTAX_ITERATORS
from .syntax_iterators import noun_chunks
DEFAULT_CONFIG = """
@ -15,6 +15,7 @@ DEFAULT_CONFIG = """
lang = "fa"
stop_words = {"@language_data": "spacy.fa.stop_words"}
lex_attr_getters = {"@language_data": "spacy.fa.lex_attr_getters"}
get_noun_chunks = {"@language_data": "spacy.de.get_noun_chunks"}
[nlp.writing_system]
direction = "rtl"
@ -41,10 +42,14 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.language_data("spacy.fa.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
class PersianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(TOKENIZER_EXCEPTIONS)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
suffixes = TOKENIZER_SUFFIXES
syntax_iterators = SYNTAX_ITERATORS
class Persian(Language):

View File

@ -5,9 +5,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry
from ...util import registry
DEFAULT_CONFIG = """
@ -31,7 +30,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
class FinnishDefaults(Language.Defaults):
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
class Finnish(Language):

View File

@ -1,4 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA
from ...util import update_exc
_exc = {}
@ -78,4 +80,4 @@ for exc_data in [
_exc[exc_data[ORTH]] = [exc_data]
TOKENIZER_EXCEPTIONS = _exc
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -1,4 +1,4 @@
from typing import Set, Dict, Callable, Any
from typing import Set, Dict, Callable, Any, Pattern
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
@ -7,10 +7,9 @@ from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import FrenchLemmatizer, is_base_form
from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .syntax_iterators import noun_chunks
from ...language import Language
from ...util import update_exc, registry
from ...util import registry
DEFAULT_CONFIG = """
@ -18,6 +17,11 @@ DEFAULT_CONFIG = """
lang = "fr"
stop_words = {"@language_data": "spacy.fr.stop_words"}
lex_attr_getters = {"@language_data": "spacy.fr.lex_attr_getters"}
get_noun_chunks = {"@language_data": "spacy.fr.get_noun_chunks"}
[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"
token_match = {"@language_data": "spacy.fr.token_match"}
[nlp.lemmatizer]
@lemmatizers = "spacy.FrenchLemmatizer.v1"
@ -34,6 +38,11 @@ def create_french_lemmatizer(data: Dict[str, dict] = {}) -> FrenchLemmatizer:
return FrenchLemmatizer(data=data, is_base_form=is_base_form)
@registry.language_data("spacy.fr.token_match")
def token_match() -> Pattern:
return TOKEN_MATCH
@registry.language_data("spacy.fr.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@ -44,13 +53,16 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.language_data("spacy.fr.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
class FrenchDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
token_match = TOKEN_MATCH
syntax_iterators = SYNTAX_ITERATORS
class French(Language):

View File

@ -1,26 +1,18 @@
from typing import Union, Iterator
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
from ...tokens import Doc, Span
def noun_chunks(doclike):
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
labels = [
"nsubj",
"nsubj:pass",
"obj",
"iobj",
"ROOT",
"appos",
"nmod",
"nmod:poss",
]
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
# fmt: off
labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
# fmt: on
doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed:
raise ValueError(Errors.E029)
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")

View File

@ -1,8 +1,11 @@
import re
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .punctuation import ELISION, HYPHENS
from ..char_classes import ALPHA_LOWER, ALPHA
from ...symbols import ORTH, LEMMA
from ...util import update_exc
# not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer
# from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
@ -452,7 +455,7 @@ _regular_exp += [
]
TOKENIZER_EXCEPTIONS = _exc
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
TOKEN_MATCH = re.compile(
"(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp)
).match

View File

@ -3,9 +3,8 @@ from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry
from ...util import registry
DEFAULT_CONFIG = """
@ -21,7 +20,7 @@ def stop_words() -> Set[str]:
class IrishDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
class Irish(Language):

View File

@ -1,5 +1,7 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import POS, DET, ADP, CCONJ, ADV, NOUN, X, AUX
from ...symbols import ORTH, LEMMA, NORM
from ...util import update_exc
_exc = {
@ -81,4 +83,4 @@ for orth in ["d'", "D'"]:
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -4,7 +4,7 @@ from thinc.api import Config
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry
from ...util import registry
DEFAULT_CONFIG = """
@ -25,7 +25,7 @@ def stop_words() -> Set[str]:
class HebrewDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
tokenizer_exceptions = BASE_EXCEPTIONS
class Hebrew(Language):

View File

@ -4,7 +4,7 @@ from thinc.api import Config
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry
from ...util import registry
DEFAULT_CONFIG = """
@ -28,7 +28,7 @@ def stop_words() -> Set[str]:
class CroatianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
tokenizer_exceptions = BASE_EXCEPTIONS
class Croatian(Language):

View File

@ -1,12 +1,11 @@
from typing import Set
from typing import Set, Pattern
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry
from ...util import registry
DEFAULT_CONFIG = """
@ -14,6 +13,10 @@ DEFAULT_CONFIG = """
lang = "hu"
stop_words = {"@language_data": "spacy.hu.stop_words"}
[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"
token_match = {"@language_data": "spacy.hu.token_match"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
@ -29,12 +32,16 @@ def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.hu.token_match")
def token_match() -> Pattern:
return TOKEN_MATCH
class HungarianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
token_match = TOKEN_MATCH
class Hungarian(Language):

View File

@ -1,7 +1,9 @@
import re
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..punctuation import ALPHA_LOWER, CURRENCY
from ...symbols import ORTH
from ...util import update_exc
_exc = {}
@ -644,5 +646,5 @@ _nums = r"(({ne})|({t})|({on})|({c}))({s})?".format(
)
TOKENIZER_EXCEPTIONS = _exc
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match

View File

@ -5,10 +5,9 @@ from .stop_words import STOP_WORDS
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .syntax_iterators import noun_chunks
from ...language import Language
from ...util import update_exc, registry
from ...util import registry
DEFAULT_CONFIG = """
@ -16,6 +15,7 @@ DEFAULT_CONFIG = """
lang = "id"
stop_words = {"@language_data": "spacy.id.stop_words"}
lex_attr_getters = {"@language_data": "spacy.id.lex_attr_getters"}
get_noun_chunks = {"@language_data": "spacy.id.get_noun_chunks"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
@ -42,12 +42,16 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.language_data("spacy.id.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
class IndonesianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
syntax_iterators = SYNTAX_ITERATORS
class Indonesian(Language):

View File

@ -1,26 +1,20 @@
from typing import Union, Iterator
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
from ...tokens import Doc, Span
def noun_chunks(doclike):
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
labels = [
"nsubj",
"nsubj:pass",
"obj",
"iobj",
"ROOT",
"appos",
"nmod",
"nmod:poss",
]
# fmt: off
labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
# fmt: on
doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed:
raise ValueError(Errors.E029)
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")

View File

@ -1,5 +1,8 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ._tokenizer_exceptions_list import ID_BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, NORM
from ...util import update_exc
# Daftar singkatan dan Akronim dari:
# https://id.wiktionary.org/wiki/Wiktionary:Daftar_singkatan_dan_akronim_bahasa_Indonesia#A
@ -221,4 +224,4 @@ for orth in [
]:
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -4,9 +4,8 @@ from thinc.api import Config
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry
from ...util import registry
DEFAULT_CONFIG = """
@ -30,7 +29,7 @@ def stop_words() -> Set[str]:
class ItalianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
stop_words = STOP_WORDS
prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES

View File

@ -1,4 +1,7 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA
from ...util import update_exc
_exc = {
"all'art.": [{ORTH: "all'"}, {ORTH: "art."}],
@ -52,4 +55,4 @@ for orth in [
]:
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -1,11 +1,11 @@
from typing import Optional, Union, Dict, Any, Set
from typing import Optional, Union, Dict, Any, Set, Callable
from pathlib import Path
import srsly
from collections import namedtuple
from thinc.api import Config
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .syntax_iterators import noun_chunks
from .tag_map import TAG_MAP
from .tag_orth_map import TAG_ORTH_MAP
from .tag_bigram_map import TAG_BIGRAM_MAP
@ -22,6 +22,7 @@ DEFAULT_CONFIG = """
[nlp]
lang = "ja"
stop_words = {"@language_data": "spacy.ja.stop_words"}
get_noun_chunks = {"@language_data": "spacy.ja.get_noun_chunks"}
[nlp.tokenizer]
@tokenizers = "spacy.JapaneseTokenizer.v1"
@ -39,6 +40,11 @@ def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.ja.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
@registry.tokenizers("spacy.JapaneseTokenizer.v1")
def create_japanese_tokenizer(split_mode: Optional[str] = None):
def japanese_tokenizer_factory(nlp):
@ -50,6 +56,8 @@ def create_japanese_tokenizer(split_mode: Optional[str] = None):
class JapaneseTokenizer(DummyTokenizer):
def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None:
self.vocab = nlp.vocab
# TODO: is this the right way to do it?
self.vocab.morphology.load_tag_map(TAG_MAP)
self.split_mode = split_mode
self.tokenizer = try_sudachi_import(self.split_mode)
@ -171,14 +179,8 @@ class JapaneseTokenizer(DummyTokenizer):
return self
class JapaneseDefaults(Language.Defaults):
tag_map = TAG_MAP
syntax_iterators = SYNTAX_ITERATORS
class Japanese(Language):
lang = "ja"
Defaults = JapaneseDefaults
default_config = Config().from_str(DEFAULT_CONFIG)

View File

@ -1,33 +1,23 @@
from typing import Union, Iterator
from ...symbols import NOUN, PROPN, PRON, VERB
# XXX this can probably be pruned a bit
labels = [
"nsubj",
"nmod",
"dobj",
"nsubjpass",
"pcomp",
"pobj",
"obj",
"obl",
"dative",
"appos",
"attr",
"ROOT",
]
from ...tokens import Doc, Span
def noun_chunks(obj):
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
# TODO: this can probably be pruned a bit
# fmt: off
labels = ["nsubj", "nmod", "ddoclike", "nsubjpass", "pcomp", "pdoclike", "doclike", "obl", "dative", "appos", "attr", "ROOT"]
# fmt: on
doc = obj.doc # Ensure works on both Doc and Span.
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
doc = doclike.doc # Ensure works on both Doc and Span.
np_deps = [doc.vocab.strings.add(label) for label in labels]
doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
seen = set()
for i, word in enumerate(obj):
for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
@ -37,12 +27,10 @@ def noun_chunks(obj):
unseen = [w.i for w in word.subtree if w.i not in seen]
if not unseen:
continue
# this takes care of particles etc.
seen.update(j.i for j in word.subtree)
# This avoids duplicating embedded clauses
seen.update(range(word.i + 1))
# if the head of this is a verb, mark that and rights seen
# Don't do the subtree as that can hide other phrases
if word.head.pos == VERB:

View File

@ -40,6 +40,8 @@ def create_korean_tokenizer():
class KoreanTokenizer(DummyTokenizer):
def __init__(self, nlp: Optional[Language] = None):
self.vocab = nlp.vocab
# TODO: is this the right way to do it?
self.vocab.morphology.load_tag_map(TAG_MAP)
MeCab = try_mecab_import()
self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")
@ -72,13 +74,8 @@ class KoreanTokenizer(DummyTokenizer):
yield {"surface": surface, "lemma": lemma, "tag": tag}
class KoreanDefaults(Language.Defaults):
tag_map = TAG_MAP
class Korean(Language):
lang = "ko"
Defaults = KoreanDefaults
default_config = Config().from_str(DEFAULT_CONFIG)

View File

@ -5,9 +5,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry
from ...util import registry
DEFAULT_CONFIG = """
@ -42,7 +41,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
class LuxembourgishDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES

View File

@ -1,4 +1,7 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, NORM
from ...util import update_exc
# TODO
# treat other apostrophes within words as part of the word: [op d'mannst], [fir d'éischt] (= exceptions)
@ -47,4 +50,4 @@ for orth in [
]:
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -4,9 +4,8 @@ from thinc.api import Config
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry
from ...util import registry
DEFAULT_CONFIG = """
@ -22,7 +21,7 @@ def stop_words() -> Set[str]:
class LigurianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES

View File

@ -1,4 +1,7 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA
from ...util import update_exc
_exc = {}
@ -47,4 +50,4 @@ for prep, prep_lemma in [
{ORTH: prep, LEMMA: prep_lemma},
]
TOKENIZER_EXCEPTIONS = _exc
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -5,9 +5,8 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry
from ...util import registry
DEFAULT_CONFIG = """
@ -39,11 +38,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
class LithuanianDefaults(Language.Defaults):
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
mod_base_exceptions = {
exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
}
del mod_base_exceptions["8)"]
tokenizer_exceptions = update_exc(mod_base_exceptions, TOKENIZER_EXCEPTIONS)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
class Lithuanian(Language):

View File

@ -1,267 +1,15 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...util import update_exc
_exc = {}
for orth in [
"n-tosios",
"?!",
# "G.",
# "J. E.",
# "J. Em.",
# "J.E.",
# "J.Em.",
# "K.",
# "N.",
# "V.",
# "Vt.",
# "a.",
# "a.k.",
# "a.s.",
# "adv.",
# "akad.",
# "aklg.",
# "akt.",
# "al.",
# "ang.",
# "angl.",
# "aps.",
# "apskr.",
# "apyg.",
# "arbat.",
# "asist.",
# "asm.",
# "asm.k.",
# "asmv.",
# "atk.",
# "atsak.",
# "atsisk.",
# "atsisk.sąsk.",
# "atv.",
# "aut.",
# "avd.",
# "b.k.",
# "baud.",
# "biol.",
# "bkl.",
# "bot.",
# "bt.",
# "buv.",
# "ch.",
# "chem.",
# "corp.",
# "d.",
# "dab.",
# "dail.",
# "dek.",
# "deš.",
# "dir.",
# "dirig.",
# "doc.",
# "dol.",
# "dr.",
# "drp.",
# "dvit.",
# "dėst.",
# "dš.",
# "dž.",
# "e.b.",
# "e.bankas",
# "e.p.",
# "e.parašas",
# "e.paštas",
# "e.v.",
# "e.valdžia",
# "egz.",
# "eil.",
# "ekon.",
# "el.",
# "el.bankas",
# "el.p.",
# "el.parašas",
# "el.paštas",
# "el.valdžia",
# "etc.",
# "ež.",
# "fak.",
# "faks.",
# "feat.",
# "filol.",
# "filos.",
# "g.",
# "gen.",
# "geol.",
# "gerb.",
# "gim.",
# "gr.",
# "gv.",
# "gyd.",
# "gyv.",
# "habil.",
# "inc.",
# "insp.",
# "inž.",
# "ir pan.",
# "ir t. t.",
# "isp.",
# "istor.",
# "it.",
# "just.",
# "k.",
# "k. a.",
# "k.a.",
# "kab.",
# "kand.",
# "kart.",
# "kat.",
# "ketv.",
# "kh.",
# "kl.",
# "kln.",
# "km.",
# "kn.",
# "koresp.",
# "kpt.",
# "kr.",
# "kt.",
# "kub.",
# "kun.",
# "kv.",
# "kyš.",
# "l. e. p.",
# "l.e.p.",
# "lenk.",
# "liet.",
# "lot.",
# "lt.",
# "ltd.",
# "ltn.",
# "m.",
# "m.e..",
# "m.m.",
# "mat.",
# "med.",
# "mgnt.",
# "mgr.",
# "min.",
# "mjr.",
# "ml.",
# "mln.",
# "mlrd.",
# "mob.",
# "mok.",
# "moksl.",
# "mokyt.",
# "mot.",
# "mr.",
# "mst.",
# "mstl.",
# "mėn.",
# "nkt.",
# "no.",
# "nr.",
# "ntk.",
# "nuotr.",
# "op.",
# "org.",
# "orig.",
# "p.",
# "p.d.",
# "p.m.e.",
# "p.s.",
# "pab.",
# "pan.",
# "past.",
# "pav.",
# "pavad.",
# "per.",
# "perd.",
# "pirm.",
# "pl.",
# "plg.",
# "plk.",
# "pr.",
# "pr.Kr.",
# "pranc.",
# "proc.",
# "prof.",
# "prom.",
# "prot.",
# "psl.",
# "pss.",
# "pvz.",
# "pšt.",
# "r.",
# "raj.",
# "red.",
# "rez.",
# "rež.",
# "rus.",
# "rš.",
# "s.",
# "sav.",
# "saviv.",
# "sek.",
# "sekr.",
# "sen.",
# "sh.",
# "sk.",
# "skg.",
# "skv.",
# "skyr.",
# "sp.",
# "spec.",
# "sr.",
# "st.",
# "str.",
# "stud.",
# "sąs.",
# "t.",
# "t. p.",
# "t. y.",
# "t.p.",
# "t.t.",
# "t.y.",
# "techn.",
# "tel.",
# "teol.",
# "th.",
# "tir.",
# "trit.",
# "trln.",
# "tšk.",
# "tūks.",
# "tūkst.",
# "up.",
# "upl.",
# "v.s.",
# "vad.",
# "val.",
# "valg.",
# "ved.",
# "vert.",
# "vet.",
# "vid.",
# "virš.",
# "vlsč.",
# "vnt.",
# "vok.",
# "vs.",
# "vtv.",
# "vv.",
# "vyr.",
# "vyresn.",
# "zool.",
# "Įn",
# "įl.",
# "š.m.",
# "šnek.",
# "šv.",
# "švč.",
# "ž.ū.",
# "žin.",
# "žml.",
# "žr.",
]:
for orth in ["n-tosios", "?!"]:
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc
mod_base_exceptions = {
exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
}
del mod_base_exceptions["8)"]
TOKENIZER_EXCEPTIONS = update_exc(mod_base_exceptions, _exc)

View File

@ -1,20 +1,20 @@
from typing import Set
from typing import Set, Callable
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .syntax_iterators import noun_chunks
from ...language import Language
from ...util import update_exc, registry
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "nb"
stop_words = {"@language_data": "spacy.nb.stop_words"}
get_noun_chunks = {"@language_data": "spacy.nb.get_noun_chunks"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
@ -31,12 +31,16 @@ def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.nb.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
class NorwegianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
syntax_iterators = SYNTAX_ITERATORS
class Norwegian(Language):

View File

@ -1,26 +1,18 @@
from typing import Union, Iterator
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
from ...tokens import Doc, Span
def noun_chunks(doclike):
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
labels = [
"nsubj",
"nsubj:pass",
"obj",
"iobj",
"ROOT",
"appos",
"nmod",
"nmod:poss",
]
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
# fmt: off
labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
# fmt: on
doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed:
raise ValueError(Errors.E029)
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")

View File

@ -1,4 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA
from ...util import update_exc
_exc = {}
@ -218,4 +220,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -7,9 +7,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .lemmatizer import DutchLemmatizer
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry
from ...util import registry
DEFAULT_CONFIG = """
@ -44,7 +43,7 @@ def create_dutch_lemmatizer(data: Dict[str, dict] = {}) -> DutchLemmatizer:
class DutchDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES

View File

@ -1,4 +1,7 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...util import update_exc
# Extensive list of both common and uncommon dutch abbreviations copied from
# github.com/diasks2/pragmatic_segmenter, a Ruby library for rule-based
@ -1602,4 +1605,4 @@ for orth in abbrevs:
_exc[i] = [{ORTH: i}]
TOKENIZER_EXCEPTIONS = _exc
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -4,10 +4,9 @@ from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
from ...language import Language
from ...util import update_exc, registry
from ...util import registry
DEFAULT_CONFIG = """
@ -37,7 +36,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
class PortugueseDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
prefixes = TOKENIZER_PREFIXES

View File

@ -1,4 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...util import update_exc
_exc = {}
@ -50,4 +52,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -5,9 +5,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry
from ...util import registry
# Lemma data note:
# Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/
@ -35,7 +34,7 @@ def stop_words() -> Set[str]:
class RomanianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES

View File

@ -1,4 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...util import update_exc
from .punctuation import _make_ro_variants
@ -91,4 +93,4 @@ for orth in [
_exc[variant] = [{ORTH: variant}]
TOKENIZER_EXCEPTIONS = _exc
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -5,8 +5,7 @@ from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import RussianLemmatizer
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...util import update_exc, registry
from ...util import registry
from ...language import Language
@ -42,7 +41,7 @@ def create_russian_lemmatizer() -> RussianLemmatizer:
class RussianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
class Russian(Language):

View File

@ -1,4 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, NORM
from ...util import update_exc
_exc = {}
@ -63,4 +65,4 @@ for slang_desc in _slang_exc:
_exc[slang_desc[ORTH]] = [slang_desc]
TOKENIZER_EXCEPTIONS = _exc
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -4,9 +4,8 @@ from thinc.api import Config
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry
from ...util import registry
DEFAULT_CONFIG = """
@ -41,7 +40,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
class SerbianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
class Serbian(Language):

View File

@ -1,4 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, NORM
from ...util import update_exc
_exc = {}
@ -90,4 +92,4 @@ for slang_desc in _slang_exc:
_exc[slang_desc[ORTH]] = [slang_desc]
TOKENIZER_EXCEPTIONS = _exc
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -4,10 +4,9 @@ from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry
from .syntax_iterators import SYNTAX_ITERATORS
from ...util import registry
from .syntax_iterators import noun_chunks
# Punctuation stolen from Danish
from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
@ -18,6 +17,7 @@ DEFAULT_CONFIG = """
lang = "sv"
stop_words = {"@language_data": "spacy.sv.stop_words"}
lex_attr_getters = {"@language_data": "spacy.sv.lex_attr_getters"}
get_noun_chunks = {"@language_data": "spacy.sv.get_noun_chunks"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
@ -39,11 +39,15 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.language_data("spacy.sv.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
class SwedishDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
syntax_iterators = SYNTAX_ITERATORS
class Swedish(Language):

View File

@ -1,27 +1,18 @@
from typing import Union, Iterator
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
from ...tokens import Doc, Span
def noun_chunks(doclike):
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
labels = [
"nsubj",
"nsubj:pass",
"dobj",
"obj",
"iobj",
"ROOT",
"appos",
"nmod",
"nmod:poss",
]
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
# fmt: off
labels = ["nsubj", "nsubj:pass", "dobj", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
# fmt: on
doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed:
raise ValueError(Errors.E029)
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")

View File

@ -1,4 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import LEMMA, NORM, ORTH, PRON_LEMMA
from ...util import update_exc
_exc = {}
@ -154,4 +156,4 @@ for orth in ABBREVIATIONS:
for orth in ["i", "m"]:
_exc[orth + "."] = [{ORTH: orth, LEMMA: orth, NORM: orth}, {ORTH: "."}]
TOKENIZER_EXCEPTIONS = _exc
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -1,25 +0,0 @@
from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ
from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ
TAG_MAP = {
"ADV": {POS: ADV},
"NOUN": {POS: NOUN},
"ADP": {POS: ADP},
"PRON": {POS: PRON},
"SCONJ": {POS: SCONJ},
"PROPN": {POS: PROPN},
"DET": {POS: DET},
"SYM": {POS: SYM},
"INTJ": {POS: INTJ},
"PUNCT": {POS: PUNCT},
"NUM": {POS: NUM},
"AUX": {POS: AUX},
"X": {POS: X},
"CONJ": {POS: CONJ},
"CCONJ": {POS: CCONJ},
"ADJ": {POS: ADJ},
"VERB": {POS: VERB},
"PART": {POS: PART},
"_SP": {POS: SPACE},
}

View File

@ -4,9 +4,8 @@ from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry
from ...util import registry
DEFAULT_CONFIG = """
@ -36,7 +35,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
class TagalogDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
class Tagalog(Language):

View File

@ -1,4 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA
from ...util import update_exc
_exc = {
@ -14,4 +16,4 @@ _exc = {
}
TOKENIZER_EXCEPTIONS = _exc
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -55,7 +55,6 @@ URL_PATTERN = (
# fmt: on
).strip()
TOKEN_MATCH = None
URL_MATCH = re.compile("(?u)" + URL_PATTERN).match
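
The module-level TOKEN_MATCH default disappears here (the base config now sets token_match = null), while URL_MATCH stays in this module and is re-exported through the registered spacy.xx.url_match function in the new spacy/lang/defaults.py. A quick hedged check that the matcher itself is untouched:

from spacy.lang.tokenizer_exceptions import URL_MATCH

print(bool(URL_MATCH("https://example.com")))  # expected: True
print(bool(URL_MATCH("not a url")))            # expected: False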

View File

@ -3,9 +3,8 @@ from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry
from ...util import registry
DEFAULT_CONFIG = """
@ -29,7 +28,7 @@ def stop_words() -> Set[str]:
class TurkishDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
class Turkish(Language):

View File

@ -1,4 +1,7 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...util import update_exc
_exc = {"sağol": [{ORTH: "sağ"}, {ORTH: "ol", NORM: "olun"}]}
@ -113,4 +116,4 @@ for orth in ["Dr.", "yy."]:
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -5,9 +5,8 @@ from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry
from ...util import registry
DEFAULT_CONFIG = """
@ -29,7 +28,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
class TatarDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = tuple(TOKENIZER_INFIXES)

View File

@ -1,4 +1,7 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, NORM
from ...util import update_exc
_exc = {}
@ -43,4 +46,4 @@ for exc_data in [ # "etc." abbreviations
exc_data[LEMMA] = exc_data[NORM]
_exc[exc_data[ORTH]] = [exc_data]
TOKENIZER_EXCEPTIONS = _exc
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -4,8 +4,7 @@ from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...util import update_exc, registry
from ...util import registry
from ...language import Language
from .lemmatizer import UkrainianLemmatizer
@ -37,7 +36,7 @@ def create_ukrainian_lemmatizer() -> UkrainianLemmatizer:
class UkrainianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
class Ukrainian(Language):

View File

@ -1,4 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, POS, NORM, NOUN
from ...util import update_exc
_exc = {}
@ -21,4 +23,4 @@ for exc_data in [
_exc[exc_data[ORTH]] = [exc_data]
TOKENIZER_EXCEPTIONS = _exc
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -24,9 +24,7 @@ from .util import link_vectors_to_models, create_default_optimizer, registry
from .util import SimpleFrozenDict
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES
from .lang.tokenizer_exceptions import TOKEN_MATCH, URL_MATCH
from .lang.tag_map import TAG_MAP
from .tokens import Doc, Span
from .tokens import Doc
from .errors import Errors, Warnings
from .schemas import ConfigSchema
from .git_info import GIT_VERSION
@ -37,6 +35,7 @@ from . import about
from .tokenizer import Tokenizer # noqa: F401
from .lemmatizer import Lemmatizer # noqa: F401
from .lookups import Lookups # noqa: F401
from .lang import defaults # noqa: F401
ENABLE_PIPELINE_ANALYSIS = False
@ -46,15 +45,10 @@ DEFAULT_CONFIG = Config().from_disk(DEFAULT_CONFIG_PATH)
class BaseDefaults:
token_match: Optional[Pattern] = TOKEN_MATCH
url_match: Pattern = URL_MATCH
prefixes: Tuple[Pattern, ...] = tuple(TOKENIZER_PREFIXES)
suffixes: Tuple[Pattern, ...] = tuple(TOKENIZER_SUFFIXES)
infixes: Tuple[Pattern, ...] = tuple(TOKENIZER_INFIXES)
tag_map: Dict[str, dict] = dict(TAG_MAP)
tokenizer_exceptions: Dict[str, List[dict]] = {}
morph_rules: Dict[str, Dict[str, dict]] = {}
syntax_iterators: Dict[str, Callable[[Union[Doc, Span]], Iterator]] = {}
class Language:
@ -114,13 +108,7 @@ class Language:
if vocab is True:
vectors_name = meta.get("vectors", {}).get("name")
vocab = Vocab.from_config(
self._config,
vectors_name=vectors_name,
# TODO: what should we do with these?
tag_map=self.Defaults.tag_map,
morph_rules=self.Defaults.morph_rules,
)
vocab = Vocab.from_config(self._config, vectors_name=vectors_name)
else:
if (self.lang and vocab.lang) and (self.lang != vocab.lang):
raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
@ -1267,15 +1255,14 @@ class Language:
lex_attr_getters = resolved["nlp"]["lex_attr_getters"]
stop_words = resolved["nlp"]["stop_words"]
vocab_data = resolved["nlp"]["vocab_data"]
get_noun_chunks = resolved["nlp"]["get_noun_chunks"]
vocab = Vocab.from_config(
filled,
lemmatizer=lemmatizer,
lex_attr_getters=lex_attr_getters,
stop_words=stop_words,
vocab_data=vocab_data,
# TODO: what should we do with these?
tag_map=cls.Defaults.tag_map,
morph_rules=cls.Defaults.morph_rules,
get_noun_chunks=get_noun_chunks,
)
nlp = cls(vocab, create_tokenizer=create_tokenizer)
pipeline = config.get("components", {})

View File

@ -243,6 +243,7 @@ class ConfigSchemaNlp(BaseModel):
stop_words: Sequence[StrictStr] = Field(..., title="Stop words to mark via Token/Lexeme.is_stop")
lex_attr_getters: Dict[StrictStr, Callable] = Field(..., title="Custom getter functions for lexical attributes (e.g. like_num)")
vocab_data: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., title="Vocabulary data, e.g. lexeme normalization tables")
get_noun_chunks: Optional[Callable] = Field(..., title="Function to extract noun phrases from a Doc")
# fmt: on
class Config:

View File

@ -1,7 +1,7 @@
import numpy
from spacy.attrs import HEAD, DEP
from spacy.symbols import nsubj, dobj, amod, nmod, conj, cc, root
from spacy.lang.en.syntax_iterators import SYNTAX_ITERATORS
from spacy.lang.en.syntax_iterators import noun_chunks
import pytest
@ -41,7 +41,7 @@ def test_en_noun_chunks_not_nested(en_vocab):
dtype="uint64",
),
)
doc.noun_chunks_iterator = SYNTAX_ITERATORS["noun_chunks"]
doc.noun_chunks_iterator = noun_chunks
word_occurred = {}
for chunk in doc.noun_chunks:
for word in chunk:

View File

@ -9,7 +9,7 @@ from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
cimport cython
from typing import Dict, List, Union, Pattern, Optional
from typing import Dict, List, Union, Pattern, Optional, Any
import re
import warnings
@ -32,16 +32,16 @@ def create_tokenizer(
# prefixes: Optional[List[Union[str, Pattern]]],
# suffixes: Optional[List[Union[str, Pattern]]],
# infixes: Optional[List[Union[str, Pattern]]],
# token_match: Optional[Pattern],
# url_match: Optional[Pattern],
# We currently can't validate against Pattern because that will cause
# Pydantic to parse value *as* pattern
token_match: Optional[Any] = None,
url_match: Optional[Any] = None,
) -> "Tokenizer":
def tokenizer_factory(nlp):
exceptions = nlp.Defaults.tokenizer_exceptions
prefixes = nlp.Defaults.prefixes
suffixes = nlp.Defaults.suffixes
infixes = nlp.Defaults.infixes
url_match = nlp.Defaults.url_match
token_match = nlp.Defaults.token_match
prefix_search = util.compile_prefix_regex(prefixes).search if prefixes else None
suffix_search = util.compile_suffix_regex(suffixes).search if suffixes else None
infix_finditer = util.compile_infix_regex(infixes).finditer if infixes else None
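
Because token_match and url_match now arrive through [nlp.tokenizer] rather than nlp.Defaults, a pipeline can presumably swap them via config alone. A hedged sketch mirroring the French and Hungarian hunks above ("spacy.yy.token_match" and the hashtag pattern are made up for illustration):

import re

from spacy.util import registry


@registry.language_data("spacy.yy.token_match")
def token_match():
    # Keep hashtags as single tokens; purely illustrative
    return re.compile(r"^#\w+$").match


CONFIG_EXCERPT = """
[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"
token_match = {"@language_data": "spacy.yy.token_match"}
"""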

View File

@ -89,16 +89,6 @@ cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name)
return get_token_attr(token, feat_name)
def _get_chunker(lang):
try:
cls = util.get_lang_class(lang)
except ImportError:
return None
except KeyError:
return None
return cls.Defaults.syntax_iterators.get("noun_chunks")
cdef class Doc:
"""A sequence of Token objects. Access sentences and named entities, export
annotations to numpy arrays, losslessly serialize to compressed binary
@ -212,7 +202,7 @@ cdef class Doc:
self.tensor = numpy.zeros((0,), dtype="float32")
self.user_data = {} if user_data is None else user_data
self._vector = None
self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
self.noun_chunks_iterator = self.vocab.get_noun_chunks
cdef bint has_space
if words is None and spaces is not None:
raise ValueError("words must be set if spaces is set")

View File

@ -30,6 +30,7 @@ cdef class Vocab:
cpdef public object vectors
cpdef public object lookups
cpdef public object writing_system
cpdef public object get_noun_chunks
cdef readonly int length
cdef public object data_dir
cdef public object lex_attr_getters

View File

@ -30,10 +30,10 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab
"""
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
strings=tuple(), lookups=None, vocab_data={},
def __init__(self, lex_attr_getters=None, lemmatizer=None,
strings=tuple(), lookups=None, tag_map={}, vocab_data={},
oov_prob=-20., vectors_name=None, writing_system={},
**deprecated_kwargs):
get_noun_chunks=None, **deprecated_kwargs):
"""Create the vocabulary.
lex_attr_getters (dict): A dictionary mapping attribute IDs to
@ -49,7 +49,6 @@ cdef class Vocab:
RETURNS (Vocab): The newly constructed object.
"""
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
tag_map = tag_map if tag_map is not None else {}
if lookups in (None, True, False):
lookups = Lookups()
for name, data in vocab_data.items():
@ -71,6 +70,7 @@ cdef class Vocab:
self.vectors = Vectors(name=vectors_name)
self.lookups = lookups
self.writing_system = writing_system
self.get_noun_chunks = get_noun_chunks
@property
def lang(self):
@ -424,9 +424,8 @@ cdef class Vocab:
lex_attr_getters=None,
stop_words=None,
vocab_data=None,
get_noun_chunks=None,
vectors_name=None,
tag_map=None,
morph_rules=None
):
"""Create a Vocab from a config and (currently) language defaults, i.e.
nlp.Defaults.
@ -449,6 +448,9 @@ cdef class Vocab:
if vocab_data is None:
vocab_data_cfg = {"vocab_data": config["nlp"]["vocab_data"]}
vocab_data = registry.make_from_config(vocab_data_cfg)["vocab_data"]
if get_noun_chunks is None:
noun_chunks_cfg = {"get_noun_chunks": config["nlp"]["get_noun_chunks"]}
get_noun_chunks = registry.make_from_config(noun_chunks_cfg)["get_noun_chunks"]
if lex_attr_getters is None:
lex_attrs_cfg = {"lex_attr_getters": config["nlp"]["lex_attr_getters"]}
lex_attr_getters = registry.make_from_config(lex_attrs_cfg)["lex_attr_getters"]
@ -468,10 +470,8 @@ cdef class Vocab:
vocab_data=vocab_data,
lemmatizer=lemmatizer,
writing_system=writing_system,
tag_map=tag_map,
get_noun_chunks=get_noun_chunks
)
if morph_rules is not None:
vocab.morphology.load_morph_exceptions(morph_rules)
if vocab.vectors.name is None and vectors_name:
vocab.vectors.name = vectors_name
return vocab