Tidy up and move noun_chunks, token_match, url_match

Ines Montani 2020-07-22 22:18:46 +02:00
parent 7fc4dadd22
commit b507f61629
82 changed files with 373 additions and 4899 deletions
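Every language touched below follows the same three-step pattern: drop the attribute from Language.Defaults, register a small @registry.language_data function that returns the callable or compiled pattern, and reference that function from the language's default config. Condensed from the German hunk further down (relative imports as in the diff, so this is a sketch of the file's shape rather than a standalone script):

    from typing import Callable
    from .syntax_iterators import noun_chunks
    from ...util import registry

    DEFAULT_CONFIG = """
    [nlp]
    lang = "de"
    get_noun_chunks = {"@language_data": "spacy.de.get_noun_chunks"}
    """

    @registry.language_data("spacy.de.get_noun_chunks")
    def get_noun_chunks() -> Callable:
        # The config entry above resolves to this function, which simply
        # hands back the noun_chunks iterator that used to live on
        # GermanDefaults.syntax_iterators.
        return noun_chunks

token_match and url_match move the same way, except they are wired in through the [nlp.tokenizer] block rather than a top-level [nlp] key.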

View File

@ -3,10 +3,13 @@ lang = null
stop_words = [] stop_words = []
lex_attr_getters = {} lex_attr_getters = {}
vocab_data = {} vocab_data = {}
get_noun_chunks = null
pipeline = [] pipeline = []
[nlp.tokenizer] [nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1" @tokenizers = "spacy.Tokenizer.v1"
token_match = null
url_match = {"@language_data": "spacy.xx.url_match"}
[nlp.lemmatizer] [nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1" @lemmatizers = "spacy.Lemmatizer.v1"

View File

@ -4,11 +4,9 @@ from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_SUFFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@ -35,7 +33,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
class ArabicDefaults(Language.Defaults): class ArabicDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES

View File

@ -1,4 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA from ...symbols import ORTH, LEMMA
from ...util import update_exc
_exc = {} _exc = {}
@ -43,4 +45,4 @@ for exc_data in [
for exc_data in [{LEMMA: "تلفون", ORTH: "ت."}, {LEMMA: "صندوق بريد", ORTH: "ص.ب"}]: for exc_data in [{LEMMA: "تلفون", ORTH: "ت."}, {LEMMA: "صندوق بريد", ORTH: "ص.ب"}]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
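The other pattern that repeats across languages in this commit: merging with BASE_EXCEPTIONS moves out of each __init__.py (previously update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)) and into the tokenizer_exceptions module itself, so the Defaults class can take TOKENIZER_EXCEPTIONS as-is. For the Arabic files above, the net effect looks like this (file paths inferred from the imports, not shown verbatim in this view):

    # spacy/lang/ar/tokenizer_exceptions.py now does the merge itself
    from ..tokenizer_exceptions import BASE_EXCEPTIONS
    from ...util import update_exc

    TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

    # spacy/lang/ar/__init__.py simplifies to
    class ArabicDefaults(Language.Defaults):
        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
        suffixes = TOKENIZER_SUFFIXES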

View File

@ -4,9 +4,8 @@ from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@ -30,7 +29,7 @@ def stop_words() -> Set[str]:
class BengaliDefaults(Language.Defaults): class BengaliDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES

View File

@ -1,4 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA from ...symbols import ORTH, LEMMA
from ...util import update_exc
_exc = {} _exc = {}
@ -21,4 +23,4 @@ for exc_data in [
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -4,9 +4,8 @@ from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry from ...util import registry
from .punctuation import TOKENIZER_INFIXES from .punctuation import TOKENIZER_INFIXES
@ -37,7 +36,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
class CatalanDefaults(Language.Defaults): class CatalanDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES

View File

@ -1,4 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA from ...symbols import ORTH, LEMMA
from ...util import update_exc
_exc = {} _exc = {}
@ -35,4 +37,4 @@ for h in range(1, 12 + 1):
_exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "p.m."}] _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "p.m."}]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -5,9 +5,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@ -42,7 +41,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
class DanishDefaults(Language.Defaults): class DanishDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES

View File

@ -2,7 +2,9 @@
Tokenizer Exceptions. Tokenizer Exceptions.
Source: https://forkortelse.dk/ and various others. Source: https://forkortelse.dk/ and various others.
""" """
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, NORM from ...symbols import ORTH, LEMMA, NORM
from ...util import update_exc
_exc = {} _exc = {}
@ -576,4 +578,4 @@ for h in range(1, 31 + 1):
_custom_base_exc = {"i.": [{ORTH: "i", LEMMA: "i", NORM: "i"}, {ORTH: "."}]} _custom_base_exc = {"i.": [{ORTH: "i", LEMMA: "i", NORM: "i"}, {ORTH: "."}]}
_exc.update(_custom_base_exc) _exc.update(_custom_base_exc)
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -1,20 +1,20 @@
from typing import Set from typing import Set, Callable
from thinc.api import Config from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .punctuation import TOKENIZER_INFIXES from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import noun_chunks
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
[nlp] [nlp]
lang = "de" lang = "de"
stop_words = {"@language_data": "spacy.de.stop_words"} stop_words = {"@language_data": "spacy.de.stop_words"}
get_noun_chunks = {"@language_data": "spacy.de.get_noun_chunks"}
[nlp.lemmatizer] [nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1" @lemmatizers = "spacy.Lemmatizer.v1"
@ -36,12 +36,16 @@ def stop_words() -> Set[str]:
return STOP_WORDS return STOP_WORDS
@registry.language_data("spacy.de.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
class GermanDefaults(Language.Defaults): class GermanDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
syntax_iterators = SYNTAX_ITERATORS
class German(Language): class German(Language):

View File

@ -1,39 +1,26 @@
from typing import Union, Iterator
from ...symbols import NOUN, PROPN, PRON from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors from ...errors import Errors
from ...tokens import Doc, Span
def noun_chunks(doclike): def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
""" """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
# this iterator extracts spans headed by NOUNs starting from the left-most # this iterator extracts spans headed by NOUNs starting from the left-most
# syntactic dependent until the NOUN itself for close apposition and # syntactic dependent until the NOUN itself for close apposition and
# measurement construction, the span is sometimes extended to the right of # measurement construction, the span is sometimes extended to the right of
# the NOUN. Example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" # the NOUN. Example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee"
# and not just "eine Tasse", same for "das Thema Familie". # and not just "eine Tasse", same for "das Thema Familie".
labels = [ # fmt: off
"sb", labels = ["sb", "oa", "da", "nk", "mo", "ag", "ROOT", "root", "cj", "pd", "og", "app"]
"oa", # fmt: on
"da",
"nk",
"mo",
"ag",
"ROOT",
"root",
"cj",
"pd",
"og",
"app",
]
doc = doclike.doc # Ensure works on both Doc and Span. doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed: if not doc.is_parsed:
raise ValueError(Errors.E029) raise ValueError(Errors.E029)
np_label = doc.vocab.strings.add("NP") np_label = doc.vocab.strings.add("NP")
np_deps = set(doc.vocab.strings.add(label) for label in labels) np_deps = set(doc.vocab.strings.add(label) for label in labels)
close_app = doc.vocab.strings.add("nk") close_app = doc.vocab.strings.add("nk")
rbracket = 0 rbracket = 0
for i, word in enumerate(doclike): for i, word in enumerate(doclike):
if i < rbracket: if i < rbracket:

View File

@ -1,4 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
from ...util import update_exc
_exc = { _exc = {
@ -254,4 +256,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

spacy/lang/defaults.py (new file, 9 lines)
View File

@ -0,0 +1,9 @@
from typing import Pattern
from .tokenizer_exceptions import URL_MATCH
from ..util import registry
@registry.language_data("spacy.xx.url_match")
def url_match() -> Pattern:
return URL_MATCH
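This new module is the shared fallback for url_match: spacy.xx.url_match just returns the URL_MATCH pattern from the base tokenizer exceptions. The base [nlp.tokenizer] block in the first hunk of this diff is what consumes it, and a language only overrides token_match or url_match in its own config when it needs something language-specific:

    [nlp.tokenizer]
    @tokenizers = "spacy.Tokenizer.v1"
    token_match = null
    url_match = {"@language_data": "spacy.xx.url_match"}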

View File

@ -5,11 +5,10 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .lemmatizer import GreekLemmatizer from .lemmatizer import GreekLemmatizer
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import noun_chunks
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@ -17,6 +16,7 @@ DEFAULT_CONFIG = """
lang = "el" lang = "el"
stop_words = {"@language_data": "spacy.el.stop_words"} stop_words = {"@language_data": "spacy.el.stop_words"}
lex_attr_getters = {"@language_data": "spacy.el.lex_attr_getters"} lex_attr_getters = {"@language_data": "spacy.el.lex_attr_getters"}
get_noun_chunks = {"@language_data": "spacy.el.get_noun_chunks"}
[nlp.lemmatizer] [nlp.lemmatizer]
@lemmatizers = "spacy.GreekLemmatizer.v1" @lemmatizers = "spacy.GreekLemmatizer.v1"
@ -38,6 +38,11 @@ def create_greek_lemmatizer(data: Dict[str, dict] = {}) -> GreekLemmatizer:
return GreekLemmatizer(data=data) return GreekLemmatizer(data=data)
@registry.language_data("spacy.el.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
@registry.language_data("spacy.el.stop_words") @registry.language_data("spacy.el.stop_words")
def stop_words() -> Set[str]: def stop_words() -> Set[str]:
return STOP_WORDS return STOP_WORDS
@ -49,11 +54,10 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
class GreekDefaults(Language.Defaults): class GreekDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
syntax_iterators = SYNTAX_ITERATORS
class Greek(Language): class Greek(Language):

View File

@ -1,21 +1,20 @@
from typing import Union, Iterator
from ...symbols import NOUN, PROPN, PRON from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors from ...errors import Errors
from ...tokens import Doc, Span
def noun_chunks(doclike): def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
""" """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
Detect base noun phrases. Works on both Doc and Span.
"""
# It follows the logic of the noun chunks finder of English language, # It follows the logic of the noun chunks finder of English language,
# adjusted to some Greek language special characteristics. # adjusted to some Greek language special characteristics.
# obj tag corrects some DEP tagger mistakes. # obj tag corrects some DEP tagger mistakes.
# Further improvement of the models will eliminate the need for this tag. # Further improvement of the models will eliminate the need for this tag.
labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"] labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"]
doc = doclike.doc # Ensure works on both Doc and Span. doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed: if not doc.is_parsed:
raise ValueError(Errors.E029) raise ValueError(Errors.E029)
np_deps = [doc.vocab.strings.add(label) for label in labels] np_deps = [doc.vocab.strings.add(label) for label in labels]
conj = doc.vocab.strings.add("conj") conj = doc.vocab.strings.add("conj")
nmod = doc.vocab.strings.add("nmod") nmod = doc.vocab.strings.add("nmod")

File diff suppressed because it is too large

View File

@ -1,5 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, NORM from ...symbols import ORTH, LEMMA, NORM
from ...util import update_exc
_exc = {} _exc = {}
@ -392,4 +393,4 @@ for orth in [
]: ]:
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -4,13 +4,12 @@ from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import noun_chunks
from .lemmatizer import is_base_form from .lemmatizer import is_base_form
from .punctuation import TOKENIZER_INFIXES from .punctuation import TOKENIZER_INFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...lemmatizer import Lemmatizer from ...lemmatizer import Lemmatizer
from ...util import update_exc, registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@ -18,6 +17,7 @@ DEFAULT_CONFIG = """
lang = "en" lang = "en"
stop_words = {"@language_data": "spacy.en.stop_words"} stop_words = {"@language_data": "spacy.en.stop_words"}
lex_attr_getters = {"@language_data": "spacy.en.lex_attr_getters"} lex_attr_getters = {"@language_data": "spacy.en.lex_attr_getters"}
get_noun_chunks = {"@language_data": "spacy.en.get_noun_chunks"}
[nlp.lemmatizer] [nlp.lemmatizer]
@lemmatizers = "spacy.EnglishLemmatizer.v1" @lemmatizers = "spacy.EnglishLemmatizer.v1"
@ -49,9 +49,13 @@ def create_lemmatizer(data: Dict[str, dict] = {}) -> "Lemmatizer":
return Lemmatizer(data=data, is_base_form=is_base_form) return Lemmatizer(data=data, is_base_form=is_base_form)
@registry.language_data("spacy.en.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
class EnglishDefaults(Language.Defaults): class EnglishDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
syntax_iterators = SYNTAX_ITERATORS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES

View File

@ -1,27 +1,18 @@
from typing import Union, Iterator
from ...symbols import NOUN, PROPN, PRON from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors from ...errors import Errors
from ...tokens import Doc, Span
def noun_chunks(doclike): def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
""" """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
Detect base noun phrases from a dependency parse. Works on both Doc and Span. # fmt: off
""" labels = ["nsubj", "dobj", "nsubjpass", "pcomp", "pobj", "dative", "appos", "attr", "ROOT"]
labels = [ # fmt: on
"nsubj",
"dobj",
"nsubjpass",
"pcomp",
"pobj",
"dative",
"appos",
"attr",
"ROOT",
]
doc = doclike.doc # Ensure works on both Doc and Span. doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed: if not doc.is_parsed:
raise ValueError(Errors.E029) raise ValueError(Errors.E029)
np_deps = [doc.vocab.strings.add(label) for label in labels] np_deps = [doc.vocab.strings.add(label) for label in labels]
conj = doc.vocab.strings.add("conj") conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP") np_label = doc.vocab.strings.add("NP")

View File

@ -1,4 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
from ...util import update_exc
_exc = {} _exc = {}
@ -555,4 +557,4 @@ for string in _exclude:
_exc.pop(string) _exc.pop(string)
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -4,11 +4,10 @@ from thinc.config import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import noun_chunks
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@ -16,6 +15,7 @@ DEFAULT_CONFIG = """
lang = "es" lang = "es"
stop_words = {"@language_data": "spacy.es.stop_words"} stop_words = {"@language_data": "spacy.es.stop_words"}
lex_attr_getters = {"@language_data": "spacy.es.lex_attr_getters"} lex_attr_getters = {"@language_data": "spacy.es.lex_attr_getters"}
get_noun_chunks = {"@language_data": "spacy.es.get_noun_chunks"}
[nlp.lemmatizer] [nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1" @lemmatizers = "spacy.Lemmatizer.v1"
@ -32,6 +32,11 @@ tables = ["lexeme_cluster", "lexeme_prob", "lexeme_settings"]
""" """
@registry.language_data("spacy.es.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
@registry.language_data("spacy.es.stop_words") @registry.language_data("spacy.es.stop_words")
def stop_words() -> Set[str]: def stop_words() -> Set[str]:
return STOP_WORDS return STOP_WORDS
@ -43,10 +48,9 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
class SpanishDefaults(Language.Defaults): class SpanishDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
syntax_iterators = SYNTAX_ITERATORS
class Spanish(Language): class Spanish(Language):

View File

@ -1,13 +1,15 @@
from typing import Union, Iterator, Optional, List, Tuple
from ...symbols import NOUN, PROPN, PRON, VERB, AUX from ...symbols import NOUN, PROPN, PRON, VERB, AUX
from ...errors import Errors from ...errors import Errors
from ...tokens import Doc, Span, Token
def noun_chunks(doclike): def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
doc = doclike.doc doc = doclike.doc
if not doc.is_parsed: if not doc.is_parsed:
raise ValueError(Errors.E029) raise ValueError(Errors.E029)
if not len(doc): if not len(doc):
return return
np_label = doc.vocab.strings.add("NP") np_label = doc.vocab.strings.add("NP")
@ -28,18 +30,24 @@ def noun_chunks(doclike):
token = next_token(token) token = next_token(token)
def is_verb_token(token): def is_verb_token(token: Token) -> bool:
return token.pos in [VERB, AUX] return token.pos in [VERB, AUX]
def next_token(token): def next_token(token: Token) -> Optional[Token]:
try: try:
return token.nbor() return token.nbor()
except IndexError: except IndexError:
return None return None
def noun_bounds(doc, root, np_left_deps, np_right_deps, stop_deps): def noun_bounds(
doc: Doc,
root: Token,
np_left_deps: List[str],
np_right_deps: List[str],
stop_deps: List[str],
) -> Tuple[Token, Token]:
left_bound = root left_bound = root
for token in reversed(list(root.lefts)): for token in reversed(list(root.lefts)):
if token.dep in np_left_deps: if token.dep in np_left_deps:
@ -50,12 +58,8 @@ def noun_bounds(doc, root, np_left_deps, np_right_deps, stop_deps):
left, right = noun_bounds( left, right = noun_bounds(
doc, token, np_left_deps, np_right_deps, stop_deps doc, token, np_left_deps, np_right_deps, stop_deps
) )
if list( filter_func = lambda t: is_verb_token(t) or t.dep in stop_deps
filter( if list(filter(filter_func, doc[left_bound.i : right.i],)):
lambda t: is_verb_token(t) or t.dep in stop_deps,
doc[left_bound.i : right.i],
)
):
break break
else: else:
right_bound = right right_bound = right

View File

@ -1,4 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, NORM, PRON_LEMMA from ...symbols import ORTH, LEMMA, NORM, PRON_LEMMA
from ...util import update_exc
_exc = { _exc = {
@ -73,4 +75,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -2,12 +2,12 @@ from typing import Set, Dict, Callable, Any
from thinc.api import Config from thinc.api import Config
from ...language import Language from ...language import Language
from ...util import update_exc, registry from ...util import registry
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_SUFFIXES
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import noun_chunks
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@ -15,6 +15,7 @@ DEFAULT_CONFIG = """
lang = "fa" lang = "fa"
stop_words = {"@language_data": "spacy.fa.stop_words"} stop_words = {"@language_data": "spacy.fa.stop_words"}
lex_attr_getters = {"@language_data": "spacy.fa.lex_attr_getters"} lex_attr_getters = {"@language_data": "spacy.fa.lex_attr_getters"}
get_noun_chunks = {"@language_data": "spacy.de.get_noun_chunks"}
[nlp.writing_system] [nlp.writing_system]
direction = "rtl" direction = "rtl"
@ -41,10 +42,14 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS return LEX_ATTRS
@registry.language_data("spacy.fa.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
class PersianDefaults(Language.Defaults): class PersianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
syntax_iterators = SYNTAX_ITERATORS
class Persian(Language): class Persian(Language):

View File

@ -5,9 +5,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@ -31,7 +30,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
class FinnishDefaults(Language.Defaults): class FinnishDefaults(Language.Defaults):
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
class Finnish(Language): class Finnish(Language):

View File

@ -1,4 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA from ...symbols import ORTH, LEMMA
from ...util import update_exc
_exc = {} _exc = {}
@ -78,4 +80,4 @@ for exc_data in [
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -1,4 +1,4 @@
from typing import Set, Dict, Callable, Any from typing import Set, Dict, Callable, Any, Pattern
from thinc.api import Config from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
@ -7,10 +7,9 @@ from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .lemmatizer import FrenchLemmatizer, is_base_form from .lemmatizer import FrenchLemmatizer, is_base_form
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import noun_chunks
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@ -18,6 +17,11 @@ DEFAULT_CONFIG = """
lang = "fr" lang = "fr"
stop_words = {"@language_data": "spacy.fr.stop_words"} stop_words = {"@language_data": "spacy.fr.stop_words"}
lex_attr_getters = {"@language_data": "spacy.fr.lex_attr_getters"} lex_attr_getters = {"@language_data": "spacy.fr.lex_attr_getters"}
get_noun_chunks = {"@language_data": "spacy.fr.get_noun_chunks"}
[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"
token_match = {"@language_data": "spacy.fr.token_match"}
[nlp.lemmatizer] [nlp.lemmatizer]
@lemmatizers = "spacy.FrenchLemmatizer.v1" @lemmatizers = "spacy.FrenchLemmatizer.v1"
@ -34,6 +38,11 @@ def create_french_lemmatizer(data: Dict[str, dict] = {}) -> FrenchLemmatizer:
return FrenchLemmatizer(data=data, is_base_form=is_base_form) return FrenchLemmatizer(data=data, is_base_form=is_base_form)
@registry.language_data("spacy.fr.token_match")
def token_match() -> Pattern:
return TOKEN_MATCH
@registry.language_data("spacy.fr.stop_words") @registry.language_data("spacy.fr.stop_words")
def stop_words() -> Set[str]: def stop_words() -> Set[str]:
return STOP_WORDS return STOP_WORDS
@ -44,13 +53,16 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS return LEX_ATTRS
@registry.language_data("spacy.fr.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
class FrenchDefaults(Language.Defaults): class FrenchDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
token_match = TOKEN_MATCH
syntax_iterators = SYNTAX_ITERATORS
class French(Language): class French(Language):

View File

@ -1,26 +1,18 @@
from typing import Union, Iterator
from ...symbols import NOUN, PROPN, PRON from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors from ...errors import Errors
from ...tokens import Doc, Span
def noun_chunks(doclike): def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
""" """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
Detect base noun phrases from a dependency parse. Works on both Doc and Span. # fmt: off
""" labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
labels = [ # fmt: on
"nsubj",
"nsubj:pass",
"obj",
"iobj",
"ROOT",
"appos",
"nmod",
"nmod:poss",
]
doc = doclike.doc # Ensure works on both Doc and Span. doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed: if not doc.is_parsed:
raise ValueError(Errors.E029) raise ValueError(Errors.E029)
np_deps = [doc.vocab.strings[label] for label in labels] np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add("conj") conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP") np_label = doc.vocab.strings.add("NP")

View File

@ -1,8 +1,11 @@
import re import re
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .punctuation import ELISION, HYPHENS from .punctuation import ELISION, HYPHENS
from ..char_classes import ALPHA_LOWER, ALPHA from ..char_classes import ALPHA_LOWER, ALPHA
from ...symbols import ORTH, LEMMA from ...symbols import ORTH, LEMMA
from ...util import update_exc
# not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer # not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer
# from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS # from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
@ -452,7 +455,7 @@ _regular_exp += [
] ]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
TOKEN_MATCH = re.compile( TOKEN_MATCH = re.compile(
"(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp) "(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp)
).match ).match

View File

@ -3,9 +3,8 @@ from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@ -21,7 +20,7 @@ def stop_words() -> Set[str]:
class IrishDefaults(Language.Defaults): class IrishDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
class Irish(Language): class Irish(Language):

View File

@ -1,5 +1,7 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import POS, DET, ADP, CCONJ, ADV, NOUN, X, AUX from ...symbols import POS, DET, ADP, CCONJ, ADV, NOUN, X, AUX
from ...symbols import ORTH, LEMMA, NORM from ...symbols import ORTH, LEMMA, NORM
from ...util import update_exc
_exc = { _exc = {
@ -81,4 +83,4 @@ for orth in ["d'", "D'"]:
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -4,7 +4,7 @@ from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@ -25,7 +25,7 @@ def stop_words() -> Set[str]:
class HebrewDefaults(Language.Defaults): class HebrewDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) tokenizer_exceptions = BASE_EXCEPTIONS
class Hebrew(Language): class Hebrew(Language):

View File

@ -4,7 +4,7 @@ from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@ -28,7 +28,7 @@ def stop_words() -> Set[str]:
class CroatianDefaults(Language.Defaults): class CroatianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) tokenizer_exceptions = BASE_EXCEPTIONS
class Croatian(Language): class Croatian(Language):

View File

@ -1,12 +1,11 @@
from typing import Set from typing import Set, Pattern
from thinc.api import Config from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@ -14,6 +13,10 @@ DEFAULT_CONFIG = """
lang = "hu" lang = "hu"
stop_words = {"@language_data": "spacy.hu.stop_words"} stop_words = {"@language_data": "spacy.hu.stop_words"}
[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"
token_match = {"@language_data": "spacy.hu.token_match"}
[nlp.lemmatizer] [nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1" @lemmatizers = "spacy.Lemmatizer.v1"
@ -29,12 +32,16 @@ def stop_words() -> Set[str]:
return STOP_WORDS return STOP_WORDS
@registry.language_data("spacy.hu.token_match")
def token_match() -> Pattern:
return TOKEN_MATCH
class HungarianDefaults(Language.Defaults): class HungarianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
token_match = TOKEN_MATCH
class Hungarian(Language): class Hungarian(Language):
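French and Hungarian are the two languages in this diff with a language-specific token_match; both now follow the same shape as url_match above, registering the compiled regex and pointing the tokenizer block at it instead of setting Defaults.token_match. Condensed from the Hungarian hunk (the regex itself, TOKEN_MATCH, is unchanged and still compiled in tokenizer_exceptions.py):

    [nlp.tokenizer]
    @tokenizers = "spacy.Tokenizer.v1"
    token_match = {"@language_data": "spacy.hu.token_match"}

    # registered alongside the language's other language-data functions
    @registry.language_data("spacy.hu.token_match")
    def token_match() -> Pattern:
        return TOKEN_MATCH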

View File

@ -1,7 +1,9 @@
import re import re
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..punctuation import ALPHA_LOWER, CURRENCY from ..punctuation import ALPHA_LOWER, CURRENCY
from ...symbols import ORTH from ...symbols import ORTH
from ...util import update_exc
_exc = {} _exc = {}
@ -644,5 +646,5 @@ _nums = r"(({ne})|({t})|({on})|({c}))({s})?".format(
) )
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match

View File

@ -5,10 +5,9 @@ from .stop_words import STOP_WORDS
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import noun_chunks
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@ -16,6 +15,7 @@ DEFAULT_CONFIG = """
lang = "id" lang = "id"
stop_words = {"@language_data": "spacy.id.stop_words"} stop_words = {"@language_data": "spacy.id.stop_words"}
lex_attr_getters = {"@language_data": "spacy.id.lex_attr_getters"} lex_attr_getters = {"@language_data": "spacy.id.lex_attr_getters"}
get_noun_chunks = {"@language_data": "spacy.id.get_noun_chunks"}
[nlp.lemmatizer] [nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1" @lemmatizers = "spacy.Lemmatizer.v1"
@ -42,12 +42,16 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS return LEX_ATTRS
@registry.language_data("spacy.id.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
class IndonesianDefaults(Language.Defaults): class IndonesianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
syntax_iterators = SYNTAX_ITERATORS
class Indonesian(Language): class Indonesian(Language):

View File

@ -1,26 +1,20 @@
from typing import Union, Iterator
from ...symbols import NOUN, PROPN, PRON from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors from ...errors import Errors
from ...tokens import Doc, Span
def noun_chunks(doclike): def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
""" """
Detect base noun phrases from a dependency parse. Works on both Doc and Span. Detect base noun phrases from a dependency parse. Works on both Doc and Span.
""" """
labels = [ # fmt: off
"nsubj", labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
"nsubj:pass", # fmt: on
"obj",
"iobj",
"ROOT",
"appos",
"nmod",
"nmod:poss",
]
doc = doclike.doc # Ensure works on both Doc and Span. doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed: if not doc.is_parsed:
raise ValueError(Errors.E029) raise ValueError(Errors.E029)
np_deps = [doc.vocab.strings[label] for label in labels] np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add("conj") conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP") np_label = doc.vocab.strings.add("NP")

View File

@ -1,5 +1,8 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ._tokenizer_exceptions_list import ID_BASE_EXCEPTIONS from ._tokenizer_exceptions_list import ID_BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, NORM from ...symbols import ORTH, LEMMA, NORM
from ...util import update_exc
# Daftar singkatan dan Akronim dari: # Daftar singkatan dan Akronim dari:
# https://id.wiktionary.org/wiki/Wiktionary:Daftar_singkatan_dan_akronim_bahasa_Indonesia#A # https://id.wiktionary.org/wiki/Wiktionary:Daftar_singkatan_dan_akronim_bahasa_Indonesia#A
@ -221,4 +224,4 @@ for orth in [
]: ]:
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -4,9 +4,8 @@ from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@ -30,7 +29,7 @@ def stop_words() -> Set[str]:
class ItalianDefaults(Language.Defaults): class ItalianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
stop_words = STOP_WORDS stop_words = STOP_WORDS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES

View File

@ -1,4 +1,7 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA from ...symbols import ORTH, LEMMA
from ...util import update_exc
_exc = { _exc = {
"all'art.": [{ORTH: "all'"}, {ORTH: "art."}], "all'art.": [{ORTH: "all'"}, {ORTH: "art."}],
@ -52,4 +55,4 @@ for orth in [
]: ]:
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -1,11 +1,11 @@
from typing import Optional, Union, Dict, Any, Set from typing import Optional, Union, Dict, Any, Set, Callable
from pathlib import Path from pathlib import Path
import srsly import srsly
from collections import namedtuple from collections import namedtuple
from thinc.api import Config from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import noun_chunks
from .tag_map import TAG_MAP from .tag_map import TAG_MAP
from .tag_orth_map import TAG_ORTH_MAP from .tag_orth_map import TAG_ORTH_MAP
from .tag_bigram_map import TAG_BIGRAM_MAP from .tag_bigram_map import TAG_BIGRAM_MAP
@ -22,6 +22,7 @@ DEFAULT_CONFIG = """
[nlp] [nlp]
lang = "ja" lang = "ja"
stop_words = {"@language_data": "spacy.ja.stop_words"} stop_words = {"@language_data": "spacy.ja.stop_words"}
get_noun_chunks = {"@language_data": "spacy.ja.get_noun_chunks"}
[nlp.tokenizer] [nlp.tokenizer]
@tokenizers = "spacy.JapaneseTokenizer.v1" @tokenizers = "spacy.JapaneseTokenizer.v1"
@ -39,6 +40,11 @@ def stop_words() -> Set[str]:
return STOP_WORDS return STOP_WORDS
@registry.language_data("spacy.ja.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
@registry.tokenizers("spacy.JapaneseTokenizer.v1") @registry.tokenizers("spacy.JapaneseTokenizer.v1")
def create_japanese_tokenizer(split_mode: Optional[str] = None): def create_japanese_tokenizer(split_mode: Optional[str] = None):
def japanese_tokenizer_factory(nlp): def japanese_tokenizer_factory(nlp):
@ -50,6 +56,8 @@ def create_japanese_tokenizer(split_mode: Optional[str] = None):
class JapaneseTokenizer(DummyTokenizer): class JapaneseTokenizer(DummyTokenizer):
def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None: def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None:
self.vocab = nlp.vocab self.vocab = nlp.vocab
# TODO: is this the right way to do it?
self.vocab.morphology.load_tag_map(TAG_MAP)
self.split_mode = split_mode self.split_mode = split_mode
self.tokenizer = try_sudachi_import(self.split_mode) self.tokenizer = try_sudachi_import(self.split_mode)
@ -171,14 +179,8 @@ class JapaneseTokenizer(DummyTokenizer):
return self return self
class JapaneseDefaults(Language.Defaults):
tag_map = TAG_MAP
syntax_iterators = SYNTAX_ITERATORS
class Japanese(Language): class Japanese(Language):
lang = "ja" lang = "ja"
Defaults = JapaneseDefaults
default_config = Config().from_str(DEFAULT_CONFIG) default_config = Config().from_str(DEFAULT_CONFIG)

View File

@ -1,33 +1,23 @@
from typing import Union, Iterator
from ...symbols import NOUN, PROPN, PRON, VERB from ...symbols import NOUN, PROPN, PRON, VERB
from ...tokens import Doc, Span
# XXX this can probably be pruned a bit
labels = [
"nsubj",
"nmod",
"dobj",
"nsubjpass",
"pcomp",
"pobj",
"obj",
"obl",
"dative",
"appos",
"attr",
"ROOT",
]
def noun_chunks(obj): # TODO: this can probably be pruned a bit
""" # fmt: off
Detect base noun phrases from a dependency parse. Works on both Doc and Span. labels = ["nsubj", "nmod", "ddoclike", "nsubjpass", "pcomp", "pdoclike", "doclike", "obl", "dative", "appos", "attr", "ROOT"]
""" # fmt: on
doc = obj.doc # Ensure works on both Doc and Span.
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
doc = doclike.doc # Ensure works on both Doc and Span.
np_deps = [doc.vocab.strings.add(label) for label in labels] np_deps = [doc.vocab.strings.add(label) for label in labels]
doc.vocab.strings.add("conj") doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP") np_label = doc.vocab.strings.add("NP")
seen = set() seen = set()
for i, word in enumerate(obj): for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON): if word.pos not in (NOUN, PROPN, PRON):
continue continue
# Prevent nested chunks from being produced # Prevent nested chunks from being produced
@ -37,12 +27,10 @@ def noun_chunks(obj):
unseen = [w.i for w in word.subtree if w.i not in seen] unseen = [w.i for w in word.subtree if w.i not in seen]
if not unseen: if not unseen:
continue continue
# this takes care of particles etc. # this takes care of particles etc.
seen.update(j.i for j in word.subtree) seen.update(j.i for j in word.subtree)
# This avoids duplicating embedded clauses # This avoids duplicating embedded clauses
seen.update(range(word.i + 1)) seen.update(range(word.i + 1))
# if the head of this is a verb, mark that and rights seen # if the head of this is a verb, mark that and rights seen
# Don't do the subtree as that can hide other phrases # Don't do the subtree as that can hide other phrases
if word.head.pos == VERB: if word.head.pos == VERB:

View File

@ -40,6 +40,8 @@ def create_korean_tokenizer():
class KoreanTokenizer(DummyTokenizer): class KoreanTokenizer(DummyTokenizer):
def __init__(self, nlp: Optional[Language] = None): def __init__(self, nlp: Optional[Language] = None):
self.vocab = nlp.vocab self.vocab = nlp.vocab
# TODO: is this the right way to do it?
self.vocab.morphology.load_tag_map(TAG_MAP)
MeCab = try_mecab_import() MeCab = try_mecab_import()
self.mecab_tokenizer = MeCab("-F%f[0],%f[7]") self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")
@ -72,13 +74,8 @@ class KoreanTokenizer(DummyTokenizer):
yield {"surface": surface, "lemma": lemma, "tag": tag} yield {"surface": surface, "lemma": lemma, "tag": tag}
class KoreanDefaults(Language.Defaults):
tag_map = TAG_MAP
class Korean(Language): class Korean(Language):
lang = "ko" lang = "ko"
Defaults = KoreanDefaults
default_config = Config().from_str(DEFAULT_CONFIG) default_config = Config().from_str(DEFAULT_CONFIG)

View File

@ -5,9 +5,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES from .punctuation import TOKENIZER_INFIXES
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@ -42,7 +41,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
class LuxembourgishDefaults(Language.Defaults): class LuxembourgishDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES

View File

@ -1,4 +1,7 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, NORM from ...symbols import ORTH, LEMMA, NORM
from ...util import update_exc
# TODO # TODO
# treat other apostrophes within words as part of the word: [op d'mannst], [fir d'éischt] (= exceptions) # treat other apostrophes within words as part of the word: [op d'mannst], [fir d'éischt] (= exceptions)
@ -47,4 +50,4 @@ for orth in [
]: ]:
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -4,9 +4,8 @@ from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES from .punctuation import TOKENIZER_INFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@ -22,7 +21,7 @@ def stop_words() -> Set[str]:
class LigurianDefaults(Language.Defaults): class LigurianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES

View File

@ -1,4 +1,7 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA from ...symbols import ORTH, LEMMA
from ...util import update_exc
_exc = {} _exc = {}
@ -47,4 +50,4 @@ for prep, prep_lemma in [
{ORTH: prep, LEMMA: prep_lemma}, {ORTH: prep, LEMMA: prep_lemma},
] ]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -5,9 +5,8 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@ -39,11 +38,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
class LithuanianDefaults(Language.Defaults): class LithuanianDefaults(Language.Defaults):
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
mod_base_exceptions = { tokenizer_exceptions = TOKENIZER_EXCEPTIONS
exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
}
del mod_base_exceptions["8)"]
tokenizer_exceptions = update_exc(mod_base_exceptions, TOKENIZER_EXCEPTIONS)
class Lithuanian(Language): class Lithuanian(Language):

View File

@ -1,267 +1,15 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH from ...symbols import ORTH
from ...util import update_exc
_exc = {} _exc = {}
for orth in [ for orth in ["n-tosios", "?!"]:
"n-tosios",
"?!",
# "G.",
# "J. E.",
# "J. Em.",
# "J.E.",
# "J.Em.",
# "K.",
# "N.",
# "V.",
# "Vt.",
# "a.",
# "a.k.",
# "a.s.",
# "adv.",
# "akad.",
# "aklg.",
# "akt.",
# "al.",
# "ang.",
# "angl.",
# "aps.",
# "apskr.",
# "apyg.",
# "arbat.",
# "asist.",
# "asm.",
# "asm.k.",
# "asmv.",
# "atk.",
# "atsak.",
# "atsisk.",
# "atsisk.sąsk.",
# "atv.",
# "aut.",
# "avd.",
# "b.k.",
# "baud.",
# "biol.",
# "bkl.",
# "bot.",
# "bt.",
# "buv.",
# "ch.",
# "chem.",
# "corp.",
# "d.",
# "dab.",
# "dail.",
# "dek.",
# "deš.",
# "dir.",
# "dirig.",
# "doc.",
# "dol.",
# "dr.",
# "drp.",
# "dvit.",
# "dėst.",
# "dš.",
# "dž.",
# "e.b.",
# "e.bankas",
# "e.p.",
# "e.parašas",
# "e.paštas",
# "e.v.",
# "e.valdžia",
# "egz.",
# "eil.",
# "ekon.",
# "el.",
# "el.bankas",
# "el.p.",
# "el.parašas",
# "el.paštas",
# "el.valdžia",
# "etc.",
# "ež.",
# "fak.",
# "faks.",
# "feat.",
# "filol.",
# "filos.",
# "g.",
# "gen.",
# "geol.",
# "gerb.",
# "gim.",
# "gr.",
# "gv.",
# "gyd.",
# "gyv.",
# "habil.",
# "inc.",
# "insp.",
# "inž.",
# "ir pan.",
# "ir t. t.",
# "isp.",
# "istor.",
# "it.",
# "just.",
# "k.",
# "k. a.",
# "k.a.",
# "kab.",
# "kand.",
# "kart.",
# "kat.",
# "ketv.",
# "kh.",
# "kl.",
# "kln.",
# "km.",
# "kn.",
# "koresp.",
# "kpt.",
# "kr.",
# "kt.",
# "kub.",
# "kun.",
# "kv.",
# "kyš.",
# "l. e. p.",
# "l.e.p.",
# "lenk.",
# "liet.",
# "lot.",
# "lt.",
# "ltd.",
# "ltn.",
# "m.",
# "m.e..",
# "m.m.",
# "mat.",
# "med.",
# "mgnt.",
# "mgr.",
# "min.",
# "mjr.",
# "ml.",
# "mln.",
# "mlrd.",
# "mob.",
# "mok.",
# "moksl.",
# "mokyt.",
# "mot.",
# "mr.",
# "mst.",
# "mstl.",
# "mėn.",
# "nkt.",
# "no.",
# "nr.",
# "ntk.",
# "nuotr.",
# "op.",
# "org.",
# "orig.",
# "p.",
# "p.d.",
# "p.m.e.",
# "p.s.",
# "pab.",
# "pan.",
# "past.",
# "pav.",
# "pavad.",
# "per.",
# "perd.",
# "pirm.",
# "pl.",
# "plg.",
# "plk.",
# "pr.",
# "pr.Kr.",
# "pranc.",
# "proc.",
# "prof.",
# "prom.",
# "prot.",
# "psl.",
# "pss.",
# "pvz.",
# "pšt.",
# "r.",
# "raj.",
# "red.",
# "rez.",
# "rež.",
# "rus.",
# "rš.",
# "s.",
# "sav.",
# "saviv.",
# "sek.",
# "sekr.",
# "sen.",
# "sh.",
# "sk.",
# "skg.",
# "skv.",
# "skyr.",
# "sp.",
# "spec.",
# "sr.",
# "st.",
# "str.",
# "stud.",
# "sąs.",
# "t.",
# "t. p.",
# "t. y.",
# "t.p.",
# "t.t.",
# "t.y.",
# "techn.",
# "tel.",
# "teol.",
# "th.",
# "tir.",
# "trit.",
# "trln.",
# "tšk.",
# "tūks.",
# "tūkst.",
# "up.",
# "upl.",
# "v.s.",
# "vad.",
# "val.",
# "valg.",
# "ved.",
# "vert.",
# "vet.",
# "vid.",
# "virš.",
# "vlsč.",
# "vnt.",
# "vok.",
# "vs.",
# "vtv.",
# "vv.",
# "vyr.",
# "vyresn.",
# "zool.",
# "Įn",
# "įl.",
# "š.m.",
# "šnek.",
# "šv.",
# "švč.",
# "ž.ū.",
# "žin.",
# "žml.",
# "žr.",
]:
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc mod_base_exceptions = {
exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
}
del mod_base_exceptions["8)"]
TOKENIZER_EXCEPTIONS = update_exc(mod_base_exceptions, _exc)

View File

@ -1,20 +1,20 @@
from typing import Set from typing import Set, Callable
from thinc.api import Config from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import noun_chunks
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
[nlp] [nlp]
lang = "nb" lang = "nb"
stop_words = {"@language_data": "spacy.nb.stop_words"} stop_words = {"@language_data": "spacy.nb.stop_words"}
get_noun_chunks = {"@language_data": "spacy.nb.get_noun_chunks"}
[nlp.lemmatizer] [nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1" @lemmatizers = "spacy.Lemmatizer.v1"
@ -31,12 +31,16 @@ def stop_words() -> Set[str]:
return STOP_WORDS return STOP_WORDS
@registry.language_data("spacy.nb.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
class NorwegianDefaults(Language.Defaults): class NorwegianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
syntax_iterators = SYNTAX_ITERATORS
class Norwegian(Language): class Norwegian(Language):

View File

@ -1,26 +1,18 @@
from typing import Union, Iterator
from ...symbols import NOUN, PROPN, PRON from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors from ...errors import Errors
from ...tokens import Doc, Span
def noun_chunks(doclike): def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
""" """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
Detect base noun phrases from a dependency parse. Works on both Doc and Span. # fmt: off
""" labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
labels = [ # fmt: on
"nsubj",
"nsubj:pass",
"obj",
"iobj",
"ROOT",
"appos",
"nmod",
"nmod:poss",
]
doc = doclike.doc # Ensure works on both Doc and Span. doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed: if not doc.is_parsed:
raise ValueError(Errors.E029) raise ValueError(Errors.E029)
np_deps = [doc.vocab.strings[label] for label in labels] np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add("conj") conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP") np_label = doc.vocab.strings.add("NP")
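The hunk above is truncated, but the shape of such a noun_chunks generator is: walk the parsed document left to right and yield one span per NOUN/PROPN/PRON head whose dependency label is in the allowed set. A deliberately simplified, self-contained approximation using plain tuples instead of Doc/Span and skipping the conjunction and seen-word handling of the real code:

from collections import namedtuple

Token = namedtuple("Token", ["i", "pos", "dep", "left_edge_i"])

NP_DEPS = {"nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"}

def noun_chunks_sketch(tokens):
    for tok in tokens:
        if tok.pos in ("NOUN", "PROPN", "PRON") and tok.dep in NP_DEPS:
            # half-open [start, end) span from the leftmost descendant to the head
            yield (tok.left_edge_i, tok.i + 1)

# "the old dog sleeps": "dog" is an nsubj noun whose left edge is "the"
toks = [Token(0, "DET", "det", 0), Token(1, "ADJ", "amod", 1),
        Token(2, "NOUN", "nsubj", 0), Token(3, "VERB", "ROOT", 0)]
assert list(noun_chunks_sketch(toks)) == [(0, 3)]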


@ -1,4 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA from ...symbols import ORTH, LEMMA
from ...util import update_exc
_exc = {} _exc = {}
@ -218,4 +220,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)


@ -7,9 +7,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_SUFFIXES
from .lemmatizer import DutchLemmatizer from .lemmatizer import DutchLemmatizer
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@ -44,7 +43,7 @@ def create_dutch_lemmatizer(data: Dict[str, dict] = {}) -> DutchLemmatizer:
class DutchDefaults(Language.Defaults): class DutchDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES


@ -1,4 +1,7 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH from ...symbols import ORTH
from ...util import update_exc
# Extensive list of both common and uncommon dutch abbreviations copied from # Extensive list of both common and uncommon dutch abbreviations copied from
# github.com/diasks2/pragmatic_segmenter, a Ruby library for rule-based # github.com/diasks2/pragmatic_segmenter, a Ruby library for rule-based
@ -1602,4 +1605,4 @@ for orth in abbrevs:
_exc[i] = [{ORTH: i}] _exc[i] = [{ORTH: i}]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)


@ -4,10 +4,9 @@ from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
from ...language import Language from ...language import Language
from ...util import update_exc, registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@ -37,7 +36,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
class PortugueseDefaults(Language.Defaults): class PortugueseDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES


@ -1,4 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH from ...symbols import ORTH
from ...util import update_exc
_exc = {} _exc = {}
@ -50,4 +52,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)


@ -5,9 +5,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_SUFFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry from ...util import registry
# Lemma data note: # Lemma data note:
# Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/ # Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/
@ -35,7 +34,7 @@ def stop_words() -> Set[str]:
class RomanianDefaults(Language.Defaults): class RomanianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES


@ -1,4 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH from ...symbols import ORTH
from ...util import update_exc
from .punctuation import _make_ro_variants from .punctuation import _make_ro_variants
@ -91,4 +93,4 @@ for orth in [
_exc[variant] = [{ORTH: variant}] _exc[variant] = [{ORTH: variant}]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)


@ -5,8 +5,7 @@ from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .lemmatizer import RussianLemmatizer from .lemmatizer import RussianLemmatizer
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...util import registry
from ...util import update_exc, registry
from ...language import Language from ...language import Language
@ -42,7 +41,7 @@ def create_russian_lemmatizer() -> RussianLemmatizer:
class RussianDefaults(Language.Defaults): class RussianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
class Russian(Language): class Russian(Language):


@ -1,4 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, NORM from ...symbols import ORTH, LEMMA, NORM
from ...util import update_exc
_exc = {} _exc = {}
@ -63,4 +65,4 @@ for slang_desc in _slang_exc:
_exc[slang_desc[ORTH]] = [slang_desc] _exc[slang_desc[ORTH]] = [slang_desc]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)


@ -4,9 +4,8 @@ from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@ -41,7 +40,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
class SerbianDefaults(Language.Defaults): class SerbianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
class Serbian(Language): class Serbian(Language):


@ -1,4 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, NORM from ...symbols import ORTH, LEMMA, NORM
from ...util import update_exc
_exc = {} _exc = {}
@ -90,4 +92,4 @@ for slang_desc in _slang_exc:
_exc[slang_desc[ORTH]] = [slang_desc] _exc[slang_desc[ORTH]] = [slang_desc]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)


@ -4,10 +4,9 @@ from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry from ...util import registry
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import noun_chunks
# Punctuation stolen from Danish # Punctuation stolen from Danish
from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
@ -18,6 +17,7 @@ DEFAULT_CONFIG = """
lang = "sv" lang = "sv"
stop_words = {"@language_data": "spacy.sv.stop_words"} stop_words = {"@language_data": "spacy.sv.stop_words"}
lex_attr_getters = {"@language_data": "spacy.sv.lex_attr_getters"} lex_attr_getters = {"@language_data": "spacy.sv.lex_attr_getters"}
get_noun_chunks = {"@language_data": "spacy.sv.get_noun_chunks"}
[nlp.lemmatizer] [nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1" @lemmatizers = "spacy.Lemmatizer.v1"
@ -39,11 +39,15 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS return LEX_ATTRS
@registry.language_data("spacy.sv.get_noun_chunks")
def get_noun_chunks() -> Callable:
return noun_chunks
class SwedishDefaults(Language.Defaults): class SwedishDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
syntax_iterators = SYNTAX_ITERATORS
class Swedish(Language): class Swedish(Language):


@ -1,27 +1,18 @@
from typing import Union, Iterator
from ...symbols import NOUN, PROPN, PRON from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors from ...errors import Errors
from ...tokens import Doc, Span
def noun_chunks(doclike): def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
""" """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
Detect base noun phrases from a dependency parse. Works on both Doc and Span. # fmt: off
""" labels = ["nsubj", "nsubj:pass", "dobj", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
labels = [ # fmt: on
"nsubj",
"nsubj:pass",
"dobj",
"obj",
"iobj",
"ROOT",
"appos",
"nmod",
"nmod:poss",
]
doc = doclike.doc # Ensure works on both Doc and Span. doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed: if not doc.is_parsed:
raise ValueError(Errors.E029) raise ValueError(Errors.E029)
np_deps = [doc.vocab.strings[label] for label in labels] np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add("conj") conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP") np_label = doc.vocab.strings.add("NP")


@ -1,4 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import LEMMA, NORM, ORTH, PRON_LEMMA from ...symbols import LEMMA, NORM, ORTH, PRON_LEMMA
from ...util import update_exc
_exc = {} _exc = {}
@ -154,4 +156,4 @@ for orth in ABBREVIATIONS:
for orth in ["i", "m"]: for orth in ["i", "m"]:
_exc[orth + "."] = [{ORTH: orth, LEMMA: orth, NORM: orth}, {ORTH: "."}] _exc[orth + "."] = [{ORTH: orth, LEMMA: orth, NORM: orth}, {ORTH: "."}]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)


@ -1,25 +0,0 @@
from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ
from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ
TAG_MAP = {
"ADV": {POS: ADV},
"NOUN": {POS: NOUN},
"ADP": {POS: ADP},
"PRON": {POS: PRON},
"SCONJ": {POS: SCONJ},
"PROPN": {POS: PROPN},
"DET": {POS: DET},
"SYM": {POS: SYM},
"INTJ": {POS: INTJ},
"PUNCT": {POS: PUNCT},
"NUM": {POS: NUM},
"AUX": {POS: AUX},
"X": {POS: X},
"CONJ": {POS: CONJ},
"CCONJ": {POS: CCONJ},
"ADJ": {POS: ADJ},
"VERB": {POS: VERB},
"PART": {POS: PART},
"_SP": {POS: SPACE},
}


@ -4,9 +4,8 @@ from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@ -36,7 +35,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
class TagalogDefaults(Language.Defaults): class TagalogDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
class Tagalog(Language): class Tagalog(Language):


@ -1,4 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA from ...symbols import ORTH, LEMMA
from ...util import update_exc
_exc = { _exc = {
@ -14,4 +16,4 @@ _exc = {
} }
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)


@ -55,7 +55,6 @@ URL_PATTERN = (
# fmt: on # fmt: on
).strip() ).strip()
TOKEN_MATCH = None
URL_MATCH = re.compile("(?u)" + URL_PATTERN).match URL_MATCH = re.compile("(?u)" + URL_PATTERN).match


@ -3,9 +3,8 @@ from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@ -29,7 +28,7 @@ def stop_words() -> Set[str]:
class TurkishDefaults(Language.Defaults): class TurkishDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
class Turkish(Language): class Turkish(Language):


@ -1,4 +1,7 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM from ...symbols import ORTH, NORM
from ...util import update_exc
_exc = {"sağol": [{ORTH: "sağ"}, {ORTH: "ol", NORM: "olun"}]} _exc = {"sağol": [{ORTH: "sağ"}, {ORTH: "ol", NORM: "olun"}]}
@ -113,4 +116,4 @@ for orth in ["Dr.", "yy."]:
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)


@ -5,9 +5,8 @@ from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@ -29,7 +28,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
class TatarDefaults(Language.Defaults): class TatarDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = tuple(TOKENIZER_INFIXES) infixes = tuple(TOKENIZER_INFIXES)


@ -1,4 +1,7 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, NORM from ...symbols import ORTH, LEMMA, NORM
from ...util import update_exc
_exc = {} _exc = {}
@ -43,4 +46,4 @@ for exc_data in [ # "etc." abbreviations
exc_data[LEMMA] = exc_data[NORM] exc_data[LEMMA] = exc_data[NORM]
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)


@ -4,8 +4,7 @@ from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...util import registry
from ...util import update_exc, registry
from ...language import Language from ...language import Language
from .lemmatizer import UkrainianLemmatizer from .lemmatizer import UkrainianLemmatizer
@ -37,7 +36,7 @@ def create_ukrainian_lemmatizer() -> UkrainianLemmatizer:
class UkrainianDefaults(Language.Defaults): class UkrainianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
class Ukrainian(Language): class Ukrainian(Language):


@ -1,4 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, POS, NORM, NOUN from ...symbols import ORTH, LEMMA, POS, NORM, NOUN
from ...util import update_exc
_exc = {} _exc = {}
@ -21,4 +23,4 @@ for exc_data in [
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)


@ -24,9 +24,7 @@ from .util import link_vectors_to_models, create_default_optimizer, registry
from .util import SimpleFrozenDict from .util import SimpleFrozenDict
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES from .lang.punctuation import TOKENIZER_INFIXES
from .lang.tokenizer_exceptions import TOKEN_MATCH, URL_MATCH from .tokens import Doc
from .lang.tag_map import TAG_MAP
from .tokens import Doc, Span
from .errors import Errors, Warnings from .errors import Errors, Warnings
from .schemas import ConfigSchema from .schemas import ConfigSchema
from .git_info import GIT_VERSION from .git_info import GIT_VERSION
@ -37,6 +35,7 @@ from . import about
from .tokenizer import Tokenizer # noqa: F401 from .tokenizer import Tokenizer # noqa: F401
from .lemmatizer import Lemmatizer # noqa: F401 from .lemmatizer import Lemmatizer # noqa: F401
from .lookups import Lookups # noqa: F401 from .lookups import Lookups # noqa: F401
from .lang import defaults # noqa: F401
ENABLE_PIPELINE_ANALYSIS = False ENABLE_PIPELINE_ANALYSIS = False
@ -46,15 +45,10 @@ DEFAULT_CONFIG = Config().from_disk(DEFAULT_CONFIG_PATH)
class BaseDefaults: class BaseDefaults:
token_match: Optional[Pattern] = TOKEN_MATCH
url_match: Pattern = URL_MATCH
prefixes: Tuple[Pattern, ...] = tuple(TOKENIZER_PREFIXES) prefixes: Tuple[Pattern, ...] = tuple(TOKENIZER_PREFIXES)
suffixes: Tuple[Pattern, ...] = tuple(TOKENIZER_SUFFIXES) suffixes: Tuple[Pattern, ...] = tuple(TOKENIZER_SUFFIXES)
infixes: Tuple[Pattern, ...] = tuple(TOKENIZER_INFIXES) infixes: Tuple[Pattern, ...] = tuple(TOKENIZER_INFIXES)
tag_map: Dict[str, dict] = dict(TAG_MAP)
tokenizer_exceptions: Dict[str, List[dict]] = {} tokenizer_exceptions: Dict[str, List[dict]] = {}
morph_rules: Dict[str, Dict[str, dict]] = {}
syntax_iterators: Dict[str, Callable[[Union[Doc, Span]], Iterator]] = {}
class Language: class Language:
@ -114,13 +108,7 @@ class Language:
if vocab is True: if vocab is True:
vectors_name = meta.get("vectors", {}).get("name") vectors_name = meta.get("vectors", {}).get("name")
vocab = Vocab.from_config( vocab = Vocab.from_config(self._config, vectors_name=vectors_name)
self._config,
vectors_name=vectors_name,
# TODO: what should we do with these?
tag_map=self.Defaults.tag_map,
morph_rules=self.Defaults.morph_rules,
)
else: else:
if (self.lang and vocab.lang) and (self.lang != vocab.lang): if (self.lang and vocab.lang) and (self.lang != vocab.lang):
raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang)) raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
@ -1267,15 +1255,14 @@ class Language:
lex_attr_getters = resolved["nlp"]["lex_attr_getters"] lex_attr_getters = resolved["nlp"]["lex_attr_getters"]
stop_words = resolved["nlp"]["stop_words"] stop_words = resolved["nlp"]["stop_words"]
vocab_data = resolved["nlp"]["vocab_data"] vocab_data = resolved["nlp"]["vocab_data"]
get_noun_chunks = resolved["nlp"]["get_noun_chunks"]
vocab = Vocab.from_config( vocab = Vocab.from_config(
filled, filled,
lemmatizer=lemmatizer, lemmatizer=lemmatizer,
lex_attr_getters=lex_attr_getters, lex_attr_getters=lex_attr_getters,
stop_words=stop_words, stop_words=stop_words,
vocab_data=vocab_data, vocab_data=vocab_data,
# TODO: what should we do with these? get_noun_chunks=get_noun_chunks,
tag_map=cls.Defaults.tag_map,
morph_rules=cls.Defaults.morph_rules,
) )
nlp = cls(vocab, create_tokenizer=create_tokenizer) nlp = cls(vocab, create_tokenizer=create_tokenizer)
pipeline = config.get("components", {}) pipeline = config.get("components", {})


@ -243,6 +243,7 @@ class ConfigSchemaNlp(BaseModel):
stop_words: Sequence[StrictStr] = Field(..., title="Stop words to mark via Token/Lexeme.is_stop") stop_words: Sequence[StrictStr] = Field(..., title="Stop words to mark via Token/Lexeme.is_stop")
lex_attr_getters: Dict[StrictStr, Callable] = Field(..., title="Custom getter functions for lexical attributes (e.g. like_num)") lex_attr_getters: Dict[StrictStr, Callable] = Field(..., title="Custom getter functions for lexical attributes (e.g. like_num)")
vocab_data: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., title="Vocabulary data, e.g. lexeme normalization tables") vocab_data: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., title="Vocabulary data, e.g. lexeme normalization tables")
get_noun_chunks: Optional[Callable] = Field(..., title="Function to extract noun phrases from a Doc")
# fmt: on # fmt: on
class Config: class Config:
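The new get_noun_chunks entry of ConfigSchemaNlp is just an optional callable validated by pydantic. A hedged, stand-alone sketch of what that field amounts to; NlpSchemaSketch is illustrative rather than the real schema, and it assumes pydantic's usual behaviour of rejecting non-callable values for Callable fields:

from typing import Callable, Optional
from pydantic import BaseModel, Field, ValidationError

class NlpSchemaSketch(BaseModel):
    get_noun_chunks: Optional[Callable] = Field(None, title="Function to extract noun phrases from a Doc")

assert NlpSchemaSketch(get_noun_chunks=None).get_noun_chunks is None
assert callable(NlpSchemaSketch(get_noun_chunks=lambda doc: iter(())).get_noun_chunks)
try:
    NlpSchemaSketch(get_noun_chunks="not a function")
    raised = False
except ValidationError:
    raised = True
assert raised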


@ -1,7 +1,7 @@
import numpy import numpy
from spacy.attrs import HEAD, DEP from spacy.attrs import HEAD, DEP
from spacy.symbols import nsubj, dobj, amod, nmod, conj, cc, root from spacy.symbols import nsubj, dobj, amod, nmod, conj, cc, root
from spacy.lang.en.syntax_iterators import SYNTAX_ITERATORS from spacy.lang.en.syntax_iterators import noun_chunks
import pytest import pytest
@ -41,7 +41,7 @@ def test_en_noun_chunks_not_nested(en_vocab):
dtype="uint64", dtype="uint64",
), ),
) )
doc.noun_chunks_iterator = SYNTAX_ITERATORS["noun_chunks"] doc.noun_chunks_iterator = noun_chunks
word_occurred = {} word_occurred = {}
for chunk in doc.noun_chunks: for chunk in doc.noun_chunks:
for word in chunk: for word in chunk:


@ -9,7 +9,7 @@ from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap from preshed.maps cimport PreshMap
cimport cython cimport cython
from typing import Dict, List, Union, Pattern, Optional from typing import Dict, List, Union, Pattern, Optional, Any
import re import re
import warnings import warnings
@ -32,16 +32,16 @@ def create_tokenizer(
# prefixes: Optional[List[Union[str, Pattern]]], # prefixes: Optional[List[Union[str, Pattern]]],
# suffixes: Optional[List[Union[str, Pattern]]], # suffixes: Optional[List[Union[str, Pattern]]],
# infixes: Optional[List[Union[str, Pattern]]], # infixes: Optional[List[Union[str, Pattern]]],
# token_match: Optional[Pattern], # We currently can't validate against Pattern because that will cause
# url_match: Optional[Pattern], # Pydantic to parse value *as* pattern
token_match: Optional[Any] = None,
url_match: Optional[Any] = None,
) -> "Tokenizer": ) -> "Tokenizer":
def tokenizer_factory(nlp): def tokenizer_factory(nlp):
exceptions = nlp.Defaults.tokenizer_exceptions exceptions = nlp.Defaults.tokenizer_exceptions
prefixes = nlp.Defaults.prefixes prefixes = nlp.Defaults.prefixes
suffixes = nlp.Defaults.suffixes suffixes = nlp.Defaults.suffixes
infixes = nlp.Defaults.infixes infixes = nlp.Defaults.infixes
url_match = nlp.Defaults.url_match
token_match = nlp.Defaults.token_match
prefix_search = util.compile_prefix_regex(prefixes).search if prefixes else None prefix_search = util.compile_prefix_regex(prefixes).search if prefixes else None
suffix_search = util.compile_suffix_regex(suffixes).search if suffixes else None suffix_search = util.compile_suffix_regex(suffixes).search if suffixes else None
infix_finditer = util.compile_infix_regex(infixes).finditer if infixes else None infix_finditer = util.compile_infix_regex(infixes).finditer if infixes else None
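In the factory above, token_match and url_match now arrive as loosely typed arguments of the registered function instead of being read from nlp.Defaults, and the returned closure captures them when it builds the tokenizer. An illustrative sketch of that closure pattern; SketchTokenizer and the _NLP/_Defaults stand-ins replace the real Tokenizer and Language objects:

import re
from typing import Any, Optional

class SketchTokenizer:
    def __init__(self, exceptions, token_match=None, url_match=None):
        self.exceptions = exceptions
        self.token_match = token_match
        self.url_match = url_match

def create_tokenizer_sketch(token_match: Optional[Any] = None,
                            url_match: Optional[Any] = None):
    # the returned closure reads the remaining settings off the language defaults
    def tokenizer_factory(nlp):
        exceptions = nlp.Defaults.tokenizer_exceptions
        return SketchTokenizer(exceptions, token_match=token_match, url_match=url_match)
    return tokenizer_factory

class _Defaults:
    tokenizer_exceptions = {}

class _NLP:
    Defaults = _Defaults

factory = create_tokenizer_sketch(url_match=re.compile(r"^https?://\S+$").match)
tokenizer = factory(_NLP())
assert tokenizer.url_match("https://spacy.io") is not None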


@ -89,16 +89,6 @@ cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name)
return get_token_attr(token, feat_name) return get_token_attr(token, feat_name)
def _get_chunker(lang):
try:
cls = util.get_lang_class(lang)
except ImportError:
return None
except KeyError:
return None
return cls.Defaults.syntax_iterators.get("noun_chunks")
cdef class Doc: cdef class Doc:
"""A sequence of Token objects. Access sentences and named entities, export """A sequence of Token objects. Access sentences and named entities, export
annotations to numpy arrays, losslessly serialize to compressed binary annotations to numpy arrays, losslessly serialize to compressed binary
@ -212,7 +202,7 @@ cdef class Doc:
self.tensor = numpy.zeros((0,), dtype="float32") self.tensor = numpy.zeros((0,), dtype="float32")
self.user_data = {} if user_data is None else user_data self.user_data = {} if user_data is None else user_data
self._vector = None self._vector = None
self.noun_chunks_iterator = _get_chunker(self.vocab.lang) self.noun_chunks_iterator = self.vocab.get_noun_chunks
cdef bint has_space cdef bint has_space
if words is None and spaces is not None: if words is None and spaces is not None:
raise ValueError("words must be set if spaces is set") raise ValueError("words must be set if spaces is set")
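With _get_chunker removed, a Doc takes its noun_chunks_iterator straight from vocab.get_noun_chunks rather than looking the language class up by its code. A hedged end-to-end sketch of that wiring; SketchVocab and SketchDoc are stand-ins, and the real Doc.noun_chunks property does more than shown here:

class SketchVocab:
    def __init__(self, get_noun_chunks=None):
        self.get_noun_chunks = get_noun_chunks

class SketchDoc:
    def __init__(self, vocab):
        # mirrors the change above: the chunker comes straight from the vocab
        self.vocab = vocab
        self.noun_chunks_iterator = vocab.get_noun_chunks

    @property
    def noun_chunks(self):
        if self.noun_chunks_iterator is None:
            return iter(())
        return self.noun_chunks_iterator(self)

def dummy_chunker(doclike):
    yield "a noun chunk"

doc = SketchDoc(SketchVocab(get_noun_chunks=dummy_chunker))
assert list(doc.noun_chunks) == ["a noun chunk"]
assert list(SketchDoc(SketchVocab()).noun_chunks) == []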


@ -30,6 +30,7 @@ cdef class Vocab:
cpdef public object vectors cpdef public object vectors
cpdef public object lookups cpdef public object lookups
cpdef public object writing_system cpdef public object writing_system
cpdef public object get_noun_chunks
cdef readonly int length cdef readonly int length
cdef public object data_dir cdef public object data_dir
cdef public object lex_attr_getters cdef public object lex_attr_getters


@ -30,10 +30,10 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab DOCS: https://spacy.io/api/vocab
""" """
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None, def __init__(self, lex_attr_getters=None, lemmatizer=None,
strings=tuple(), lookups=None, vocab_data={}, strings=tuple(), lookups=None, tag_map={}, vocab_data={},
oov_prob=-20., vectors_name=None, writing_system={}, oov_prob=-20., vectors_name=None, writing_system={},
**deprecated_kwargs): get_noun_chunks=None, **deprecated_kwargs):
"""Create the vocabulary. """Create the vocabulary.
lex_attr_getters (dict): A dictionary mapping attribute IDs to lex_attr_getters (dict): A dictionary mapping attribute IDs to
@ -49,7 +49,6 @@ cdef class Vocab:
RETURNS (Vocab): The newly constructed object. RETURNS (Vocab): The newly constructed object.
""" """
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {} lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
tag_map = tag_map if tag_map is not None else {}
if lookups in (None, True, False): if lookups in (None, True, False):
lookups = Lookups() lookups = Lookups()
for name, data in vocab_data.items(): for name, data in vocab_data.items():
@ -71,6 +70,7 @@ cdef class Vocab:
self.vectors = Vectors(name=vectors_name) self.vectors = Vectors(name=vectors_name)
self.lookups = lookups self.lookups = lookups
self.writing_system = writing_system self.writing_system = writing_system
self.get_noun_chunks = get_noun_chunks
@property @property
def lang(self): def lang(self):
@ -424,9 +424,8 @@ cdef class Vocab:
lex_attr_getters=None, lex_attr_getters=None,
stop_words=None, stop_words=None,
vocab_data=None, vocab_data=None,
get_noun_chunks=None,
vectors_name=None, vectors_name=None,
tag_map=None,
morph_rules=None
): ):
"""Create a Vocab from a config and (currently) language defaults, i.e. """Create a Vocab from a config and (currently) language defaults, i.e.
nlp.Defaults. nlp.Defaults.
@ -449,6 +448,9 @@ cdef class Vocab:
if vocab_data is None: if vocab_data is None:
vocab_data_cfg = {"vocab_data": config["nlp"]["vocab_data"]} vocab_data_cfg = {"vocab_data": config["nlp"]["vocab_data"]}
vocab_data = registry.make_from_config(vocab_data_cfg)["vocab_data"] vocab_data = registry.make_from_config(vocab_data_cfg)["vocab_data"]
if get_noun_chunks is None:
noun_chunks_cfg = {"get_noun_chunks": config["nlp"]["get_noun_chunks"]}
get_noun_chunks = registry.make_from_config(noun_chunks_cfg)["get_noun_chunks"]
if lex_attr_getters is None: if lex_attr_getters is None:
lex_attrs_cfg = {"lex_attr_getters": config["nlp"]["lex_attr_getters"]} lex_attrs_cfg = {"lex_attr_getters": config["nlp"]["lex_attr_getters"]}
lex_attr_getters = registry.make_from_config(lex_attrs_cfg)["lex_attr_getters"] lex_attr_getters = registry.make_from_config(lex_attrs_cfg)["lex_attr_getters"]
@ -468,10 +470,8 @@ cdef class Vocab:
vocab_data=vocab_data, vocab_data=vocab_data,
lemmatizer=lemmatizer, lemmatizer=lemmatizer,
writing_system=writing_system, writing_system=writing_system,
tag_map=tag_map, get_noun_chunks=get_noun_chunks
) )
if morph_rules is not None:
vocab.morphology.load_morph_exceptions(morph_rules)
if vocab.vectors.name is None and vectors_name: if vocab.vectors.name is None and vectors_name:
vocab.vectors.name = vectors_name vocab.vectors.name = vectors_name
return vocab return vocab