mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Tidy up and move noun_chunks, token_match, url_match
This commit is contained in:
parent
7fc4dadd22
commit
b507f61629
|
@ -3,10 +3,13 @@ lang = null
|
|||
stop_words = []
|
||||
lex_attr_getters = {}
|
||||
vocab_data = {}
|
||||
get_noun_chunks = null
|
||||
pipeline = []
|
||||
|
||||
[nlp.tokenizer]
|
||||
@tokenizers = "spacy.Tokenizer.v1"
|
||||
token_match = null
|
||||
url_match = {"@language_data": "spacy.xx.url_match"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
|
|
@ -4,11 +4,9 @@ from thinc.api import Config
|
|||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...util import update_exc, registry
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -35,7 +33,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
|||
|
||||
|
||||
class ArabicDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
|
||||
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, LEMMA
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
_exc = {}
|
||||
|
@ -43,4 +45,4 @@ for exc_data in [
|
|||
for exc_data in [{LEMMA: "تلفون", ORTH: "ت."}, {LEMMA: "صندوق بريد", ORTH: "ص.ب"}]:
|
||||
_exc[exc_data[ORTH]] = [exc_data]
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
|
|
|
@ -4,9 +4,8 @@ from thinc.api import Config
|
|||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...util import update_exc, registry
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -30,7 +29,7 @@ def stop_words() -> Set[str]:
|
|||
|
||||
|
||||
class BengaliDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, LEMMA
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
_exc = {}
|
||||
|
@ -21,4 +23,4 @@ for exc_data in [
|
|||
_exc[exc_data[ORTH]] = [exc_data]
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
|
|
|
@ -4,9 +4,8 @@ from thinc.api import Config
|
|||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...util import update_exc, registry
|
||||
from ...util import registry
|
||||
from .punctuation import TOKENIZER_INFIXES
|
||||
|
||||
|
||||
|
@ -37,7 +36,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
|||
|
||||
|
||||
class CatalanDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
infixes = TOKENIZER_INFIXES
|
||||
|
||||
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, LEMMA
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
_exc = {}
|
||||
|
@ -35,4 +37,4 @@ for h in range(1, 12 + 1):
|
|||
_exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "p.m."}]
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
|
|
|
@ -5,9 +5,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...util import update_exc, registry
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -42,7 +41,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
|||
|
||||
|
||||
class DanishDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
infixes = TOKENIZER_INFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
|
||||
|
|
|
@ -2,7 +2,9 @@
|
|||
Tokenizer Exceptions.
|
||||
Source: https://forkortelse.dk/ and various others.
|
||||
"""
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, LEMMA, NORM
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
_exc = {}
|
||||
|
@ -576,4 +578,4 @@ for h in range(1, 31 + 1):
|
|||
_custom_base_exc = {"i.": [{ORTH: "i", LEMMA: "i", NORM: "i"}, {ORTH: "."}]}
|
||||
_exc.update(_custom_base_exc)
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
|
|
|
@ -1,20 +1,20 @@
|
|||
from typing import Set
|
||||
from typing import Set, Callable
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||
from .punctuation import TOKENIZER_INFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from .syntax_iterators import noun_chunks
|
||||
from ...language import Language
|
||||
from ...util import update_exc, registry
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "de"
|
||||
stop_words = {"@language_data": "spacy.de.stop_words"}
|
||||
get_noun_chunks = {"@language_data": "spacy.de.get_noun_chunks"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
@ -36,12 +36,16 @@ def stop_words() -> Set[str]:
|
|||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.de.get_noun_chunks")
def get_noun_chunks() -> Callable:
    """Return the German noun-chunk iterator.

    Registered as "spacy.de.get_noun_chunks" so the ``[nlp]`` section of the
    default config can reference it by name. The ``noun_chunks`` function
    imported from ``.syntax_iterators`` is handed back as-is; it is not
    called here.
    """
    return noun_chunks
|
||||
|
||||
|
||||
class GermanDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
|
||||
|
||||
class German(Language):
|
||||
|
|
|
@ -1,39 +1,26 @@
|
|||
from typing import Union, Iterator
|
||||
|
||||
from ...symbols import NOUN, PROPN, PRON
|
||||
from ...errors import Errors
|
||||
from ...tokens import Doc, Span
|
||||
|
||||
|
||||
def noun_chunks(doclike):
|
||||
"""
|
||||
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||
"""
|
||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
||||
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
||||
# this iterator extracts spans headed by NOUNs starting from the left-most
|
||||
# syntactic dependent until the NOUN itself for close apposition and
|
||||
# measurement construction, the span is sometimes extended to the right of
|
||||
# the NOUN. Example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee"
|
||||
# and not just "eine Tasse", same for "das Thema Familie".
|
||||
labels = [
|
||||
"sb",
|
||||
"oa",
|
||||
"da",
|
||||
"nk",
|
||||
"mo",
|
||||
"ag",
|
||||
"ROOT",
|
||||
"root",
|
||||
"cj",
|
||||
"pd",
|
||||
"og",
|
||||
"app",
|
||||
]
|
||||
# fmt: off
|
||||
labels = ["sb", "oa", "da", "nk", "mo", "ag", "ROOT", "root", "cj", "pd", "og", "app"]
|
||||
# fmt: on
|
||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||
|
||||
if not doc.is_parsed:
|
||||
raise ValueError(Errors.E029)
|
||||
|
||||
np_label = doc.vocab.strings.add("NP")
|
||||
np_deps = set(doc.vocab.strings.add(label) for label in labels)
|
||||
close_app = doc.vocab.strings.add("nk")
|
||||
|
||||
rbracket = 0
|
||||
for i, word in enumerate(doclike):
|
||||
if i < rbracket:
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
_exc = {
|
||||
|
@ -254,4 +256,4 @@ for orth in [
|
|||
_exc[orth] = [{ORTH: orth}]
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
|
|
9
spacy/lang/defaults.py
Normal file
9
spacy/lang/defaults.py
Normal file
|
@ -0,0 +1,9 @@
|
|||
from typing import Pattern
|
||||
|
||||
from .tokenizer_exceptions import URL_MATCH
|
||||
from ..util import registry
|
||||
|
||||
|
||||
@registry.language_data("spacy.xx.url_match")
def url_match() -> Pattern:
    """Return the shared URL matcher used by the default tokenizer config.

    Registered as "spacy.xx.url_match"; the value is the ``URL_MATCH`` object
    imported from ``.tokenizer_exceptions``, returned unchanged.
    """
    # NOTE(review): confirm URL_MATCH is a compiled Pattern and not a bound
    # ``.match`` method, in which case the annotation should be Callable.
    return URL_MATCH
|
|
@ -5,11 +5,10 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .lemmatizer import GreekLemmatizer
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from .syntax_iterators import noun_chunks
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...util import update_exc, registry
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -17,6 +16,7 @@ DEFAULT_CONFIG = """
|
|||
lang = "el"
|
||||
stop_words = {"@language_data": "spacy.el.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.el.lex_attr_getters"}
|
||||
get_noun_chunks = {"@language_data": "spacy.el.get_noun_chunks"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.GreekLemmatizer.v1"
|
||||
|
@ -38,6 +38,11 @@ def create_greek_lemmatizer(data: Dict[str, dict] = {}) -> GreekLemmatizer:
|
|||
return GreekLemmatizer(data=data)
|
||||
|
||||
|
||||
@registry.language_data("spacy.el.get_noun_chunks")
def get_noun_chunks() -> Callable:
    """Return the Greek noun-chunk iterator.

    Registered as "spacy.el.get_noun_chunks" so the default config can refer
    to it by name. Hands back the ``noun_chunks`` function imported from
    ``.syntax_iterators`` without calling it.
    """
    return noun_chunks
|
||||
|
||||
|
||||
@registry.language_data("spacy.el.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
@ -49,11 +54,10 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
|||
|
||||
|
||||
class GreekDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
|
||||
|
||||
class Greek(Language):
|
||||
|
|
|
@ -1,21 +1,20 @@
|
|||
from typing import Union, Iterator
|
||||
|
||||
from ...symbols import NOUN, PROPN, PRON
|
||||
from ...errors import Errors
|
||||
from ...tokens import Doc, Span
|
||||
|
||||
|
||||
def noun_chunks(doclike):
|
||||
"""
|
||||
Detect base noun phrases. Works on both Doc and Span.
|
||||
"""
|
||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
||||
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
||||
# It follows the logic of the noun chunks finder of English language,
|
||||
# adjusted to some Greek language special characteristics.
|
||||
# obj tag corrects some DEP tagger mistakes.
|
||||
# Further improvement of the models will eliminate the need for this tag.
|
||||
labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"]
|
||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||
|
||||
if not doc.is_parsed:
|
||||
raise ValueError(Errors.E029)
|
||||
|
||||
np_deps = [doc.vocab.strings.add(label) for label in labels]
|
||||
conj = doc.vocab.strings.add("conj")
|
||||
nmod = doc.vocab.strings.add("nmod")
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,5 +1,6 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, LEMMA, NORM
|
||||
|
||||
from ...util import update_exc
|
||||
|
||||
_exc = {}
|
||||
|
||||
|
@ -392,4 +393,4 @@ for orth in [
|
|||
]:
|
||||
_exc[orth] = [{ORTH: orth}]
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
|
|
|
@ -4,13 +4,12 @@ from thinc.api import Config
|
|||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from .syntax_iterators import noun_chunks
|
||||
from .lemmatizer import is_base_form
|
||||
from .punctuation import TOKENIZER_INFIXES
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...lemmatizer import Lemmatizer
|
||||
from ...util import update_exc, registry
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -18,6 +17,7 @@ DEFAULT_CONFIG = """
|
|||
lang = "en"
|
||||
stop_words = {"@language_data": "spacy.en.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.en.lex_attr_getters"}
|
||||
get_noun_chunks = {"@language_data": "spacy.en.get_noun_chunks"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.EnglishLemmatizer.v1"
|
||||
|
@ -49,9 +49,13 @@ def create_lemmatizer(data: Dict[str, dict] = {}) -> "Lemmatizer":
|
|||
return Lemmatizer(data=data, is_base_form=is_base_form)
|
||||
|
||||
|
||||
@registry.language_data("spacy.en.get_noun_chunks")
def get_noun_chunks() -> Callable:
    """Return the English noun-chunk iterator.

    Registered as "spacy.en.get_noun_chunks" so the default config can refer
    to it by name. Hands back the ``noun_chunks`` function imported from
    ``.syntax_iterators`` without calling it.
    """
    return noun_chunks
|
||||
|
||||
|
||||
class EnglishDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
infixes = TOKENIZER_INFIXES
|
||||
|
||||
|
||||
|
|
|
@ -1,27 +1,18 @@
|
|||
from typing import Union, Iterator
|
||||
|
||||
from ...symbols import NOUN, PROPN, PRON
|
||||
from ...errors import Errors
|
||||
from ...tokens import Doc, Span
|
||||
|
||||
|
||||
def noun_chunks(doclike):
|
||||
"""
|
||||
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||
"""
|
||||
labels = [
|
||||
"nsubj",
|
||||
"dobj",
|
||||
"nsubjpass",
|
||||
"pcomp",
|
||||
"pobj",
|
||||
"dative",
|
||||
"appos",
|
||||
"attr",
|
||||
"ROOT",
|
||||
]
|
||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
||||
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
||||
# fmt: off
|
||||
labels = ["nsubj", "dobj", "nsubjpass", "pcomp", "pobj", "dative", "appos", "attr", "ROOT"]
|
||||
# fmt: on
|
||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||
|
||||
if not doc.is_parsed:
|
||||
raise ValueError(Errors.E029)
|
||||
|
||||
np_deps = [doc.vocab.strings.add(label) for label in labels]
|
||||
conj = doc.vocab.strings.add("conj")
|
||||
np_label = doc.vocab.strings.add("NP")
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
_exc = {}
|
||||
|
@ -555,4 +557,4 @@ for string in _exclude:
|
|||
_exc.pop(string)
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
|
|
|
@ -4,11 +4,10 @@ from thinc.config import Config
|
|||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from .syntax_iterators import noun_chunks
|
||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...util import update_exc, registry
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -16,6 +15,7 @@ DEFAULT_CONFIG = """
|
|||
lang = "es"
|
||||
stop_words = {"@language_data": "spacy.es.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.es.lex_attr_getters"}
|
||||
get_noun_chunks = {"@language_data": "spacy.es.get_noun_chunks"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
@ -32,6 +32,11 @@ tables = ["lexeme_cluster", "lexeme_prob", "lexeme_settings"]
|
|||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.es.get_noun_chunks")
def get_noun_chunks() -> Callable:
    """Return the Spanish noun-chunk iterator.

    Registered as "spacy.es.get_noun_chunks" so the default config can refer
    to it by name. Hands back the ``noun_chunks`` function imported from
    ``.syntax_iterators`` without calling it.
    """
    return noun_chunks
|
||||
|
||||
|
||||
@registry.language_data("spacy.es.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
@ -43,10 +48,9 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
|||
|
||||
|
||||
class SpanishDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
infixes = TOKENIZER_INFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
|
||||
|
||||
class Spanish(Language):
|
||||
|
|
|
@ -1,13 +1,15 @@
|
|||
from typing import Union, Iterator, Optional, List, Tuple
|
||||
|
||||
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
|
||||
from ...errors import Errors
|
||||
from ...tokens import Doc, Span, Token
|
||||
|
||||
|
||||
def noun_chunks(doclike):
|
||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
||||
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
||||
doc = doclike.doc
|
||||
|
||||
if not doc.is_parsed:
|
||||
raise ValueError(Errors.E029)
|
||||
|
||||
if not len(doc):
|
||||
return
|
||||
np_label = doc.vocab.strings.add("NP")
|
||||
|
@ -28,18 +30,24 @@ def noun_chunks(doclike):
|
|||
token = next_token(token)
|
||||
|
||||
|
||||
def is_verb_token(token):
|
||||
def is_verb_token(token: Token) -> bool:
|
||||
return token.pos in [VERB, AUX]
|
||||
|
||||
|
||||
def next_token(token):
|
||||
def next_token(token: Token) -> Optional[Token]:
|
||||
try:
|
||||
return token.nbor()
|
||||
except IndexError:
|
||||
return None
|
||||
|
||||
|
||||
def noun_bounds(doc, root, np_left_deps, np_right_deps, stop_deps):
|
||||
def noun_bounds(
|
||||
doc: Doc,
|
||||
root: Token,
|
||||
np_left_deps: List[str],
|
||||
np_right_deps: List[str],
|
||||
stop_deps: List[str],
|
||||
) -> Tuple[Token, Token]:
|
||||
left_bound = root
|
||||
for token in reversed(list(root.lefts)):
|
||||
if token.dep in np_left_deps:
|
||||
|
@ -50,12 +58,8 @@ def noun_bounds(doc, root, np_left_deps, np_right_deps, stop_deps):
|
|||
left, right = noun_bounds(
|
||||
doc, token, np_left_deps, np_right_deps, stop_deps
|
||||
)
|
||||
if list(
|
||||
filter(
|
||||
lambda t: is_verb_token(t) or t.dep in stop_deps,
|
||||
doc[left_bound.i : right.i],
|
||||
)
|
||||
):
|
||||
filter_func = lambda t: is_verb_token(t) or t.dep in stop_deps
|
||||
if list(filter(filter_func, doc[left_bound.i : right.i],)):
|
||||
break
|
||||
else:
|
||||
right_bound = right
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, LEMMA, NORM, PRON_LEMMA
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
_exc = {
|
||||
|
@ -73,4 +75,4 @@ for orth in [
|
|||
_exc[orth] = [{ORTH: orth}]
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
|
|
|
@ -2,12 +2,12 @@ from typing import Set, Dict, Callable, Any
|
|||
from thinc.api import Config
|
||||
|
||||
from ...language import Language
|
||||
from ...util import update_exc, registry
|
||||
from ...util import registry
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from .syntax_iterators import noun_chunks
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -15,6 +15,7 @@ DEFAULT_CONFIG = """
|
|||
lang = "fa"
|
||||
stop_words = {"@language_data": "spacy.fa.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.fa.lex_attr_getters"}
|
||||
get_noun_chunks = {"@language_data": "spacy.fa.get_noun_chunks"}
|
||||
|
||||
[nlp.writing_system]
|
||||
direction = "rtl"
|
||||
|
@ -41,10 +42,14 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
|||
return LEX_ATTRS
|
||||
|
||||
|
||||
@registry.language_data("spacy.fa.get_noun_chunks")
def get_noun_chunks() -> Callable:
    """Return the Persian noun-chunk iterator.

    Registered under the name "spacy.fa.get_noun_chunks". Hands back the
    ``noun_chunks`` function imported from this package's
    ``.syntax_iterators`` module without calling it.
    """
    return noun_chunks
|
||||
|
||||
|
||||
class PersianDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = update_exc(TOKENIZER_EXCEPTIONS)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
|
||||
|
||||
class Persian(Language):
|
||||
|
|
|
@ -5,9 +5,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...util import update_exc, registry
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -31,7 +30,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
|||
class FinnishDefaults(Language.Defaults):
|
||||
infixes = TOKENIZER_INFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
|
||||
|
||||
class Finnish(Language):
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, LEMMA
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
_exc = {}
|
||||
|
@ -78,4 +80,4 @@ for exc_data in [
|
|||
_exc[exc_data[ORTH]] = [exc_data]
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from typing import Set, Dict, Callable, Any, Pattern
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
|
||||
|
@ -7,10 +7,9 @@ from .punctuation import TOKENIZER_SUFFIXES
|
|||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .lemmatizer import FrenchLemmatizer, is_base_form
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from .syntax_iterators import noun_chunks
|
||||
from ...language import Language
|
||||
from ...util import update_exc, registry
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -18,6 +17,11 @@ DEFAULT_CONFIG = """
|
|||
lang = "fr"
|
||||
stop_words = {"@language_data": "spacy.fr.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.fr.lex_attr_getters"}
|
||||
get_noun_chunks = {"@language_data": "spacy.fr.get_noun_chunks"}
|
||||
|
||||
[nlp.tokenizer]
|
||||
@tokenizers = "spacy.Tokenizer.v1"
|
||||
token_match = {"@language_data": "spacy.fr.token_match"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.FrenchLemmatizer.v1"
|
||||
|
@ -34,6 +38,11 @@ def create_french_lemmatizer(data: Dict[str, dict] = {}) -> FrenchLemmatizer:
|
|||
return FrenchLemmatizer(data=data, is_base_form=is_base_form)
|
||||
|
||||
|
||||
@registry.language_data("spacy.fr.token_match")
def token_match() -> Callable:
    """Return the French tokenizer's ``token_match`` callable.

    Registered as "spacy.fr.token_match" so the ``[nlp.tokenizer]`` config
    section can reference it by name. ``TOKEN_MATCH`` is built in
    ``.tokenizer_exceptions`` as ``re.compile(...).match``, i.e. the bound
    ``match`` method of a compiled regular expression rather than the
    pattern object itself, so the return type is a callable, not a Pattern.
    """
    return TOKEN_MATCH
|
||||
|
||||
|
||||
@registry.language_data("spacy.fr.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
@ -44,13 +53,16 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
|||
return LEX_ATTRS
|
||||
|
||||
|
||||
@registry.language_data("spacy.fr.get_noun_chunks")
def get_noun_chunks() -> Callable:
    """Return the French noun-chunk iterator.

    Registered as "spacy.fr.get_noun_chunks" so the default config can refer
    to it by name. Hands back the ``noun_chunks`` function imported from
    ``.syntax_iterators`` without calling it.
    """
    return noun_chunks
|
||||
|
||||
|
||||
class FrenchDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
token_match = TOKEN_MATCH
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
|
||||
|
||||
class French(Language):
|
||||
|
|
|
@ -1,26 +1,18 @@
|
|||
from typing import Union, Iterator
|
||||
|
||||
from ...symbols import NOUN, PROPN, PRON
|
||||
from ...errors import Errors
|
||||
from ...tokens import Doc, Span
|
||||
|
||||
|
||||
def noun_chunks(doclike):
|
||||
"""
|
||||
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||
"""
|
||||
labels = [
|
||||
"nsubj",
|
||||
"nsubj:pass",
|
||||
"obj",
|
||||
"iobj",
|
||||
"ROOT",
|
||||
"appos",
|
||||
"nmod",
|
||||
"nmod:poss",
|
||||
]
|
||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
||||
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
||||
# fmt: off
|
||||
labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
|
||||
# fmt: on
|
||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||
|
||||
if not doc.is_parsed:
|
||||
raise ValueError(Errors.E029)
|
||||
|
||||
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||
conj = doc.vocab.strings.add("conj")
|
||||
np_label = doc.vocab.strings.add("NP")
|
||||
|
|
|
@ -1,8 +1,11 @@
|
|||
import re
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from .punctuation import ELISION, HYPHENS
|
||||
from ..char_classes import ALPHA_LOWER, ALPHA
|
||||
from ...symbols import ORTH, LEMMA
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
# not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer
|
||||
# from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
|
||||
|
@ -452,7 +455,7 @@ _regular_exp += [
|
|||
]
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
TOKEN_MATCH = re.compile(
|
||||
"(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp)
|
||||
).match
|
||||
|
|
|
@ -3,9 +3,8 @@ from thinc.api import Config
|
|||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...util import update_exc, registry
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -21,7 +20,7 @@ def stop_words() -> Set[str]:
|
|||
|
||||
|
||||
class IrishDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
|
||||
|
||||
class Irish(Language):
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import POS, DET, ADP, CCONJ, ADV, NOUN, X, AUX
|
||||
from ...symbols import ORTH, LEMMA, NORM
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
_exc = {
|
||||
|
@ -81,4 +83,4 @@ for orth in ["d'", "D'"]:
|
|||
_exc[orth] = [{ORTH: orth}]
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
|
|
|
@ -4,7 +4,7 @@ from thinc.api import Config
|
|||
from .stop_words import STOP_WORDS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...util import update_exc, registry
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -25,7 +25,7 @@ def stop_words() -> Set[str]:
|
|||
|
||||
|
||||
class HebrewDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
tokenizer_exceptions = BASE_EXCEPTIONS
|
||||
|
||||
|
||||
class Hebrew(Language):
|
||||
|
|
|
@ -4,7 +4,7 @@ from thinc.api import Config
|
|||
from .stop_words import STOP_WORDS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...util import update_exc, registry
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -28,7 +28,7 @@ def stop_words() -> Set[str]:
|
|||
|
||||
|
||||
class CroatianDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
tokenizer_exceptions = BASE_EXCEPTIONS
|
||||
|
||||
|
||||
class Croatian(Language):
|
||||
|
|
|
@ -1,12 +1,11 @@
|
|||
from typing import Set
|
||||
from typing import Callable, Pattern, Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...util import update_exc, registry
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -14,6 +13,10 @@ DEFAULT_CONFIG = """
|
|||
lang = "hu"
|
||||
stop_words = {"@language_data": "spacy.hu.stop_words"}
|
||||
|
||||
[nlp.tokenizer]
|
||||
@tokenizers = "spacy.Tokenizer.v1"
|
||||
token_match = {"@language_data": "spacy.hu.token_match"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
|
@ -29,12 +32,16 @@ def stop_words() -> Set[str]:
|
|||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.hu.token_match")
def token_match() -> Callable:
    """Return the Hungarian tokenizer's ``token_match`` callable.

    Registered as "spacy.hu.token_match" so the ``[nlp.tokenizer]`` config
    section can reference it by name. ``TOKEN_MATCH`` is built in
    ``.tokenizer_exceptions`` as ``re.compile(...).match``, i.e. the bound
    ``match`` method of a compiled regular expression rather than the
    pattern object itself, so the return type is a callable, not a Pattern.
    """
    return TOKEN_MATCH
|
||||
|
||||
|
||||
class HungarianDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
token_match = TOKEN_MATCH
|
||||
|
||||
|
||||
class Hungarian(Language):
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
import re
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..punctuation import ALPHA_LOWER, CURRENCY
|
||||
from ...symbols import ORTH
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
_exc = {}
|
||||
|
@ -644,5 +646,5 @@ _nums = r"(({ne})|({t})|({on})|({c}))({s})?".format(
|
|||
)
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match
|
||||
|
|
|
@ -5,10 +5,9 @@ from .stop_words import STOP_WORDS
|
|||
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from .syntax_iterators import noun_chunks
|
||||
from ...language import Language
|
||||
from ...util import update_exc, registry
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -16,6 +15,7 @@ DEFAULT_CONFIG = """
|
|||
lang = "id"
|
||||
stop_words = {"@language_data": "spacy.id.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.id.lex_attr_getters"}
|
||||
get_noun_chunks = {"@language_data": "spacy.id.get_noun_chunks"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
@ -42,12 +42,16 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
|||
return LEX_ATTRS
|
||||
|
||||
|
||||
@registry.language_data("spacy.id.get_noun_chunks")
|
||||
def get_noun_chunks() -> Callable:
|
||||
return noun_chunks
|
||||
|
||||
|
||||
class IndonesianDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
|
||||
|
||||
class Indonesian(Language):
|
||||
|
|
|
@ -1,26 +1,20 @@
|
|||
from typing import Union, Iterator
|
||||
|
||||
from ...symbols import NOUN, PROPN, PRON
|
||||
from ...errors import Errors
|
||||
from ...tokens import Doc, Span
|
||||
|
||||
|
||||
def noun_chunks(doclike):
|
||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
||||
"""
|
||||
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||
"""
|
||||
labels = [
|
||||
"nsubj",
|
||||
"nsubj:pass",
|
||||
"obj",
|
||||
"iobj",
|
||||
"ROOT",
|
||||
"appos",
|
||||
"nmod",
|
||||
"nmod:poss",
|
||||
]
|
||||
# fmt: off
|
||||
labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
|
||||
# fmt: on
|
||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||
|
||||
if not doc.is_parsed:
|
||||
raise ValueError(Errors.E029)
|
||||
|
||||
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||
conj = doc.vocab.strings.add("conj")
|
||||
np_label = doc.vocab.strings.add("NP")
|
||||
|
|
|
@ -1,5 +1,8 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ._tokenizer_exceptions_list import ID_BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, LEMMA, NORM
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
# Daftar singkatan dan Akronim dari:
|
||||
# https://id.wiktionary.org/wiki/Wiktionary:Daftar_singkatan_dan_akronim_bahasa_Indonesia#A
|
||||
|
@ -221,4 +224,4 @@ for orth in [
|
|||
]:
|
||||
_exc[orth] = [{ORTH: orth}]
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
|
|
|
@ -4,9 +4,8 @@ from thinc.api import Config
|
|||
from .stop_words import STOP_WORDS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...util import update_exc, registry
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -30,7 +29,7 @@ def stop_words() -> Set[str]:
|
|||
|
||||
|
||||
class ItalianDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
stop_words = STOP_WORDS
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
|
|
|
@ -1,4 +1,7 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, LEMMA
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
_exc = {
|
||||
"all'art.": [{ORTH: "all'"}, {ORTH: "art."}],
|
||||
|
@ -52,4 +55,4 @@ for orth in [
|
|||
]:
|
||||
_exc[orth] = [{ORTH: orth}]
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
from typing import Optional, Union, Dict, Any, Set
|
||||
from typing import Optional, Union, Dict, Any, Set, Callable
|
||||
from pathlib import Path
|
||||
import srsly
|
||||
from collections import namedtuple
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from .syntax_iterators import noun_chunks
|
||||
from .tag_map import TAG_MAP
|
||||
from .tag_orth_map import TAG_ORTH_MAP
|
||||
from .tag_bigram_map import TAG_BIGRAM_MAP
|
||||
|
@ -22,6 +22,7 @@ DEFAULT_CONFIG = """
|
|||
[nlp]
|
||||
lang = "ja"
|
||||
stop_words = {"@language_data": "spacy.ja.stop_words"}
|
||||
get_noun_chunks = {"@language_data": "spacy.ja.get_noun_chunks"}
|
||||
|
||||
[nlp.tokenizer]
|
||||
@tokenizers = "spacy.JapaneseTokenizer.v1"
|
||||
|
@ -39,6 +40,11 @@ def stop_words() -> Set[str]:
|
|||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.ja.get_noun_chunks")
|
||||
def get_noun_chunks() -> Callable:
|
||||
return noun_chunks
|
||||
|
||||
|
||||
@registry.tokenizers("spacy.JapaneseTokenizer.v1")
|
||||
def create_japanese_tokenizer(split_mode: Optional[str] = None):
|
||||
def japanese_tokenizer_factory(nlp):
|
||||
|
@ -50,6 +56,8 @@ def create_japanese_tokenizer(split_mode: Optional[str] = None):
|
|||
class JapaneseTokenizer(DummyTokenizer):
|
||||
def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None:
|
||||
self.vocab = nlp.vocab
|
||||
# TODO: is this the right way to do it?
|
||||
self.vocab.morphology.load_tag_map(TAG_MAP)
|
||||
self.split_mode = split_mode
|
||||
self.tokenizer = try_sudachi_import(self.split_mode)
|
||||
|
||||
|
@ -171,14 +179,8 @@ class JapaneseTokenizer(DummyTokenizer):
|
|||
return self
|
||||
|
||||
|
||||
class JapaneseDefaults(Language.Defaults):
|
||||
tag_map = TAG_MAP
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
|
||||
|
||||
class Japanese(Language):
|
||||
lang = "ja"
|
||||
Defaults = JapaneseDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
|
|
|
@ -1,33 +1,23 @@
|
|||
from typing import Union, Iterator
|
||||
|
||||
from ...symbols import NOUN, PROPN, PRON, VERB
|
||||
|
||||
# XXX this can probably be pruned a bit
|
||||
labels = [
|
||||
"nsubj",
|
||||
"nmod",
|
||||
"dobj",
|
||||
"nsubjpass",
|
||||
"pcomp",
|
||||
"pobj",
|
||||
"obj",
|
||||
"obl",
|
||||
"dative",
|
||||
"appos",
|
||||
"attr",
|
||||
"ROOT",
|
||||
]
|
||||
from ...tokens import Doc, Span
|
||||
|
||||
|
||||
def noun_chunks(obj):
|
||||
"""
|
||||
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||
"""
|
||||
# TODO: this can probably be pruned a bit
|
||||
# fmt: off
|
||||
labels = ["nsubj", "nmod", "dobj", "nsubjpass", "pcomp", "pobj", "obj", "obl", "dative", "appos", "attr", "ROOT"]
|
||||
# fmt: on
|
||||
|
||||
doc = obj.doc # Ensure works on both Doc and Span.
|
||||
|
||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
||||
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||
np_deps = [doc.vocab.strings.add(label) for label in labels]
|
||||
doc.vocab.strings.add("conj")
|
||||
np_label = doc.vocab.strings.add("NP")
|
||||
seen = set()
|
||||
for i, word in enumerate(obj):
|
||||
for i, word in enumerate(doclike):
|
||||
if word.pos not in (NOUN, PROPN, PRON):
|
||||
continue
|
||||
# Prevent nested chunks from being produced
|
||||
|
@ -37,12 +27,10 @@ def noun_chunks(obj):
|
|||
unseen = [w.i for w in word.subtree if w.i not in seen]
|
||||
if not unseen:
|
||||
continue
|
||||
|
||||
# this takes care of particles etc.
|
||||
seen.update(j.i for j in word.subtree)
|
||||
# This avoids duplicating embedded clauses
|
||||
seen.update(range(word.i + 1))
|
||||
|
||||
# if the head of this is a verb, mark that and rights seen
|
||||
# Don't do the subtree as that can hide other phrases
|
||||
if word.head.pos == VERB:
|
||||
|
|
|
@ -40,6 +40,8 @@ def create_korean_tokenizer():
|
|||
class KoreanTokenizer(DummyTokenizer):
|
||||
def __init__(self, nlp: Optional[Language] = None):
|
||||
self.vocab = nlp.vocab
|
||||
# TODO: is this the right way to do it?
|
||||
self.vocab.morphology.load_tag_map(TAG_MAP)
|
||||
MeCab = try_mecab_import()
|
||||
self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")
|
||||
|
||||
|
@ -72,13 +74,8 @@ class KoreanTokenizer(DummyTokenizer):
|
|||
yield {"surface": surface, "lemma": lemma, "tag": tag}
|
||||
|
||||
|
||||
class KoreanDefaults(Language.Defaults):
|
||||
tag_map = TAG_MAP
|
||||
|
||||
|
||||
class Korean(Language):
|
||||
lang = "ko"
|
||||
Defaults = KoreanDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
|
|
|
@ -5,9 +5,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|||
from .punctuation import TOKENIZER_INFIXES
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .stop_words import STOP_WORDS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...util import update_exc, registry
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -42,7 +41,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
|||
|
||||
|
||||
class LuxembourgishDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
infixes = TOKENIZER_INFIXES
|
||||
|
||||
|
||||
|
|
|
@ -1,4 +1,7 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, LEMMA, NORM
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
# TODO
|
||||
# treat other apostrophes within words as part of the word: [op d'mannst], [fir d'éischt] (= exceptions)
|
||||
|
@ -47,4 +50,4 @@ for orth in [
|
|||
]:
|
||||
_exc[orth] = [{ORTH: orth}]
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
|
|
|
@ -4,9 +4,8 @@ from thinc.api import Config
|
|||
from .stop_words import STOP_WORDS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_INFIXES
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...util import update_exc, registry
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -22,7 +21,7 @@ def stop_words() -> Set[str]:
|
|||
|
||||
|
||||
class LigurianDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
infixes = TOKENIZER_INFIXES
|
||||
|
||||
|
||||
|
|
|
@ -1,4 +1,7 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, LEMMA
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
_exc = {}
|
||||
|
||||
|
@ -47,4 +50,4 @@ for prep, prep_lemma in [
|
|||
{ORTH: prep, LEMMA: prep_lemma},
|
||||
]
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
|
|
|
@ -5,9 +5,8 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
|||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...util import update_exc, registry
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -39,11 +38,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
|||
class LithuanianDefaults(Language.Defaults):
|
||||
infixes = TOKENIZER_INFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
mod_base_exceptions = {
|
||||
exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
|
||||
}
|
||||
del mod_base_exceptions["8)"]
|
||||
tokenizer_exceptions = update_exc(mod_base_exceptions, TOKENIZER_EXCEPTIONS)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
|
||||
|
||||
class Lithuanian(Language):
|
||||
|
|
|
@ -1,267 +1,15 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
_exc = {}
|
||||
|
||||
for orth in [
|
||||
"n-tosios",
|
||||
"?!",
|
||||
# "G.",
|
||||
# "J. E.",
|
||||
# "J. Em.",
|
||||
# "J.E.",
|
||||
# "J.Em.",
|
||||
# "K.",
|
||||
# "N.",
|
||||
# "V.",
|
||||
# "Vt.",
|
||||
# "a.",
|
||||
# "a.k.",
|
||||
# "a.s.",
|
||||
# "adv.",
|
||||
# "akad.",
|
||||
# "aklg.",
|
||||
# "akt.",
|
||||
# "al.",
|
||||
# "ang.",
|
||||
# "angl.",
|
||||
# "aps.",
|
||||
# "apskr.",
|
||||
# "apyg.",
|
||||
# "arbat.",
|
||||
# "asist.",
|
||||
# "asm.",
|
||||
# "asm.k.",
|
||||
# "asmv.",
|
||||
# "atk.",
|
||||
# "atsak.",
|
||||
# "atsisk.",
|
||||
# "atsisk.sąsk.",
|
||||
# "atv.",
|
||||
# "aut.",
|
||||
# "avd.",
|
||||
# "b.k.",
|
||||
# "baud.",
|
||||
# "biol.",
|
||||
# "bkl.",
|
||||
# "bot.",
|
||||
# "bt.",
|
||||
# "buv.",
|
||||
# "ch.",
|
||||
# "chem.",
|
||||
# "corp.",
|
||||
# "d.",
|
||||
# "dab.",
|
||||
# "dail.",
|
||||
# "dek.",
|
||||
# "deš.",
|
||||
# "dir.",
|
||||
# "dirig.",
|
||||
# "doc.",
|
||||
# "dol.",
|
||||
# "dr.",
|
||||
# "drp.",
|
||||
# "dvit.",
|
||||
# "dėst.",
|
||||
# "dš.",
|
||||
# "dž.",
|
||||
# "e.b.",
|
||||
# "e.bankas",
|
||||
# "e.p.",
|
||||
# "e.parašas",
|
||||
# "e.paštas",
|
||||
# "e.v.",
|
||||
# "e.valdžia",
|
||||
# "egz.",
|
||||
# "eil.",
|
||||
# "ekon.",
|
||||
# "el.",
|
||||
# "el.bankas",
|
||||
# "el.p.",
|
||||
# "el.parašas",
|
||||
# "el.paštas",
|
||||
# "el.valdžia",
|
||||
# "etc.",
|
||||
# "ež.",
|
||||
# "fak.",
|
||||
# "faks.",
|
||||
# "feat.",
|
||||
# "filol.",
|
||||
# "filos.",
|
||||
# "g.",
|
||||
# "gen.",
|
||||
# "geol.",
|
||||
# "gerb.",
|
||||
# "gim.",
|
||||
# "gr.",
|
||||
# "gv.",
|
||||
# "gyd.",
|
||||
# "gyv.",
|
||||
# "habil.",
|
||||
# "inc.",
|
||||
# "insp.",
|
||||
# "inž.",
|
||||
# "ir pan.",
|
||||
# "ir t. t.",
|
||||
# "isp.",
|
||||
# "istor.",
|
||||
# "it.",
|
||||
# "just.",
|
||||
# "k.",
|
||||
# "k. a.",
|
||||
# "k.a.",
|
||||
# "kab.",
|
||||
# "kand.",
|
||||
# "kart.",
|
||||
# "kat.",
|
||||
# "ketv.",
|
||||
# "kh.",
|
||||
# "kl.",
|
||||
# "kln.",
|
||||
# "km.",
|
||||
# "kn.",
|
||||
# "koresp.",
|
||||
# "kpt.",
|
||||
# "kr.",
|
||||
# "kt.",
|
||||
# "kub.",
|
||||
# "kun.",
|
||||
# "kv.",
|
||||
# "kyš.",
|
||||
# "l. e. p.",
|
||||
# "l.e.p.",
|
||||
# "lenk.",
|
||||
# "liet.",
|
||||
# "lot.",
|
||||
# "lt.",
|
||||
# "ltd.",
|
||||
# "ltn.",
|
||||
# "m.",
|
||||
# "m.e..",
|
||||
# "m.m.",
|
||||
# "mat.",
|
||||
# "med.",
|
||||
# "mgnt.",
|
||||
# "mgr.",
|
||||
# "min.",
|
||||
# "mjr.",
|
||||
# "ml.",
|
||||
# "mln.",
|
||||
# "mlrd.",
|
||||
# "mob.",
|
||||
# "mok.",
|
||||
# "moksl.",
|
||||
# "mokyt.",
|
||||
# "mot.",
|
||||
# "mr.",
|
||||
# "mst.",
|
||||
# "mstl.",
|
||||
# "mėn.",
|
||||
# "nkt.",
|
||||
# "no.",
|
||||
# "nr.",
|
||||
# "ntk.",
|
||||
# "nuotr.",
|
||||
# "op.",
|
||||
# "org.",
|
||||
# "orig.",
|
||||
# "p.",
|
||||
# "p.d.",
|
||||
# "p.m.e.",
|
||||
# "p.s.",
|
||||
# "pab.",
|
||||
# "pan.",
|
||||
# "past.",
|
||||
# "pav.",
|
||||
# "pavad.",
|
||||
# "per.",
|
||||
# "perd.",
|
||||
# "pirm.",
|
||||
# "pl.",
|
||||
# "plg.",
|
||||
# "plk.",
|
||||
# "pr.",
|
||||
# "pr.Kr.",
|
||||
# "pranc.",
|
||||
# "proc.",
|
||||
# "prof.",
|
||||
# "prom.",
|
||||
# "prot.",
|
||||
# "psl.",
|
||||
# "pss.",
|
||||
# "pvz.",
|
||||
# "pšt.",
|
||||
# "r.",
|
||||
# "raj.",
|
||||
# "red.",
|
||||
# "rez.",
|
||||
# "rež.",
|
||||
# "rus.",
|
||||
# "rš.",
|
||||
# "s.",
|
||||
# "sav.",
|
||||
# "saviv.",
|
||||
# "sek.",
|
||||
# "sekr.",
|
||||
# "sen.",
|
||||
# "sh.",
|
||||
# "sk.",
|
||||
# "skg.",
|
||||
# "skv.",
|
||||
# "skyr.",
|
||||
# "sp.",
|
||||
# "spec.",
|
||||
# "sr.",
|
||||
# "st.",
|
||||
# "str.",
|
||||
# "stud.",
|
||||
# "sąs.",
|
||||
# "t.",
|
||||
# "t. p.",
|
||||
# "t. y.",
|
||||
# "t.p.",
|
||||
# "t.t.",
|
||||
# "t.y.",
|
||||
# "techn.",
|
||||
# "tel.",
|
||||
# "teol.",
|
||||
# "th.",
|
||||
# "tir.",
|
||||
# "trit.",
|
||||
# "trln.",
|
||||
# "tšk.",
|
||||
# "tūks.",
|
||||
# "tūkst.",
|
||||
# "up.",
|
||||
# "upl.",
|
||||
# "v.s.",
|
||||
# "vad.",
|
||||
# "val.",
|
||||
# "valg.",
|
||||
# "ved.",
|
||||
# "vert.",
|
||||
# "vet.",
|
||||
# "vid.",
|
||||
# "virš.",
|
||||
# "vlsč.",
|
||||
# "vnt.",
|
||||
# "vok.",
|
||||
# "vs.",
|
||||
# "vtv.",
|
||||
# "vv.",
|
||||
# "vyr.",
|
||||
# "vyresn.",
|
||||
# "zool.",
|
||||
# "Įn",
|
||||
# "įl.",
|
||||
# "š.m.",
|
||||
# "šnek.",
|
||||
# "šv.",
|
||||
# "švč.",
|
||||
# "ž.ū.",
|
||||
# "žin.",
|
||||
# "žml.",
|
||||
# "žr.",
|
||||
]:
|
||||
for orth in ["n-tosios", "?!"]:
|
||||
_exc[orth] = [{ORTH: orth}]
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
mod_base_exceptions = {
|
||||
exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
|
||||
}
|
||||
del mod_base_exceptions["8)"]
|
||||
TOKENIZER_EXCEPTIONS = update_exc(mod_base_exceptions, _exc)
|
||||
|
|
|
@ -1,20 +1,20 @@
|
|||
from typing import Set
|
||||
from typing import Set, Callable
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from .syntax_iterators import noun_chunks
|
||||
from ...language import Language
|
||||
from ...util import update_exc, registry
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "nb"
|
||||
stop_words = {"@language_data": "spacy.nb.stop_words"}
|
||||
get_noun_chunks = {"@language_data": "spacy.nb.get_noun_chunks"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
@ -31,12 +31,16 @@ def stop_words() -> Set[str]:
|
|||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.nb.get_noun_chunks")
|
||||
def get_noun_chunks() -> Callable:
|
||||
return noun_chunks
|
||||
|
||||
|
||||
class NorwegianDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
|
||||
|
||||
class Norwegian(Language):
|
||||
|
|
|
@ -1,26 +1,18 @@
|
|||
from typing import Union, Iterator
|
||||
|
||||
from ...symbols import NOUN, PROPN, PRON
|
||||
from ...errors import Errors
|
||||
from ...tokens import Doc, Span
|
||||
|
||||
|
||||
def noun_chunks(doclike):
|
||||
"""
|
||||
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||
"""
|
||||
labels = [
|
||||
"nsubj",
|
||||
"nsubj:pass",
|
||||
"obj",
|
||||
"iobj",
|
||||
"ROOT",
|
||||
"appos",
|
||||
"nmod",
|
||||
"nmod:poss",
|
||||
]
|
||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
||||
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
||||
# fmt: off
|
||||
labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
|
||||
# fmt: on
|
||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||
|
||||
if not doc.is_parsed:
|
||||
raise ValueError(Errors.E029)
|
||||
|
||||
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||
conj = doc.vocab.strings.add("conj")
|
||||
np_label = doc.vocab.strings.add("NP")
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, LEMMA
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
_exc = {}
|
||||
|
@ -218,4 +220,4 @@ for orth in [
|
|||
_exc[orth] = [{ORTH: orth}]
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
|
|
|
@ -7,9 +7,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
from .lemmatizer import DutchLemmatizer
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...util import update_exc, registry
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -44,7 +43,7 @@ def create_dutch_lemmatizer(data: Dict[str, dict] = {}) -> DutchLemmatizer:
|
|||
|
||||
|
||||
class DutchDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
|
|
|
@ -1,4 +1,7 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
# Extensive list of both common and uncommon Dutch abbreviations copied from
|
||||
# github.com/diasks2/pragmatic_segmenter, a Ruby library for rule-based
|
||||
|
@ -1602,4 +1605,4 @@ for orth in abbrevs:
|
|||
_exc[i] = [{ORTH: i}]
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
|
|
|
@ -4,10 +4,9 @@ from thinc.api import Config
|
|||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
|
||||
from ...language import Language
|
||||
from ...util import update_exc, registry
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -37,7 +36,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
|||
|
||||
|
||||
class PortugueseDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
infixes = TOKENIZER_INFIXES
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
_exc = {}
|
||||
|
@ -50,4 +52,4 @@ for orth in [
|
|||
_exc[orth] = [{ORTH: orth}]
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
|
|
|
@ -5,9 +5,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|||
from .stop_words import STOP_WORDS
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...util import update_exc, registry
|
||||
from ...util import registry
|
||||
|
||||
# Lemma data note:
|
||||
# Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/
|
||||
|
@ -35,7 +34,7 @@ def stop_words() -> Set[str]:
|
|||
|
||||
|
||||
class RomanianDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH
|
||||
from ...util import update_exc
|
||||
from .punctuation import _make_ro_variants
|
||||
|
||||
|
||||
|
@ -91,4 +93,4 @@ for orth in [
|
|||
_exc[variant] = [{ORTH: variant}]
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
|
|
|
@ -5,8 +5,7 @@ from .stop_words import STOP_WORDS
|
|||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .lemmatizer import RussianLemmatizer
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...util import update_exc, registry
|
||||
from ...util import registry
|
||||
from ...language import Language
|
||||
|
||||
|
||||
|
@ -42,7 +41,7 @@ def create_russian_lemmatizer() -> RussianLemmatizer:
|
|||
|
||||
|
||||
class RussianDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
|
||||
|
||||
class Russian(Language):
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, LEMMA, NORM
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
_exc = {}
|
||||
|
@ -63,4 +65,4 @@ for slang_desc in _slang_exc:
|
|||
_exc[slang_desc[ORTH]] = [slang_desc]
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
|
|
|
@ -4,9 +4,8 @@ from thinc.api import Config
|
|||
from .stop_words import STOP_WORDS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...util import update_exc, registry
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -41,7 +40,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
|||
|
||||
|
||||
class SerbianDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
|
||||
|
||||
class Serbian(Language):
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, LEMMA, NORM
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
_exc = {}
|
||||
|
@ -90,4 +92,4 @@ for slang_desc in _slang_exc:
|
|||
_exc[slang_desc[ORTH]] = [slang_desc]
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
|
|
|
@ -4,10 +4,9 @@ from thinc.api import Config
|
|||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...util import update_exc, registry
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ...util import registry
|
||||
from .syntax_iterators import noun_chunks
|
||||
|
||||
# Punctuation stolen from Danish
|
||||
from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||
|
@ -18,6 +17,7 @@ DEFAULT_CONFIG = """
|
|||
lang = "sv"
|
||||
stop_words = {"@language_data": "spacy.sv.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.sv.lex_attr_getters"}
|
||||
get_noun_chunks = {"@language_data": "spacy.sv.get_noun_chunks"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
@ -39,11 +39,15 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
|||
return LEX_ATTRS
|
||||
|
||||
|
||||
@registry.language_data("spacy.sv.get_noun_chunks")
|
||||
def get_noun_chunks() -> Callable:
|
||||
return noun_chunks
|
||||
|
||||
|
||||
class SwedishDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
infixes = TOKENIZER_INFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
|
||||
|
||||
class Swedish(Language):
|
||||
|
|
|
@ -1,27 +1,18 @@
|
|||
from typing import Union, Iterator
|
||||
|
||||
from ...symbols import NOUN, PROPN, PRON
|
||||
from ...errors import Errors
|
||||
from ...tokens import Doc, Span
|
||||
|
||||
|
||||
def noun_chunks(doclike):
|
||||
"""
|
||||
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||
"""
|
||||
labels = [
|
||||
"nsubj",
|
||||
"nsubj:pass",
|
||||
"dobj",
|
||||
"obj",
|
||||
"iobj",
|
||||
"ROOT",
|
||||
"appos",
|
||||
"nmod",
|
||||
"nmod:poss",
|
||||
]
|
||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
||||
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
||||
# fmt: off
|
||||
labels = ["nsubj", "nsubj:pass", "dobj", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
|
||||
# fmt: on
|
||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||
|
||||
if not doc.is_parsed:
|
||||
raise ValueError(Errors.E029)
|
||||
|
||||
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||
conj = doc.vocab.strings.add("conj")
|
||||
np_label = doc.vocab.strings.add("NP")
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import LEMMA, NORM, ORTH, PRON_LEMMA
|
||||
from ...util import update_exc
|
||||
|
||||
_exc = {}
|
||||
|
||||
|
@ -154,4 +156,4 @@ for orth in ABBREVIATIONS:
|
|||
for orth in ["i", "m"]:
|
||||
_exc[orth + "."] = [{ORTH: orth, LEMMA: orth, NORM: orth}, {ORTH: "."}]
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
|
|
|
@ -1,25 +0,0 @@
|
|||
from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ
|
||||
from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ
|
||||
|
||||
|
||||
TAG_MAP = {
|
||||
"ADV": {POS: ADV},
|
||||
"NOUN": {POS: NOUN},
|
||||
"ADP": {POS: ADP},
|
||||
"PRON": {POS: PRON},
|
||||
"SCONJ": {POS: SCONJ},
|
||||
"PROPN": {POS: PROPN},
|
||||
"DET": {POS: DET},
|
||||
"SYM": {POS: SYM},
|
||||
"INTJ": {POS: INTJ},
|
||||
"PUNCT": {POS: PUNCT},
|
||||
"NUM": {POS: NUM},
|
||||
"AUX": {POS: AUX},
|
||||
"X": {POS: X},
|
||||
"CONJ": {POS: CONJ},
|
||||
"CCONJ": {POS: CCONJ},
|
||||
"ADJ": {POS: ADJ},
|
||||
"VERB": {POS: VERB},
|
||||
"PART": {POS: PART},
|
||||
"_SP": {POS: SPACE},
|
||||
}
|
|
@ -4,9 +4,8 @@ from thinc.api import Config
|
|||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...util import update_exc, registry
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -36,7 +35,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
|||
|
||||
|
||||
class TagalogDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
|
||||
|
||||
class Tagalog(Language):
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, LEMMA
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
_exc = {
|
||||
|
@ -14,4 +16,4 @@ _exc = {
|
|||
}
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
|
|
|
@ -55,7 +55,6 @@ URL_PATTERN = (
|
|||
# fmt: on
|
||||
).strip()
|
||||
|
||||
TOKEN_MATCH = None
|
||||
URL_MATCH = re.compile("(?u)" + URL_PATTERN).match
|
||||
|
||||
|
||||
|
|
|
@ -3,9 +3,8 @@ from thinc.api import Config
|
|||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...util import update_exc, registry
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -29,7 +28,7 @@ def stop_words() -> Set[str]:
|
|||
|
||||
|
||||
class TurkishDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
|
||||
|
||||
class Turkish(Language):
|
||||
|
|
|
@ -1,4 +1,7 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, NORM
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
_exc = {"sağol": [{ORTH: "sağ"}, {ORTH: "ol", NORM: "olun"}]}
|
||||
|
||||
|
@ -113,4 +116,4 @@ for orth in ["Dr.", "yy."]:
|
|||
_exc[orth] = [{ORTH: orth}]
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
|
|
|
@ -5,9 +5,8 @@ from .lex_attrs import LEX_ATTRS
|
|||
from .punctuation import TOKENIZER_INFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...util import update_exc, registry
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -29,7 +28,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
|||
|
||||
|
||||
class TatarDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
infixes = tuple(TOKENIZER_INFIXES)
|
||||
|
||||
|
||||
|
|
|
@ -1,4 +1,7 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, LEMMA, NORM
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
_exc = {}
|
||||
|
||||
|
@ -43,4 +46,4 @@ for exc_data in [ # "etc." abbreviations
|
|||
exc_data[LEMMA] = exc_data[NORM]
|
||||
_exc[exc_data[ORTH]] = [exc_data]
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
|
|
|
@ -4,8 +4,7 @@ from thinc.api import Config
|
|||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...util import update_exc, registry
|
||||
from ...util import registry
|
||||
from ...language import Language
|
||||
from .lemmatizer import UkrainianLemmatizer
|
||||
|
||||
|
@ -37,7 +36,7 @@ def create_ukrainian_lemmatizer() -> UkrainianLemmatizer:
|
|||
|
||||
|
||||
class UkrainianDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
|
||||
|
||||
class Ukrainian(Language):
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, LEMMA, POS, NORM, NOUN
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
_exc = {}
|
||||
|
@ -21,4 +23,4 @@ for exc_data in [
|
|||
_exc[exc_data[ORTH]] = [exc_data]
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
|
|
|
@ -24,9 +24,7 @@ from .util import link_vectors_to_models, create_default_optimizer, registry
|
|||
from .util import SimpleFrozenDict
|
||||
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||
from .lang.punctuation import TOKENIZER_INFIXES
|
||||
from .lang.tokenizer_exceptions import TOKEN_MATCH, URL_MATCH
|
||||
from .lang.tag_map import TAG_MAP
|
||||
from .tokens import Doc, Span
|
||||
from .tokens import Doc
|
||||
from .errors import Errors, Warnings
|
||||
from .schemas import ConfigSchema
|
||||
from .git_info import GIT_VERSION
|
||||
|
@ -37,6 +35,7 @@ from . import about
|
|||
from .tokenizer import Tokenizer # noqa: F401
|
||||
from .lemmatizer import Lemmatizer # noqa: F401
|
||||
from .lookups import Lookups # noqa: F401
|
||||
from .lang import defaults # noqa: F401
|
||||
|
||||
|
||||
ENABLE_PIPELINE_ANALYSIS = False
|
||||
|
@ -46,15 +45,10 @@ DEFAULT_CONFIG = Config().from_disk(DEFAULT_CONFIG_PATH)
|
|||
|
||||
|
||||
class BaseDefaults:
|
||||
token_match: Optional[Pattern] = TOKEN_MATCH
|
||||
url_match: Pattern = URL_MATCH
|
||||
prefixes: Tuple[Pattern, ...] = tuple(TOKENIZER_PREFIXES)
|
||||
suffixes: Tuple[Pattern, ...] = tuple(TOKENIZER_SUFFIXES)
|
||||
infixes: Tuple[Pattern, ...] = tuple(TOKENIZER_INFIXES)
|
||||
tag_map: Dict[str, dict] = dict(TAG_MAP)
|
||||
tokenizer_exceptions: Dict[str, List[dict]] = {}
|
||||
morph_rules: Dict[str, Dict[str, dict]] = {}
|
||||
syntax_iterators: Dict[str, Callable[[Union[Doc, Span]], Iterator]] = {}
|
||||
|
||||
|
||||
class Language:
|
||||
|
@ -114,13 +108,7 @@ class Language:
|
|||
|
||||
if vocab is True:
|
||||
vectors_name = meta.get("vectors", {}).get("name")
|
||||
vocab = Vocab.from_config(
|
||||
self._config,
|
||||
vectors_name=vectors_name,
|
||||
# TODO: what should we do with these?
|
||||
tag_map=self.Defaults.tag_map,
|
||||
morph_rules=self.Defaults.morph_rules,
|
||||
)
|
||||
vocab = Vocab.from_config(self._config, vectors_name=vectors_name)
|
||||
else:
|
||||
if (self.lang and vocab.lang) and (self.lang != vocab.lang):
|
||||
raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
|
||||
|
@ -1267,15 +1255,14 @@ class Language:
|
|||
lex_attr_getters = resolved["nlp"]["lex_attr_getters"]
|
||||
stop_words = resolved["nlp"]["stop_words"]
|
||||
vocab_data = resolved["nlp"]["vocab_data"]
|
||||
get_noun_chunks = resolved["nlp"]["get_noun_chunks"]
|
||||
vocab = Vocab.from_config(
|
||||
filled,
|
||||
lemmatizer=lemmatizer,
|
||||
lex_attr_getters=lex_attr_getters,
|
||||
stop_words=stop_words,
|
||||
vocab_data=vocab_data,
|
||||
# TODO: what should we do with these?
|
||||
tag_map=cls.Defaults.tag_map,
|
||||
morph_rules=cls.Defaults.morph_rules,
|
||||
get_noun_chunks=get_noun_chunks,
|
||||
)
|
||||
nlp = cls(vocab, create_tokenizer=create_tokenizer)
|
||||
pipeline = config.get("components", {})
|
||||
|
|
|
@ -243,6 +243,7 @@ class ConfigSchemaNlp(BaseModel):
|
|||
stop_words: Sequence[StrictStr] = Field(..., title="Stop words to mark via Token/Lexeme.is_stop")
|
||||
lex_attr_getters: Dict[StrictStr, Callable] = Field(..., title="Custom getter functions for lexical attributes (e.g. like_num)")
|
||||
vocab_data: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., title="Vocabulary data, e.g. lexeme normalization tables")
|
||||
get_noun_chunks: Optional[Callable] = Field(..., title="Function to extract noun phrases from a Doc")
|
||||
# fmt: on
|
||||
|
||||
class Config:
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import numpy
|
||||
from spacy.attrs import HEAD, DEP
|
||||
from spacy.symbols import nsubj, dobj, amod, nmod, conj, cc, root
|
||||
from spacy.lang.en.syntax_iterators import SYNTAX_ITERATORS
|
||||
from spacy.lang.en.syntax_iterators import noun_chunks
|
||||
|
||||
import pytest
|
||||
|
||||
|
@ -41,7 +41,7 @@ def test_en_noun_chunks_not_nested(en_vocab):
|
|||
dtype="uint64",
|
||||
),
|
||||
)
|
||||
doc.noun_chunks_iterator = SYNTAX_ITERATORS["noun_chunks"]
|
||||
doc.noun_chunks_iterator = noun_chunks
|
||||
word_occurred = {}
|
||||
for chunk in doc.noun_chunks:
|
||||
for word in chunk:
|
||||
|
|
|
@ -9,7 +9,7 @@ from cymem.cymem cimport Pool
|
|||
from preshed.maps cimport PreshMap
|
||||
cimport cython
|
||||
|
||||
from typing import Dict, List, Union, Pattern, Optional
|
||||
from typing import Dict, List, Union, Pattern, Optional, Any
|
||||
import re
|
||||
import warnings
|
||||
|
||||
|
@ -32,16 +32,16 @@ def create_tokenizer(
|
|||
# prefixes: Optional[List[Union[str, Pattern]]],
|
||||
# suffixes: Optional[List[Union[str, Pattern]]],
|
||||
# infixes: Optional[List[Union[str, Pattern]]],
|
||||
# token_match: Optional[Pattern],
|
||||
# url_match: Optional[Pattern],
|
||||
# We currently can't validate against Pattern because that will cause
|
||||
# Pydantic to parse value *as* pattern
|
||||
token_match: Optional[Any] = None,
|
||||
url_match: Optional[Any] = None,
|
||||
) -> "Tokenizer":
|
||||
def tokenizer_factory(nlp):
|
||||
exceptions = nlp.Defaults.tokenizer_exceptions
|
||||
prefixes = nlp.Defaults.prefixes
|
||||
suffixes = nlp.Defaults.suffixes
|
||||
infixes = nlp.Defaults.infixes
|
||||
url_match = nlp.Defaults.url_match
|
||||
token_match = nlp.Defaults.token_match
|
||||
prefix_search = util.compile_prefix_regex(prefixes).search if prefixes else None
|
||||
suffix_search = util.compile_suffix_regex(suffixes).search if suffixes else None
|
||||
infix_finditer = util.compile_infix_regex(infixes).finditer if infixes else None
|
||||
|
|
|
@ -89,16 +89,6 @@ cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name)
|
|||
return get_token_attr(token, feat_name)
|
||||
|
||||
|
||||
def _get_chunker(lang):
|
||||
try:
|
||||
cls = util.get_lang_class(lang)
|
||||
except ImportError:
|
||||
return None
|
||||
except KeyError:
|
||||
return None
|
||||
return cls.Defaults.syntax_iterators.get("noun_chunks")
|
||||
|
||||
|
||||
cdef class Doc:
|
||||
"""A sequence of Token objects. Access sentences and named entities, export
|
||||
annotations to numpy arrays, losslessly serialize to compressed binary
|
||||
|
@ -212,7 +202,7 @@ cdef class Doc:
|
|||
self.tensor = numpy.zeros((0,), dtype="float32")
|
||||
self.user_data = {} if user_data is None else user_data
|
||||
self._vector = None
|
||||
self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
|
||||
self.noun_chunks_iterator = self.vocab.get_noun_chunks
|
||||
cdef bint has_space
|
||||
if words is None and spaces is not None:
|
||||
raise ValueError("words must be set if spaces is set")
|
||||
|
|
|
@ -30,6 +30,7 @@ cdef class Vocab:
|
|||
cpdef public object vectors
|
||||
cpdef public object lookups
|
||||
cpdef public object writing_system
|
||||
cpdef public object get_noun_chunks
|
||||
cdef readonly int length
|
||||
cdef public object data_dir
|
||||
cdef public object lex_attr_getters
|
||||
|
|
|
@ -30,10 +30,10 @@ cdef class Vocab:
|
|||
|
||||
DOCS: https://spacy.io/api/vocab
|
||||
"""
|
||||
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
|
||||
strings=tuple(), lookups=None, vocab_data={},
|
||||
def __init__(self, lex_attr_getters=None, lemmatizer=None,
|
||||
strings=tuple(), lookups=None, tag_map={}, vocab_data={},
|
||||
oov_prob=-20., vectors_name=None, writing_system={},
|
||||
**deprecated_kwargs):
|
||||
get_noun_chunks=None, **deprecated_kwargs):
|
||||
"""Create the vocabulary.
|
||||
|
||||
lex_attr_getters (dict): A dictionary mapping attribute IDs to
|
||||
|
@ -49,7 +49,6 @@ cdef class Vocab:
|
|||
RETURNS (Vocab): The newly constructed object.
|
||||
"""
|
||||
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
|
||||
tag_map = tag_map if tag_map is not None else {}
|
||||
if lookups in (None, True, False):
|
||||
lookups = Lookups()
|
||||
for name, data in vocab_data.items():
|
||||
|
@ -71,6 +70,7 @@ cdef class Vocab:
|
|||
self.vectors = Vectors(name=vectors_name)
|
||||
self.lookups = lookups
|
||||
self.writing_system = writing_system
|
||||
self.get_noun_chunks = get_noun_chunks
|
||||
|
||||
@property
|
||||
def lang(self):
|
||||
|
@ -424,9 +424,8 @@ cdef class Vocab:
|
|||
lex_attr_getters=None,
|
||||
stop_words=None,
|
||||
vocab_data=None,
|
||||
get_noun_chunks=None,
|
||||
vectors_name=None,
|
||||
tag_map=None,
|
||||
morph_rules=None
|
||||
):
|
||||
"""Create a Vocab from a config and (currently) language defaults, i.e.
|
||||
nlp.Defaults.
|
||||
|
@ -449,6 +448,9 @@ cdef class Vocab:
|
|||
if vocab_data is None:
|
||||
vocab_data_cfg = {"vocab_data": config["nlp"]["vocab_data"]}
|
||||
vocab_data = registry.make_from_config(vocab_data_cfg)["vocab_data"]
|
||||
if get_noun_chunks is None:
|
||||
noun_chunks_cfg = {"get_noun_chunks": config["nlp"]["get_noun_chunks"]}
|
||||
get_noun_chunks = registry.make_from_config(noun_chunks_cfg)["get_noun_chunks"]
|
||||
if lex_attr_getters is None:
|
||||
lex_attrs_cfg = {"lex_attr_getters": config["nlp"]["lex_attr_getters"]}
|
||||
lex_attr_getters = registry.make_from_config(lex_attrs_cfg)["lex_attr_getters"]
|
||||
|
@ -468,10 +470,8 @@ cdef class Vocab:
|
|||
vocab_data=vocab_data,
|
||||
lemmatizer=lemmatizer,
|
||||
writing_system=writing_system,
|
||||
tag_map=tag_map,
|
||||
get_noun_chunks=get_noun_chunks
|
||||
)
|
||||
if morph_rules is not None:
|
||||
vocab.morphology.load_morph_exceptions(morph_rules)
|
||||
if vocab.vectors.name is None and vectors_name:
|
||||
vocab.vectors.name = vectors_name
|
||||
return vocab
|
||||
|
|
Loading…
Reference in New Issue
Block a user