Tidy up and move noun_chunks, token_match, url_match
This commit is contained in: parent 7fc4dadd22, commit b507f61629

@@ -3,10 +3,13 @@ lang = null
 stop_words = []
 lex_attr_getters = {}
 vocab_data = {}
+get_noun_chunks = null
 pipeline = []

 [nlp.tokenizer]
 @tokenizers = "spacy.Tokenizer.v1"
+token_match = null
+url_match = {"@language_data": "spacy.xx.url_match"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"
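
The new get_noun_chunks, token_match and url_match settings are filled in through references like {"@language_data": "spacy.xx.url_match"}, which the config system resolves against functions registered under that name. A minimal sketch of that lookup, assuming the dev-state registry.language_data table this commit uses (the "my.example.stop_words" name is made up for illustration):

# Sketch only: assumes the registry.language_data table used in this branch.
from spacy.util import registry


@registry.language_data("my.example.stop_words")  # hypothetical registry name
def example_stop_words():
    return {"foo", "bar"}


# The config loader resolves {"@language_data": "my.example.stop_words"}
# to the function registered above and calls it to get the actual data.
func = registry.language_data.get("my.example.stop_words")
assert func() == {"foo", "bar"}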

@@ -4,11 +4,9 @@ from thinc.api import Config
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_SUFFIXES
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
+from ...util import registry


 DEFAULT_CONFIG = """
@@ -35,7 +33,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:


 class ArabicDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     suffixes = TOKENIZER_SUFFIXES


@@ -1,4 +1,6 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH, LEMMA
+from ...util import update_exc


 _exc = {}
@@ -43,4 +45,4 @@ for exc_data in [
 for exc_data in [{LEMMA: "تلفون", ORTH: "ت."}, {LEMMA: "صندوق بريد", ORTH: "ص.ب"}]:
     _exc[exc_data[ORTH]] = [exc_data]

-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
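
Moving update_exc(BASE_EXCEPTIONS, _exc) into each language's tokenizer_exceptions module means the exported TOKENIZER_EXCEPTIONS already contains the shared base exceptions, so the Language.Defaults classes no longer merge them. A rough sketch of what that merge does (the entries below are invented examples, not real exception data):

# Illustrative only: update_exc merges the shared base exceptions with
# language-specific ones; later dicts win on conflicting keys.
from spacy.util import update_exc
from spacy.symbols import ORTH

base = {"e.g.": [{ORTH: "e.g."}]}   # stands in for BASE_EXCEPTIONS
specific = {"ت.": [{ORTH: "ت."}]}   # stands in for a language's _exc
merged = update_exc(base, specific)
assert "e.g." in merged and "ت." in merged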

@@ -4,9 +4,8 @@ from thinc.api import Config
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
+from ...util import registry


 DEFAULT_CONFIG = """
@@ -30,7 +29,7 @@ def stop_words() -> Set[str]:


 class BengaliDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES

@@ -1,4 +1,6 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH, LEMMA
+from ...util import update_exc


 _exc = {}
@@ -21,4 +23,4 @@ for exc_data in [
     _exc[exc_data[ORTH]] = [exc_data]


-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

@@ -4,9 +4,8 @@ from thinc.api import Config
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
+from ...util import registry
 from .punctuation import TOKENIZER_INFIXES


@@ -37,7 +36,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:


 class CatalanDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES


@@ -1,4 +1,6 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH, LEMMA
+from ...util import update_exc


 _exc = {}
@@ -35,4 +37,4 @@ for h in range(1, 12 + 1):
        _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "p.m."}]


-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

@@ -5,9 +5,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
+from ...util import registry


 DEFAULT_CONFIG = """
@@ -42,7 +41,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:


 class DanishDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES


@@ -2,7 +2,9 @@
 Tokenizer Exceptions.
 Source: https://forkortelse.dk/ and various others.
 """
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH, LEMMA, NORM
+from ...util import update_exc


 _exc = {}
@@ -576,4 +578,4 @@ for h in range(1, 31 + 1):
 _custom_base_exc = {"i.": [{ORTH: "i", LEMMA: "i", NORM: "i"}, {ORTH: "."}]}
 _exc.update(_custom_base_exc)

-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

@@ -1,20 +1,20 @@
-from typing import Set
+from typing import Set, Callable
 from thinc.api import Config

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .punctuation import TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
-from .syntax_iterators import SYNTAX_ITERATORS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from .syntax_iterators import noun_chunks
 from ...language import Language
-from ...util import update_exc, registry
+from ...util import registry


 DEFAULT_CONFIG = """
 [nlp]
 lang = "de"
 stop_words = {"@language_data": "spacy.de.stop_words"}
+get_noun_chunks = {"@language_data": "spacy.de.get_noun_chunks"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"
@@ -36,12 +36,16 @@ def stop_words() -> Set[str]:
     return STOP_WORDS


+@registry.language_data("spacy.de.get_noun_chunks")
+def get_noun_chunks() -> Callable:
+    return noun_chunks
+
+
 class GermanDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
-    syntax_iterators = SYNTAX_ITERATORS


 class German(Language):

@@ -1,39 +1,26 @@
+from typing import Union, Iterator
+
 from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
+from ...tokens import Doc, Span


-def noun_chunks(doclike):
-    """
-    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
-    """
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
+    """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
     # this iterator extracts spans headed by NOUNs starting from the left-most
     # syntactic dependent until the NOUN itself for close apposition and
     # measurement construction, the span is sometimes extended to the right of
     # the NOUN. Example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee"
     # and not just "eine Tasse", same for "das Thema Familie".
-    labels = [
-        "sb",
-        "oa",
-        "da",
-        "nk",
-        "mo",
-        "ag",
-        "ROOT",
-        "root",
-        "cj",
-        "pd",
-        "og",
-        "app",
-    ]
+    # fmt: off
+    labels = ["sb", "oa", "da", "nk", "mo", "ag", "ROOT", "root", "cj", "pd", "og", "app"]
+    # fmt: on
     doc = doclike.doc  # Ensure works on both Doc and Span.
-
     if not doc.is_parsed:
         raise ValueError(Errors.E029)
-
     np_label = doc.vocab.strings.add("NP")
     np_deps = set(doc.vocab.strings.add(label) for label in labels)
     close_app = doc.vocab.strings.add("nk")
-
     rbracket = 0
     for i, word in enumerate(doclike):
         if i < rbracket:
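
For callers nothing changes: the registered get_noun_chunks function still backs Doc.noun_chunks. A hedged usage sketch, assuming a trained German pipeline with a parser is installed (the model name is an assumption):

# Usage sketch: noun_chunks is consumed through the usual Doc.noun_chunks API.
import spacy

nlp = spacy.load("de_core_news_sm")  # assumed to be installed
doc = nlp("Eine Tasse Tee steht auf dem Tisch.")
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.dep_)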

@@ -1,4 +1,6 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
+from ...util import update_exc


 _exc = {
@@ -254,4 +256,4 @@ for orth in [
     _exc[orth] = [{ORTH: orth}]


-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

spacy/lang/defaults.py (new file, +9)
@@ -0,0 +1,9 @@
+from typing import Pattern
+
+from .tokenizer_exceptions import URL_MATCH
+from ..util import registry
+
+
+@registry.language_data("spacy.xx.url_match")
+def url_match() -> Pattern:
+    return URL_MATCH
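
URL_MATCH is the bound .match of a compiled URL regex, so the shared spacy.xx.url_match default simply hands that callable to the tokenizer, which keeps anything it matches as a single token. A small sanity-check sketch:

# Sketch: URL_MATCH returns a match object for URL-like strings, else None.
from spacy.lang.tokenizer_exceptions import URL_MATCH

assert URL_MATCH("https://example.com") is not None
assert URL_MATCH("notaurl") is None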

@@ -5,11 +5,10 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import GreekLemmatizer
-from .syntax_iterators import SYNTAX_ITERATORS
+from .syntax_iterators import noun_chunks
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
+from ...util import registry


 DEFAULT_CONFIG = """
@@ -17,6 +16,7 @@ DEFAULT_CONFIG = """
 lang = "el"
 stop_words = {"@language_data": "spacy.el.stop_words"}
 lex_attr_getters = {"@language_data": "spacy.el.lex_attr_getters"}
+get_noun_chunks = {"@language_data": "spacy.el.get_noun_chunks"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.GreekLemmatizer.v1"
@@ -38,6 +38,11 @@ def create_greek_lemmatizer(data: Dict[str, dict] = {}) -> GreekLemmatizer:
     return GreekLemmatizer(data=data)


+@registry.language_data("spacy.el.get_noun_chunks")
+def get_noun_chunks() -> Callable:
+    return noun_chunks
+
+
 @registry.language_data("spacy.el.stop_words")
 def stop_words() -> Set[str]:
     return STOP_WORDS
@@ -49,11 +54,10 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:


 class GreekDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
-    syntax_iterators = SYNTAX_ITERATORS


 class Greek(Language):

@@ -1,21 +1,20 @@
+from typing import Union, Iterator
+
 from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
+from ...tokens import Doc, Span


-def noun_chunks(doclike):
-    """
-    Detect base noun phrases. Works on both Doc and Span.
-    """
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
+    """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
     # It follows the logic of the noun chunks finder of English language,
     # adjusted to some Greek language special characteristics.
     # obj tag corrects some DEP tagger mistakes.
     # Further improvement of the models will eliminate the need for this tag.
     labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"]
     doc = doclike.doc  # Ensure works on both Doc and Span.
-
     if not doc.is_parsed:
         raise ValueError(Errors.E029)
-
     np_deps = [doc.vocab.strings.add(label) for label in labels]
     conj = doc.vocab.strings.add("conj")
     nmod = doc.vocab.strings.add("nmod")

(File diff suppressed because it is too large)

@@ -1,5 +1,6 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH, LEMMA, NORM
+from ...util import update_exc

 _exc = {}

@@ -392,4 +393,4 @@ for orth in [
 ]:
     _exc[orth] = [{ORTH: orth}]

-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

@@ -4,13 +4,12 @@ from thinc.api import Config
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
+from .syntax_iterators import noun_chunks
 from .lemmatizer import is_base_form
 from .punctuation import TOKENIZER_INFIXES
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
 from ...lemmatizer import Lemmatizer
-from ...util import update_exc, registry
+from ...util import registry


 DEFAULT_CONFIG = """
@@ -18,6 +17,7 @@ DEFAULT_CONFIG = """
 lang = "en"
 stop_words = {"@language_data": "spacy.en.stop_words"}
 lex_attr_getters = {"@language_data": "spacy.en.lex_attr_getters"}
+get_noun_chunks = {"@language_data": "spacy.en.get_noun_chunks"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.EnglishLemmatizer.v1"
@@ -49,9 +49,13 @@ def create_lemmatizer(data: Dict[str, dict] = {}) -> "Lemmatizer":
     return Lemmatizer(data=data, is_base_form=is_base_form)


+@registry.language_data("spacy.en.get_noun_chunks")
+def get_noun_chunks() -> Callable:
+    return noun_chunks
+
+
 class EnglishDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    syntax_iterators = SYNTAX_ITERATORS
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES


@@ -1,27 +1,18 @@
+from typing import Union, Iterator
+
 from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
+from ...tokens import Doc, Span


-def noun_chunks(doclike):
-    """
-    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
-    """
-    labels = [
-        "nsubj",
-        "dobj",
-        "nsubjpass",
-        "pcomp",
-        "pobj",
-        "dative",
-        "appos",
-        "attr",
-        "ROOT",
-    ]
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
+    """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
+    # fmt: off
+    labels = ["nsubj", "dobj", "nsubjpass", "pcomp", "pobj", "dative", "appos", "attr", "ROOT"]
+    # fmt: on
     doc = doclike.doc  # Ensure works on both Doc and Span.
-
     if not doc.is_parsed:
         raise ValueError(Errors.E029)
-
     np_deps = [doc.vocab.strings.add(label) for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")

@@ -1,4 +1,6 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
+from ...util import update_exc


 _exc = {}
@@ -555,4 +557,4 @@ for string in _exclude:
     _exc.pop(string)


-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

@@ -4,11 +4,10 @@ from thinc.config import Config
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
+from .syntax_iterators import noun_chunks
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
+from ...util import registry


 DEFAULT_CONFIG = """
@@ -16,6 +15,7 @@ DEFAULT_CONFIG = """
 lang = "es"
 stop_words = {"@language_data": "spacy.es.stop_words"}
 lex_attr_getters = {"@language_data": "spacy.es.lex_attr_getters"}
+get_noun_chunks = {"@language_data": "spacy.es.get_noun_chunks"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"
@@ -32,6 +32,11 @@ tables = ["lexeme_cluster", "lexeme_prob", "lexeme_settings"]
 """


+@registry.language_data("spacy.es.get_noun_chunks")
+def get_noun_chunks() -> Callable:
+    return noun_chunks
+
+
 @registry.language_data("spacy.es.stop_words")
 def stop_words() -> Set[str]:
     return STOP_WORDS
@@ -43,10 +48,9 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:


 class SpanishDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
-    syntax_iterators = SYNTAX_ITERATORS


 class Spanish(Language):

@@ -1,13 +1,15 @@
+from typing import Union, Iterator, Optional, List, Tuple
+
 from ...symbols import NOUN, PROPN, PRON, VERB, AUX
 from ...errors import Errors
+from ...tokens import Doc, Span, Token


-def noun_chunks(doclike):
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
+    """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
     doc = doclike.doc
-
     if not doc.is_parsed:
         raise ValueError(Errors.E029)
-
     if not len(doc):
         return
     np_label = doc.vocab.strings.add("NP")
@@ -28,18 +30,24 @@ def noun_chunks(doclike):
         token = next_token(token)


-def is_verb_token(token):
+def is_verb_token(token: Token) -> bool:
     return token.pos in [VERB, AUX]


-def next_token(token):
+def next_token(token: Token) -> Optional[Token]:
     try:
         return token.nbor()
     except IndexError:
         return None


-def noun_bounds(doc, root, np_left_deps, np_right_deps, stop_deps):
+def noun_bounds(
+    doc: Doc,
+    root: Token,
+    np_left_deps: List[str],
+    np_right_deps: List[str],
+    stop_deps: List[str],
+) -> Tuple[Token, Token]:
     left_bound = root
     for token in reversed(list(root.lefts)):
         if token.dep in np_left_deps:
@@ -50,12 +58,8 @@ def noun_bounds(doc, root, np_left_deps, np_right_deps, stop_deps):
             left, right = noun_bounds(
                 doc, token, np_left_deps, np_right_deps, stop_deps
             )
-            if list(
-                filter(
-                    lambda t: is_verb_token(t) or t.dep in stop_deps,
-                    doc[left_bound.i : right.i],
-                )
-            ):
+            filter_func = lambda t: is_verb_token(t) or t.dep in stop_deps
+            if list(filter(filter_func, doc[left_bound.i : right.i],)):
                 break
             else:
                 right_bound = right

@@ -1,4 +1,6 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH, LEMMA, NORM, PRON_LEMMA
+from ...util import update_exc


 _exc = {
@@ -73,4 +75,4 @@ for orth in [
     _exc[orth] = [{ORTH: orth}]


-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

@@ -2,12 +2,12 @@ from typing import Set, Dict, Callable, Any
 from thinc.api import Config

 from ...language import Language
-from ...util import update_exc, registry
+from ...util import registry
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_SUFFIXES
-from .syntax_iterators import SYNTAX_ITERATORS
+from .syntax_iterators import noun_chunks


 DEFAULT_CONFIG = """
@@ -15,6 +15,7 @@ DEFAULT_CONFIG = """
 lang = "fa"
 stop_words = {"@language_data": "spacy.fa.stop_words"}
 lex_attr_getters = {"@language_data": "spacy.fa.lex_attr_getters"}
+get_noun_chunks = {"@language_data": "spacy.de.get_noun_chunks"}

 [nlp.writing_system]
 direction = "rtl"
@@ -41,10 +42,14 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
     return LEX_ATTRS


+@registry.language_data("spacy.fa.get_noun_chunks")
+def get_noun_chunks() -> Callable:
+    return noun_chunks
+
+
 class PersianDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     suffixes = TOKENIZER_SUFFIXES
-    syntax_iterators = SYNTAX_ITERATORS


 class Persian(Language):

@@ -5,9 +5,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
+from ...util import registry


 DEFAULT_CONFIG = """
@@ -31,7 +30,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
 class FinnishDefaults(Language.Defaults):
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS


 class Finnish(Language):

@@ -1,4 +1,6 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH, LEMMA
+from ...util import update_exc


 _exc = {}
@@ -78,4 +80,4 @@ for exc_data in [
     _exc[exc_data[ORTH]] = [exc_data]


-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

@@ -1,4 +1,4 @@
-from typing import Set, Dict, Callable, Any
+from typing import Set, Dict, Callable, Any, Pattern
 from thinc.api import Config

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
@@ -7,10 +7,9 @@ from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import FrenchLemmatizer, is_base_form
-from .syntax_iterators import SYNTAX_ITERATORS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from .syntax_iterators import noun_chunks
 from ...language import Language
-from ...util import update_exc, registry
+from ...util import registry


 DEFAULT_CONFIG = """
@@ -18,6 +17,11 @@ DEFAULT_CONFIG = """
 lang = "fr"
 stop_words = {"@language_data": "spacy.fr.stop_words"}
 lex_attr_getters = {"@language_data": "spacy.fr.lex_attr_getters"}
+get_noun_chunks = {"@language_data": "spacy.fr.get_noun_chunks"}

+[nlp.tokenizer]
+@tokenizers = "spacy.Tokenizer.v1"
+token_match = {"@language_data": "spacy.fr.token_match"}
+
 [nlp.lemmatizer]
 @lemmatizers = "spacy.FrenchLemmatizer.v1"
@@ -34,6 +38,11 @@ def create_french_lemmatizer(data: Dict[str, dict] = {}) -> FrenchLemmatizer:
     return FrenchLemmatizer(data=data, is_base_form=is_base_form)


+@registry.language_data("spacy.fr.token_match")
+def token_match() -> Pattern:
+    return TOKEN_MATCH
+
+
 @registry.language_data("spacy.fr.stop_words")
 def stop_words() -> Set[str]:
     return STOP_WORDS
@@ -44,13 +53,16 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
     return LEX_ATTRS


+@registry.language_data("spacy.fr.get_noun_chunks")
+def get_noun_chunks() -> Callable:
+    return noun_chunks
+
+
 class FrenchDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
-    token_match = TOKEN_MATCH
-    syntax_iterators = SYNTAX_ITERATORS


 class French(Language):

@@ -1,26 +1,18 @@
+from typing import Union, Iterator
+
 from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
+from ...tokens import Doc, Span


-def noun_chunks(doclike):
-    """
-    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
-    """
-    labels = [
-        "nsubj",
-        "nsubj:pass",
-        "obj",
-        "iobj",
-        "ROOT",
-        "appos",
-        "nmod",
-        "nmod:poss",
-    ]
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
+    """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
+    # fmt: off
+    labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
+    # fmt: on
     doc = doclike.doc  # Ensure works on both Doc and Span.
-
     if not doc.is_parsed:
         raise ValueError(Errors.E029)
-
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")

@@ -1,8 +1,11 @@
 import re

+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from .punctuation import ELISION, HYPHENS
 from ..char_classes import ALPHA_LOWER, ALPHA
 from ...symbols import ORTH, LEMMA
+from ...util import update_exc
+

 # not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer
 # from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
@@ -452,7 +455,7 @@ _regular_exp += [
 ]


-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
 TOKEN_MATCH = re.compile(
     "(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp)
 ).match
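
The TOKEN_MATCH kept here follows the same pattern as before: a list of expressions is OR-ed together, compiled once with case-insensitive Unicode flags, and the tokenizer treats any string the resulting .match accepts as one token. An illustration of the construction with toy patterns (not the real French expressions):

# Toy illustration of the TOKEN_MATCH construction used above.
import re

_regular_exp = [r"\d+h\d+", r"\d+-\d+"]  # e.g. "12h30", "2019-2020"
TOKEN_MATCH = re.compile(
    "(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp)
).match

assert TOKEN_MATCH("12h30") is not None
assert TOKEN_MATCH("hello") is None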

@@ -3,9 +3,8 @@ from thinc.api import Config

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
+from ...util import registry


 DEFAULT_CONFIG = """
@@ -21,7 +20,7 @@ def stop_words() -> Set[str]:


 class IrishDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS


 class Irish(Language):

@@ -1,5 +1,7 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import POS, DET, ADP, CCONJ, ADV, NOUN, X, AUX
 from ...symbols import ORTH, LEMMA, NORM
+from ...util import update_exc


 _exc = {
@@ -81,4 +83,4 @@ for orth in ["d'", "D'"]:
     _exc[orth] = [{ORTH: orth}]


-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

@@ -4,7 +4,7 @@ from thinc.api import Config
 from .stop_words import STOP_WORDS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
+from ...util import registry


 DEFAULT_CONFIG = """
@@ -25,7 +25,7 @@ def stop_words() -> Set[str]:


 class HebrewDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+    tokenizer_exceptions = BASE_EXCEPTIONS


 class Hebrew(Language):

@@ -4,7 +4,7 @@ from thinc.api import Config
 from .stop_words import STOP_WORDS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
+from ...util import registry


 DEFAULT_CONFIG = """
@@ -28,7 +28,7 @@ def stop_words() -> Set[str]:


 class CroatianDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+    tokenizer_exceptions = BASE_EXCEPTIONS


 class Croatian(Language):

@@ -1,12 +1,11 @@
-from typing import Set
+from typing import Set, Pattern
 from thinc.api import Config

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
+from ...util import registry


 DEFAULT_CONFIG = """
@@ -14,6 +13,10 @@ DEFAULT_CONFIG = """
 lang = "hu"
 stop_words = {"@language_data": "spacy.hu.stop_words"}

+[nlp.tokenizer]
+@tokenizers = "spacy.Tokenizer.v1"
+token_match = {"@language_data": "spacy.hu.token_match"}
+
 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

@@ -29,12 +32,16 @@ def stop_words() -> Set[str]:
     return STOP_WORDS


+@registry.language_data("spacy.hu.token_match")
+def token_match() -> Pattern:
+    return TOKEN_MATCH
+
+
 class HungarianDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
-    token_match = TOKEN_MATCH


 class Hungarian(Language):

@@ -1,7 +1,9 @@
 import re

+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..punctuation import ALPHA_LOWER, CURRENCY
 from ...symbols import ORTH
+from ...util import update_exc


 _exc = {}
@@ -644,5 +646,5 @@ _nums = r"(({ne})|({t})|({on})|({c}))({s})?".format(
 )


-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
 TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match

@@ -5,10 +5,9 @@ from .stop_words import STOP_WORDS
 from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from .syntax_iterators import noun_chunks
 from ...language import Language
-from ...util import update_exc, registry
+from ...util import registry


 DEFAULT_CONFIG = """
@@ -16,6 +15,7 @@ DEFAULT_CONFIG = """
 lang = "id"
 stop_words = {"@language_data": "spacy.id.stop_words"}
 lex_attr_getters = {"@language_data": "spacy.id.lex_attr_getters"}
+get_noun_chunks = {"@language_data": "spacy.id.get_noun_chunks"}

 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"
@@ -42,12 +42,16 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
     return LEX_ATTRS


+@registry.language_data("spacy.id.get_noun_chunks")
+def get_noun_chunks() -> Callable:
+    return noun_chunks
+
+
 class IndonesianDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
-    syntax_iterators = SYNTAX_ITERATORS


 class Indonesian(Language):

@@ -1,26 +1,20 @@
+from typing import Union, Iterator
+
 from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
+from ...tokens import Doc, Span


-def noun_chunks(doclike):
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
     """
     Detect base noun phrases from a dependency parse. Works on both Doc and Span.
     """
-    labels = [
-        "nsubj",
-        "nsubj:pass",
-        "obj",
-        "iobj",
-        "ROOT",
-        "appos",
-        "nmod",
-        "nmod:poss",
-    ]
+    # fmt: off
+    labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
+    # fmt: on
     doc = doclike.doc  # Ensure works on both Doc and Span.
-
     if not doc.is_parsed:
         raise ValueError(Errors.E029)
-
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")

@@ -1,5 +1,8 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ._tokenizer_exceptions_list import ID_BASE_EXCEPTIONS
 from ...symbols import ORTH, LEMMA, NORM
+from ...util import update_exc
+

 # Daftar singkatan dan Akronim dari:
 # https://id.wiktionary.org/wiki/Wiktionary:Daftar_singkatan_dan_akronim_bahasa_Indonesia#A
@@ -221,4 +224,4 @@ for orth in [
 ]:
     _exc[orth] = [{ORTH: orth}]

-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

@@ -4,9 +4,8 @@ from thinc.api import Config
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
+from ...util import registry


 DEFAULT_CONFIG = """
@@ -30,7 +29,7 @@ def stop_words() -> Set[str]:


 class ItalianDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES

@@ -1,4 +1,7 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH, LEMMA
+from ...util import update_exc
+

 _exc = {
     "all'art.": [{ORTH: "all'"}, {ORTH: "art."}],
@@ -52,4 +55,4 @@ for orth in [
 ]:
     _exc[orth] = [{ORTH: orth}]

-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

@@ -1,11 +1,11 @@
-from typing import Optional, Union, Dict, Any, Set
+from typing import Optional, Union, Dict, Any, Set, Callable
 from pathlib import Path
 import srsly
 from collections import namedtuple
 from thinc.api import Config

 from .stop_words import STOP_WORDS
-from .syntax_iterators import SYNTAX_ITERATORS
+from .syntax_iterators import noun_chunks
 from .tag_map import TAG_MAP
 from .tag_orth_map import TAG_ORTH_MAP
 from .tag_bigram_map import TAG_BIGRAM_MAP
@@ -22,6 +22,7 @@ DEFAULT_CONFIG = """
 [nlp]
 lang = "ja"
 stop_words = {"@language_data": "spacy.ja.stop_words"}
+get_noun_chunks = {"@language_data": "spacy.ja.get_noun_chunks"}

 [nlp.tokenizer]
 @tokenizers = "spacy.JapaneseTokenizer.v1"
@@ -39,6 +40,11 @@ def stop_words() -> Set[str]:
     return STOP_WORDS


+@registry.language_data("spacy.ja.get_noun_chunks")
+def get_noun_chunks() -> Callable:
+    return noun_chunks
+
+
 @registry.tokenizers("spacy.JapaneseTokenizer.v1")
 def create_japanese_tokenizer(split_mode: Optional[str] = None):
     def japanese_tokenizer_factory(nlp):
@@ -50,6 +56,8 @@ def create_japanese_tokenizer(split_mode: Optional[str] = None):
 class JapaneseTokenizer(DummyTokenizer):
     def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None:
         self.vocab = nlp.vocab
+        # TODO: is this the right way to do it?
+        self.vocab.morphology.load_tag_map(TAG_MAP)
         self.split_mode = split_mode
         self.tokenizer = try_sudachi_import(self.split_mode)

@@ -171,14 +179,8 @@ class JapaneseTokenizer(DummyTokenizer):
         return self


-class JapaneseDefaults(Language.Defaults):
-    tag_map = TAG_MAP
-    syntax_iterators = SYNTAX_ITERATORS
-
-
 class Japanese(Language):
     lang = "ja"
-    Defaults = JapaneseDefaults
     default_config = Config().from_str(DEFAULT_CONFIG)

@@ -1,33 +1,23 @@
+from typing import Union, Iterator
+
 from ...symbols import NOUN, PROPN, PRON, VERB
+from ...tokens import Doc, Span
 
-# XXX this can probably be pruned a bit
-labels = [
-    "nsubj",
-    "nmod",
-    "dobj",
-    "nsubjpass",
-    "pcomp",
-    "pobj",
-    "obj",
-    "obl",
-    "dative",
-    "appos",
-    "attr",
-    "ROOT",
-]
-
-
-def noun_chunks(obj):
-    """
-    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
-    """
-    doc = obj.doc  # Ensure works on both Doc and Span.
+# TODO: this can probably be pruned a bit
+# fmt: off
+labels = ["nsubj", "nmod", "ddoclike", "nsubjpass", "pcomp", "pdoclike", "doclike", "obl", "dative", "appos", "attr", "ROOT"]
+# fmt: on
+
+
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
+    """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
+    doc = doclike.doc  # Ensure works on both Doc and Span.
     np_deps = [doc.vocab.strings.add(label) for label in labels]
     doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
     seen = set()
-    for i, word in enumerate(obj):
+    for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
             continue
         # Prevent nested chunks from being produced
@@ -37,12 +27,10 @@ def noun_chunks(obj):
         unseen = [w.i for w in word.subtree if w.i not in seen]
         if not unseen:
             continue
 
         # this takes care of particles etc.
         seen.update(j.i for j in word.subtree)
         # This avoids duplicating embedded clauses
         seen.update(range(word.i + 1))
 
         # if the head of this is a verb, mark that and rights seen
         # Don't do the subtree as that can hide other phrases
         if word.head.pos == VERB:
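
As an aside, the bookkeeping these noun_chunks iterators rely on is easy to miss in the diff: a `seen` set records every token index already covered, so nested or overlapping candidates are skipped. The standalone sketch below (toy data, no spaCy imports) illustrates the same de-duplication idea; it is only an illustration, not the code from this commit.

from typing import Iterator, List, Set, Tuple

# Each candidate is (head_index, covered_indices) for a toy "parse".
Candidate = Tuple[int, List[int]]


def dedup_chunks(candidates: List[Candidate]) -> Iterator[Tuple[int, int]]:
    """Yield (start, end) spans, skipping candidates fully covered already."""
    seen: Set[int] = set()
    for head, covered in candidates:
        unseen = [i for i in covered if i not in seen]
        if not unseen:
            # Everything in this candidate is already part of an earlier chunk.
            continue
        seen.update(covered)
        yield min(covered), max(covered) + 1


if __name__ == "__main__":
    # "the big dog" (0-2), a nested candidate "dog" (2) that is skipped, then (4-5).
    toy = [(2, [0, 1, 2]), (2, [2]), (5, [4, 5])]
    print(list(dedup_chunks(toy)))  # [(0, 3), (4, 6)]
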
@@ -40,6 +40,8 @@ def create_korean_tokenizer():
 class KoreanTokenizer(DummyTokenizer):
     def __init__(self, nlp: Optional[Language] = None):
         self.vocab = nlp.vocab
+        # TODO: is this the right way to do it?
+        self.vocab.morphology.load_tag_map(TAG_MAP)
         MeCab = try_mecab_import()
         self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")
 
@@ -72,13 +74,8 @@ class KoreanTokenizer(DummyTokenizer):
             yield {"surface": surface, "lemma": lemma, "tag": tag}
 
 
-class KoreanDefaults(Language.Defaults):
-    tag_map = TAG_MAP
-
-
 class Korean(Language):
     lang = "ko"
-    Defaults = KoreanDefaults
     default_config = Config().from_str(DEFAULT_CONFIG)
 
@@ -5,9 +5,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
+from ...util import registry
 
 
 DEFAULT_CONFIG = """
@@ -42,7 +41,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
 
 
 class LuxembourgishDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
 
@@ -1,4 +1,7 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH, LEMMA, NORM
+from ...util import update_exc
 
 
 # TODO
 # treat other apostrophes within words as part of the word: [op d'mannst], [fir d'éischt] (= exceptions)
@@ -47,4 +50,4 @@ for orth in [
 ]:
     _exc[orth] = [{ORTH: orth}]
 
-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
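
For readers unfamiliar with the helper, `update_exc` merges the shared base exceptions with a language's own entries, with the language-specific ones winning on conflict. The sketch below is a rough, simplified stand-in over plain dicts, not spaCy's actual implementation.

from typing import Dict, List


def merge_exceptions(base: Dict[str, List[dict]], *addition_dicts: Dict[str, List[dict]]) -> Dict[str, List[dict]]:
    # Simplified stand-in for spacy.util.update_exc: later dicts override earlier ones.
    merged = dict(base)
    for additions in addition_dicts:
        merged.update(additions)
    return merged


BASE = {":)": [{"ORTH": ":)"}], "e.g.": [{"ORTH": "e.g."}]}
LB = {"d'mannst": [{"ORTH": "d'mannst"}]}
print(sorted(merge_exceptions(BASE, LB)))  # [':)', "d'mannst", 'e.g.']
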
@@ -4,9 +4,8 @@ from thinc.api import Config
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
+from ...util import registry
 
 
 DEFAULT_CONFIG = """
@@ -22,7 +21,7 @@ def stop_words() -> Set[str]:
 
 
 class LigurianDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
 
@@ -1,4 +1,7 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH, LEMMA
+from ...util import update_exc
 
 
 _exc = {}
 
@@ -47,4 +50,4 @@ for prep, prep_lemma in [
         {ORTH: prep, LEMMA: prep_lemma},
     ]
 
-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
@@ -5,9 +5,8 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
+from ...util import registry
 
 
 DEFAULT_CONFIG = """
@@ -39,11 +38,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
 class LithuanianDefaults(Language.Defaults):
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
-    mod_base_exceptions = {
-        exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
-    }
-    del mod_base_exceptions["8)"]
-    tokenizer_exceptions = update_exc(mod_base_exceptions, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
 
 
 class Lithuanian(Language):
@@ -1,267 +1,15 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH
+from ...util import update_exc
 
 
 _exc = {}
 
-for orth in [
-    "n-tosios",
-    "?!",
-    # "G.",
-    # "J. E.",
-    # "J. Em.",
-    # "J.E.",
-    # "J.Em.",
-    # "K.",
-    # "N.",
-    # "V.",
-    # "Vt.",
-    # "a.",
-    # "a.k.",
-    # "a.s.",
-    # "adv.",
-    # "akad.",
-    # "aklg.",
-    # "akt.",
-    # "al.",
-    # "ang.",
-    # "angl.",
-    # "aps.",
-    # "apskr.",
-    # "apyg.",
-    # "arbat.",
-    # "asist.",
-    # "asm.",
-    # "asm.k.",
-    # "asmv.",
-    # "atk.",
-    # "atsak.",
-    # "atsisk.",
-    # "atsisk.sąsk.",
-    # "atv.",
-    # "aut.",
-    # "avd.",
-    # "b.k.",
-    # "baud.",
-    # "biol.",
-    # "bkl.",
-    # "bot.",
-    # "bt.",
-    # "buv.",
-    # "ch.",
-    # "chem.",
-    # "corp.",
-    # "d.",
-    # "dab.",
-    # "dail.",
-    # "dek.",
-    # "deš.",
-    # "dir.",
-    # "dirig.",
-    # "doc.",
-    # "dol.",
-    # "dr.",
-    # "drp.",
-    # "dvit.",
-    # "dėst.",
-    # "dš.",
-    # "dž.",
-    # "e.b.",
-    # "e.bankas",
-    # "e.p.",
-    # "e.parašas",
-    # "e.paštas",
-    # "e.v.",
-    # "e.valdžia",
-    # "egz.",
-    # "eil.",
-    # "ekon.",
-    # "el.",
-    # "el.bankas",
-    # "el.p.",
-    # "el.parašas",
-    # "el.paštas",
-    # "el.valdžia",
-    # "etc.",
-    # "ež.",
-    # "fak.",
-    # "faks.",
-    # "feat.",
-    # "filol.",
-    # "filos.",
-    # "g.",
-    # "gen.",
-    # "geol.",
-    # "gerb.",
-    # "gim.",
-    # "gr.",
-    # "gv.",
-    # "gyd.",
-    # "gyv.",
-    # "habil.",
-    # "inc.",
-    # "insp.",
-    # "inž.",
-    # "ir pan.",
-    # "ir t. t.",
-    # "isp.",
-    # "istor.",
-    # "it.",
-    # "just.",
-    # "k.",
-    # "k. a.",
-    # "k.a.",
-    # "kab.",
-    # "kand.",
-    # "kart.",
-    # "kat.",
-    # "ketv.",
-    # "kh.",
-    # "kl.",
-    # "kln.",
-    # "km.",
-    # "kn.",
-    # "koresp.",
-    # "kpt.",
-    # "kr.",
-    # "kt.",
-    # "kub.",
-    # "kun.",
-    # "kv.",
-    # "kyš.",
-    # "l. e. p.",
-    # "l.e.p.",
-    # "lenk.",
-    # "liet.",
-    # "lot.",
-    # "lt.",
-    # "ltd.",
-    # "ltn.",
-    # "m.",
-    # "m.e..",
-    # "m.m.",
-    # "mat.",
-    # "med.",
-    # "mgnt.",
-    # "mgr.",
-    # "min.",
-    # "mjr.",
-    # "ml.",
-    # "mln.",
-    # "mlrd.",
-    # "mob.",
-    # "mok.",
-    # "moksl.",
-    # "mokyt.",
-    # "mot.",
-    # "mr.",
-    # "mst.",
-    # "mstl.",
-    # "mėn.",
-    # "nkt.",
-    # "no.",
-    # "nr.",
-    # "ntk.",
-    # "nuotr.",
-    # "op.",
-    # "org.",
-    # "orig.",
-    # "p.",
-    # "p.d.",
-    # "p.m.e.",
-    # "p.s.",
-    # "pab.",
-    # "pan.",
-    # "past.",
-    # "pav.",
-    # "pavad.",
-    # "per.",
-    # "perd.",
-    # "pirm.",
-    # "pl.",
-    # "plg.",
-    # "plk.",
-    # "pr.",
-    # "pr.Kr.",
-    # "pranc.",
-    # "proc.",
-    # "prof.",
-    # "prom.",
-    # "prot.",
-    # "psl.",
-    # "pss.",
-    # "pvz.",
-    # "pšt.",
-    # "r.",
-    # "raj.",
-    # "red.",
-    # "rez.",
-    # "rež.",
-    # "rus.",
-    # "rš.",
-    # "s.",
-    # "sav.",
-    # "saviv.",
-    # "sek.",
-    # "sekr.",
-    # "sen.",
-    # "sh.",
-    # "sk.",
-    # "skg.",
-    # "skv.",
-    # "skyr.",
-    # "sp.",
-    # "spec.",
-    # "sr.",
-    # "st.",
-    # "str.",
-    # "stud.",
-    # "sąs.",
-    # "t.",
-    # "t. p.",
-    # "t. y.",
-    # "t.p.",
-    # "t.t.",
-    # "t.y.",
-    # "techn.",
-    # "tel.",
-    # "teol.",
-    # "th.",
-    # "tir.",
-    # "trit.",
-    # "trln.",
-    # "tšk.",
-    # "tūks.",
-    # "tūkst.",
-    # "up.",
-    # "upl.",
-    # "v.s.",
-    # "vad.",
-    # "val.",
-    # "valg.",
-    # "ved.",
-    # "vert.",
-    # "vet.",
-    # "vid.",
-    # "virš.",
-    # "vlsč.",
-    # "vnt.",
-    # "vok.",
-    # "vs.",
-    # "vtv.",
-    # "vv.",
-    # "vyr.",
-    # "vyresn.",
-    # "zool.",
-    # "Įn",
-    # "įl.",
-    # "š.m.",
-    # "šnek.",
-    # "šv.",
-    # "švč.",
-    # "ž.ū.",
-    # "žin.",
-    # "žml.",
-    # "žr.",
-]:
+for orth in ["n-tosios", "?!"]:
     _exc[orth] = [{ORTH: orth}]
 
-TOKENIZER_EXCEPTIONS = _exc
+mod_base_exceptions = {
+    exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
+}
+del mod_base_exceptions["8)"]
+TOKENIZER_EXCEPTIONS = update_exc(mod_base_exceptions, _exc)
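
The Lithuanian case is the one place where the base exceptions are not taken wholesale: entries ending in a period, plus the "8)" emoticon, are stripped before merging, presumably to avoid clashes with Lithuanian's own abbreviation handling. A minimal sketch of that filtering over a toy base dict (the dict contents are made up for illustration):

# Toy stand-in for the shared BASE_EXCEPTIONS dict.
base_exceptions = {
    "e.g.": [{"ORTH": "e.g."}],
    ":)": [{"ORTH": ":)"}],
    "8)": [{"ORTH": "8)"}],
    "vs.": [{"ORTH": "vs."}],
}

# Drop entries ending in "." ...
mod_base_exceptions = {
    exc: val for exc, val in base_exceptions.items() if not exc.endswith(".")
}
# ... and remove the "8)" emoticon explicitly.
del mod_base_exceptions["8)"]

print(sorted(mod_base_exceptions))  # [':)']
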
@@ -1,20 +1,20 @@
-from typing import Set
+from typing import Set, Callable
 from thinc.api import Config
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
-from .syntax_iterators import SYNTAX_ITERATORS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from .syntax_iterators import noun_chunks
 from ...language import Language
-from ...util import update_exc, registry
+from ...util import registry
 
 
 DEFAULT_CONFIG = """
 [nlp]
 lang = "nb"
 stop_words = {"@language_data": "spacy.nb.stop_words"}
+get_noun_chunks = {"@language_data": "spacy.nb.get_noun_chunks"}
 
 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"
@@ -31,12 +31,16 @@ def stop_words() -> Set[str]:
     return STOP_WORDS
 
 
+@registry.language_data("spacy.nb.get_noun_chunks")
+def get_noun_chunks() -> Callable:
+    return noun_chunks
+
+
 class NorwegianDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
-    syntax_iterators = SYNTAX_ITERATORS
 
 
 class Norwegian(Language):
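
The `@registry.language_data(...)` decorators above follow a simple named-registry pattern: a string name in the config resolves to a registered function at load time. The hand-rolled sketch below mimics that lookup with a plain dict; it is not thinc's or spaCy's actual registry, only an illustration of the pattern.

from typing import Callable, Dict

_language_data: Dict[str, Callable] = {}


def language_data(name: str) -> Callable:
    """Toy decorator: register a factory under a string name."""
    def register(func: Callable) -> Callable:
        _language_data[name] = func
        return func
    return register


@language_data("toy.nb.get_noun_chunks")
def get_noun_chunks() -> Callable:
    def noun_chunks(doclike):
        return iter(())  # placeholder iterator
    return noun_chunks


# A config value like {"@language_data": "toy.nb.get_noun_chunks"} resolves to:
resolved = _language_data["toy.nb.get_noun_chunks"]()
print(callable(resolved))  # True
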
@@ -1,26 +1,18 @@
+from typing import Union, Iterator
+
 from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
+from ...tokens import Doc, Span
 
 
-def noun_chunks(doclike):
-    """
-    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
-    """
-    labels = [
-        "nsubj",
-        "nsubj:pass",
-        "obj",
-        "iobj",
-        "ROOT",
-        "appos",
-        "nmod",
-        "nmod:poss",
-    ]
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
+    """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
+    # fmt: off
+    labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
+    # fmt: on
     doc = doclike.doc  # Ensure works on both Doc and Span.
 
     if not doc.is_parsed:
         raise ValueError(Errors.E029)
 
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
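
The `# fmt: off` / `# fmt: on` pair is there purely for the formatter: it keeps Black (or a similar tool) from expanding the one-line label list back into a multi-line literal. A trivial sketch of the same trick:

# fmt: off
LABELS = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
# fmt: on

print(len(LABELS))  # 8
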
@@ -1,4 +1,6 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH, LEMMA
+from ...util import update_exc
 
 
 _exc = {}
@@ -218,4 +220,4 @@ for orth in [
     _exc[orth] = [{ORTH: orth}]
 
 
-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
@@ -7,9 +7,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .lemmatizer import DutchLemmatizer
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
+from ...util import registry
 
 
 DEFAULT_CONFIG = """
@@ -44,7 +43,7 @@ def create_dutch_lemmatizer(data: Dict[str, dict] = {}) -> DutchLemmatizer:
 
 
 class DutchDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
 
@@ -1,4 +1,7 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH
+from ...util import update_exc
 
 
 # Extensive list of both common and uncommon dutch abbreviations copied from
 # github.com/diasks2/pragmatic_segmenter, a Ruby library for rule-based
@@ -1602,4 +1605,4 @@ for orth in abbrevs:
     _exc[i] = [{ORTH: i}]
 
 
-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
@@ -4,10 +4,9 @@ from thinc.api import Config
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
 from ...language import Language
-from ...util import update_exc, registry
+from ...util import registry
 
 
 DEFAULT_CONFIG = """
@@ -37,7 +36,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
 
 
 class PortugueseDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     prefixes = TOKENIZER_PREFIXES
 
@@ -1,4 +1,6 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH
+from ...util import update_exc
 
 
 _exc = {}
@@ -50,4 +52,4 @@ for orth in [
     _exc[orth] = [{ORTH: orth}]
 
 
-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
@@ -5,9 +5,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
+from ...util import registry
 
 # Lemma data note:
 # Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/
@@ -35,7 +34,7 @@ def stop_words() -> Set[str]:
 
 
 class RomanianDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
 
@@ -1,4 +1,6 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH
+from ...util import update_exc
 from .punctuation import _make_ro_variants
 
 
@@ -91,4 +93,4 @@ for orth in [
         _exc[variant] = [{ORTH: variant}]
 
 
-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
@@ -5,8 +5,7 @@ from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import RussianLemmatizer
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...util import update_exc, registry
+from ...util import registry
 from ...language import Language
 
 
@@ -42,7 +41,7 @@ def create_russian_lemmatizer() -> RussianLemmatizer:
 
 
 class RussianDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
 
 
 class Russian(Language):
 
@@ -1,4 +1,6 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH, LEMMA, NORM
+from ...util import update_exc
 
 
 _exc = {}
@@ -63,4 +65,4 @@ for slang_desc in _slang_exc:
     _exc[slang_desc[ORTH]] = [slang_desc]
 
 
-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
@@ -4,9 +4,8 @@ from thinc.api import Config
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
+from ...util import registry
 
 
 DEFAULT_CONFIG = """
@@ -41,7 +40,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
 
 
 class SerbianDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
 
 
 class Serbian(Language):
 
@@ -1,4 +1,6 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH, LEMMA, NORM
+from ...util import update_exc
 
 
 _exc = {}
@@ -90,4 +92,4 @@ for slang_desc in _slang_exc:
     _exc[slang_desc[ORTH]] = [slang_desc]
 
 
-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
@@ -4,10 +4,9 @@ from thinc.api import Config
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
-from .syntax_iterators import SYNTAX_ITERATORS
+from ...util import registry
+from .syntax_iterators import noun_chunks
 
 # Punctuation stolen from Danish
 from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
@@ -18,6 +17,7 @@ DEFAULT_CONFIG = """
 lang = "sv"
 stop_words = {"@language_data": "spacy.sv.stop_words"}
 lex_attr_getters = {"@language_data": "spacy.sv.lex_attr_getters"}
+get_noun_chunks = {"@language_data": "spacy.sv.get_noun_chunks"}
 
 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"
@@ -39,11 +39,15 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
     return LEX_ATTRS
 
 
+@registry.language_data("spacy.sv.get_noun_chunks")
+def get_noun_chunks() -> Callable:
+    return noun_chunks
+
+
 class SwedishDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
-    syntax_iterators = SYNTAX_ITERATORS
 
 
 class Swedish(Language):
@@ -1,27 +1,18 @@
+from typing import Union, Iterator
+
 from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
+from ...tokens import Doc, Span
 
 
-def noun_chunks(doclike):
-    """
-    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
-    """
-    labels = [
-        "nsubj",
-        "nsubj:pass",
-        "dobj",
-        "obj",
-        "iobj",
-        "ROOT",
-        "appos",
-        "nmod",
-        "nmod:poss",
-    ]
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
+    """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
+    # fmt: off
+    labels = ["nsubj", "nsubj:pass", "dobj", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
+    # fmt: on
    doc = doclike.doc  # Ensure works on both Doc and Span.
 
    if not doc.is_parsed:
        raise ValueError(Errors.E029)
 
    np_deps = [doc.vocab.strings[label] for label in labels]
    conj = doc.vocab.strings.add("conj")
    np_label = doc.vocab.strings.add("NP")
@@ -1,4 +1,6 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import LEMMA, NORM, ORTH, PRON_LEMMA
+from ...util import update_exc
 
 _exc = {}
 
@@ -154,4 +156,4 @@ for orth in ABBREVIATIONS:
 for orth in ["i", "m"]:
     _exc[orth + "."] = [{ORTH: orth, LEMMA: orth, NORM: orth}, {ORTH: "."}]
 
-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
@@ -1,25 +0,0 @@
-from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ
-from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ
-
-
-TAG_MAP = {
-    "ADV": {POS: ADV},
-    "NOUN": {POS: NOUN},
-    "ADP": {POS: ADP},
-    "PRON": {POS: PRON},
-    "SCONJ": {POS: SCONJ},
-    "PROPN": {POS: PROPN},
-    "DET": {POS: DET},
-    "SYM": {POS: SYM},
-    "INTJ": {POS: INTJ},
-    "PUNCT": {POS: PUNCT},
-    "NUM": {POS: NUM},
-    "AUX": {POS: AUX},
-    "X": {POS: X},
-    "CONJ": {POS: CONJ},
-    "CCONJ": {POS: CCONJ},
-    "ADJ": {POS: ADJ},
-    "VERB": {POS: VERB},
-    "PART": {POS: PART},
-    "_SP": {POS: SPACE},
-}
@@ -4,9 +4,8 @@ from thinc.api import Config
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
+from ...util import registry
 
 
 DEFAULT_CONFIG = """
@@ -36,7 +35,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
 
 
 class TagalogDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
 
 
 class Tagalog(Language):
 
@@ -1,4 +1,6 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH, LEMMA
+from ...util import update_exc
 
 
 _exc = {
@@ -14,4 +16,4 @@ _exc = {
 }
 
 
-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
@@ -55,7 +55,6 @@ URL_PATTERN = (
     # fmt: on
 ).strip()
 
-TOKEN_MATCH = None
 URL_MATCH = re.compile("(?u)" + URL_PATTERN).match
 
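
With `TOKEN_MATCH = None` gone, only the compiled URL matcher remains module-level here; the `token_match` setting now arrives through the config instead. The sketch below shows the same compile-and-expose-`.match` pattern with a deliberately simplified URL regex, not spaCy's full URL_PATTERN:

import re

# Deliberately simplified stand-in for URL_PATTERN.
SIMPLE_URL_PATTERN = r"https?://\S+"

URL_MATCH = re.compile("(?u)" + SIMPLE_URL_PATTERN).match

print(bool(URL_MATCH("https://spacy.io")))  # True
print(bool(URL_MATCH("not-a-url")))         # False
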
@@ -3,9 +3,8 @@ from thinc.api import Config
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
+from ...util import registry
 
 
 DEFAULT_CONFIG = """
@@ -29,7 +28,7 @@ def stop_words() -> Set[str]:
 
 
 class TurkishDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
 
 
 class Turkish(Language):
 
@@ -1,4 +1,7 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH, NORM
+from ...util import update_exc
 
 
 _exc = {"sağol": [{ORTH: "sağ"}, {ORTH: "ol", NORM: "olun"}]}
 
@@ -113,4 +116,4 @@ for orth in ["Dr.", "yy."]:
     _exc[orth] = [{ORTH: orth}]
 
 
-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
@@ -5,9 +5,8 @@ from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
+from ...util import registry
 
 
 DEFAULT_CONFIG = """
@@ -29,7 +28,7 @@ def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
 
 
 class TatarDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = tuple(TOKENIZER_INFIXES)
 
@@ -1,4 +1,7 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH, LEMMA, NORM
+from ...util import update_exc
 
 
 _exc = {}
 
@@ -43,4 +46,4 @@ for exc_data in [  # "etc." abbreviations
     exc_data[LEMMA] = exc_data[NORM]
     _exc[exc_data[ORTH]] = [exc_data]
 
-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
@@ -4,8 +4,7 @@ from thinc.api import Config
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...util import update_exc, registry
+from ...util import registry
 from ...language import Language
 from .lemmatizer import UkrainianLemmatizer
 
@@ -37,7 +36,7 @@ def create_ukrainian_lemmatizer() -> UkrainianLemmatizer:
 
 
 class UkrainianDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
 
 
 class Ukrainian(Language):
 
@@ -1,4 +1,6 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH, LEMMA, POS, NORM, NOUN
+from ...util import update_exc
 
 
 _exc = {}
@@ -21,4 +23,4 @@ for exc_data in [
     _exc[exc_data[ORTH]] = [exc_data]
 
 
-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
@@ -24,9 +24,7 @@ from .util import link_vectors_to_models, create_default_optimizer, registry
 from .util import SimpleFrozenDict
 from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .lang.punctuation import TOKENIZER_INFIXES
-from .lang.tokenizer_exceptions import TOKEN_MATCH, URL_MATCH
-from .lang.tag_map import TAG_MAP
-from .tokens import Doc, Span
+from .tokens import Doc
 from .errors import Errors, Warnings
 from .schemas import ConfigSchema
 from .git_info import GIT_VERSION
@@ -37,6 +35,7 @@ from . import about
 from .tokenizer import Tokenizer  # noqa: F401
 from .lemmatizer import Lemmatizer  # noqa: F401
 from .lookups import Lookups  # noqa: F401
+from .lang import defaults  # noqa: F401
 
 
 ENABLE_PIPELINE_ANALYSIS = False
@@ -46,15 +45,10 @@ DEFAULT_CONFIG = Config().from_disk(DEFAULT_CONFIG_PATH)
 
 
 class BaseDefaults:
-    token_match: Optional[Pattern] = TOKEN_MATCH
-    url_match: Pattern = URL_MATCH
     prefixes: Tuple[Pattern, ...] = tuple(TOKENIZER_PREFIXES)
     suffixes: Tuple[Pattern, ...] = tuple(TOKENIZER_SUFFIXES)
     infixes: Tuple[Pattern, ...] = tuple(TOKENIZER_INFIXES)
-    tag_map: Dict[str, dict] = dict(TAG_MAP)
     tokenizer_exceptions: Dict[str, List[dict]] = {}
-    morph_rules: Dict[str, Dict[str, dict]] = {}
-    syntax_iterators: Dict[str, Callable[[Union[Doc, Span]], Iterator]] = {}
 
 
 class Language:
@@ -114,13 +108,7 @@ class Language:
 
         if vocab is True:
             vectors_name = meta.get("vectors", {}).get("name")
-            vocab = Vocab.from_config(
-                self._config,
-                vectors_name=vectors_name,
-                # TODO: what should we do with these?
-                tag_map=self.Defaults.tag_map,
-                morph_rules=self.Defaults.morph_rules,
-            )
+            vocab = Vocab.from_config(self._config, vectors_name=vectors_name)
         else:
             if (self.lang and vocab.lang) and (self.lang != vocab.lang):
                 raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
@@ -1267,15 +1255,14 @@ class Language:
         lex_attr_getters = resolved["nlp"]["lex_attr_getters"]
         stop_words = resolved["nlp"]["stop_words"]
         vocab_data = resolved["nlp"]["vocab_data"]
+        get_noun_chunks = resolved["nlp"]["get_noun_chunks"]
         vocab = Vocab.from_config(
             filled,
             lemmatizer=lemmatizer,
             lex_attr_getters=lex_attr_getters,
             stop_words=stop_words,
             vocab_data=vocab_data,
-            # TODO: what should we do with these?
-            tag_map=cls.Defaults.tag_map,
-            morph_rules=cls.Defaults.morph_rules,
+            get_noun_chunks=get_noun_chunks,
         )
         nlp = cls(vocab, create_tokenizer=create_tokenizer)
         pipeline = config.get("components", {})
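
The net effect of the `Language.from_config` change is that the noun-chunk getter now travels through the resolved config into the vocab, instead of being pulled off `cls.Defaults`. A stripped-down, spaCy-free sketch of that flow (the class and function names here are made up for illustration):

from typing import Callable, Optional


class ToyVocab:
    def __init__(self, get_noun_chunks: Optional[Callable] = None):
        self.get_noun_chunks = get_noun_chunks


def toy_from_config(resolved: dict) -> ToyVocab:
    # Mirrors the pattern: read the callable out of the resolved config block
    # and hand it straight to the vocab constructor.
    get_noun_chunks = resolved["nlp"]["get_noun_chunks"]
    return ToyVocab(get_noun_chunks=get_noun_chunks)


vocab = toy_from_config({"nlp": {"get_noun_chunks": lambda doclike: iter(())}})
print(callable(vocab.get_noun_chunks))  # True
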
@@ -243,6 +243,7 @@ class ConfigSchemaNlp(BaseModel):
     stop_words: Sequence[StrictStr] = Field(..., title="Stop words to mark via Token/Lexeme.is_stop")
     lex_attr_getters: Dict[StrictStr, Callable] = Field(..., title="Custom getter functions for lexical attributes (e.g. like_num)")
     vocab_data: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., title="Vocabulary data, e.g. lexeme normalization tables")
+    get_noun_chunks: Optional[Callable] = Field(..., title="Function to extract noun phrases from a Doc")
     # fmt: on
 
     class Config:
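
Schema-wise, the new entry is just another optional callable on the validated `[nlp]` block. The cut-down, hypothetical analogue below (it requires pydantic and is not the real ConfigSchemaNlp) shows that an `Optional[Callable]` field accepts a function value:

from typing import Callable, Optional

from pydantic import BaseModel, Field


class ToyNlpSchema(BaseModel):
    # Hypothetical, cut-down analogue of ConfigSchemaNlp.
    lang: str = Field(..., title="Language code")
    get_noun_chunks: Optional[Callable] = Field(None, title="Function to extract noun phrases from a Doc")


cfg = ToyNlpSchema(lang="nb", get_noun_chunks=lambda doclike: iter(()))
print(cfg.lang, callable(cfg.get_noun_chunks))  # nb True
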
@@ -1,7 +1,7 @@
 import numpy
 from spacy.attrs import HEAD, DEP
 from spacy.symbols import nsubj, dobj, amod, nmod, conj, cc, root
-from spacy.lang.en.syntax_iterators import SYNTAX_ITERATORS
+from spacy.lang.en.syntax_iterators import noun_chunks
 
 import pytest
 
@@ -41,7 +41,7 @@ def test_en_noun_chunks_not_nested(en_vocab):
             dtype="uint64",
         ),
     )
-    doc.noun_chunks_iterator = SYNTAX_ITERATORS["noun_chunks"]
+    doc.noun_chunks_iterator = noun_chunks
     word_occurred = {}
     for chunk in doc.noun_chunks:
         for word in chunk:
@@ -9,7 +9,7 @@ from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 cimport cython
 
-from typing import Dict, List, Union, Pattern, Optional
+from typing import Dict, List, Union, Pattern, Optional, Any
 import re
 import warnings
 
@@ -32,16 +32,16 @@ def create_tokenizer(
     # prefixes: Optional[List[Union[str, Pattern]]],
     # suffixes: Optional[List[Union[str, Pattern]]],
     # infixes: Optional[List[Union[str, Pattern]]],
-    # token_match: Optional[Pattern],
-    # url_match: Optional[Pattern],
+    # We currently can't validate against Pattern because that will cause
+    # Pydantic to parse value *as* pattern
+    token_match: Optional[Any] = None,
+    url_match: Optional[Any] = None,
 ) -> "Tokenizer":
     def tokenizer_factory(nlp):
         exceptions = nlp.Defaults.tokenizer_exceptions
         prefixes = nlp.Defaults.prefixes
         suffixes = nlp.Defaults.suffixes
         infixes = nlp.Defaults.infixes
-        url_match = nlp.Defaults.url_match
-        token_match = nlp.Defaults.token_match
         prefix_search = util.compile_prefix_regex(prefixes).search if prefixes else None
         suffix_search = util.compile_suffix_regex(suffixes).search if suffixes else None
         infix_finditer = util.compile_infix_regex(infixes).finditer if infixes else None
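
The `create_tokenizer` block above is a factory-of-a-factory: the registered function takes the loosely typed `token_match` / `url_match` settings and returns a `tokenizer_factory(nlp)` closure that builds the real tokenizer from the language defaults. A spaCy-free sketch of that closure pattern, with a hypothetical `ToyTokenizer`:

import re
from typing import Any, Callable, Optional


class ToyTokenizer:
    # Hypothetical stand-in for the real Tokenizer class.
    def __init__(self, exceptions: dict, token_match: Optional[Callable], url_match: Optional[Callable]):
        self.exceptions = exceptions
        self.token_match = token_match
        self.url_match = url_match


def create_toy_tokenizer(token_match: Optional[Any] = None, url_match: Optional[Any] = None) -> Callable:
    # The outer function holds the config-provided settings ...
    def tokenizer_factory(defaults: dict) -> ToyTokenizer:
        # ... the inner closure reads everything else off the language defaults.
        return ToyTokenizer(defaults.get("tokenizer_exceptions", {}), token_match, url_match)
    return tokenizer_factory


factory = create_toy_tokenizer(url_match=re.compile(r"https?://\S+").match)
tok = factory({"tokenizer_exceptions": {"e.g.": [{"ORTH": "e.g."}]}})
print(bool(tok.url_match("https://spacy.io")))  # True
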
@@ -89,16 +89,6 @@ cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name)
     return get_token_attr(token, feat_name)
 
 
-def _get_chunker(lang):
-    try:
-        cls = util.get_lang_class(lang)
-    except ImportError:
-        return None
-    except KeyError:
-        return None
-    return cls.Defaults.syntax_iterators.get("noun_chunks")
-
-
 cdef class Doc:
     """A sequence of Token objects. Access sentences and named entities, export
     annotations to numpy arrays, losslessly serialize to compressed binary
@@ -212,7 +202,7 @@ cdef class Doc:
         self.tensor = numpy.zeros((0,), dtype="float32")
         self.user_data = {} if user_data is None else user_data
         self._vector = None
-        self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
+        self.noun_chunks_iterator = self.vocab.get_noun_chunks
         cdef bint has_space
         if words is None and spaces is not None:
             raise ValueError("words must be set if spaces is set")
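
Design-wise, dropping `_get_chunker` means `Doc` no longer re-imports the language class to find a chunker; it simply takes whatever callable the shared vocab carries. A minimal mock of that wiring (toy classes, not spaCy's):

class ToyVocab:
    def __init__(self, get_noun_chunks=None):
        # Whatever the config registered (or None) is stored on the vocab.
        self.get_noun_chunks = get_noun_chunks


class ToyDoc:
    def __init__(self, vocab: ToyVocab):
        self.vocab = vocab
        # Mirrors: self.noun_chunks_iterator = self.vocab.get_noun_chunks
        self.noun_chunks_iterator = vocab.get_noun_chunks


def fake_noun_chunks(doclike):
    yield "a toy chunk"


doc = ToyDoc(ToyVocab(get_noun_chunks=fake_noun_chunks))
print(list(doc.noun_chunks_iterator(doc)))  # ['a toy chunk']
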
@@ -30,6 +30,7 @@ cdef class Vocab:
     cpdef public object vectors
     cpdef public object lookups
     cpdef public object writing_system
+    cpdef public object get_noun_chunks
     cdef readonly int length
     cdef public object data_dir
     cdef public object lex_attr_getters
@@ -30,10 +30,10 @@ cdef class Vocab:
 
     DOCS: https://spacy.io/api/vocab
     """
-    def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
-                 strings=tuple(), lookups=None, vocab_data={},
+    def __init__(self, lex_attr_getters=None, lemmatizer=None,
+                 strings=tuple(), lookups=None, tag_map={}, vocab_data={},
                  oov_prob=-20., vectors_name=None, writing_system={},
-                 **deprecated_kwargs):
+                 get_noun_chunks=None, **deprecated_kwargs):
        """Create the vocabulary.
 
        lex_attr_getters (dict): A dictionary mapping attribute IDs to
@@ -49,7 +49,6 @@ cdef class Vocab:
        RETURNS (Vocab): The newly constructed object.
        """
        lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
-        tag_map = tag_map if tag_map is not None else {}
        if lookups in (None, True, False):
            lookups = Lookups()
        for name, data in vocab_data.items():
@@ -71,6 +70,7 @@ cdef class Vocab:
        self.vectors = Vectors(name=vectors_name)
        self.lookups = lookups
        self.writing_system = writing_system
+        self.get_noun_chunks = get_noun_chunks
 
    @property
    def lang(self):
@@ -424,9 +424,8 @@ cdef class Vocab:
        lex_attr_getters=None,
        stop_words=None,
        vocab_data=None,
+        get_noun_chunks=None,
        vectors_name=None,
-        tag_map=None,
-        morph_rules=None
    ):
        """Create a Vocab from a config and (currently) language defaults, i.e.
        nlp.Defaults.
@@ -449,6 +448,9 @@ cdef class Vocab:
        if vocab_data is None:
            vocab_data_cfg = {"vocab_data": config["nlp"]["vocab_data"]}
            vocab_data = registry.make_from_config(vocab_data_cfg)["vocab_data"]
+        if get_noun_chunks is None:
+            noun_chunks_cfg = {"get_noun_chunks": config["nlp"]["get_noun_chunks"]}
+            get_noun_chunks = registry.make_from_config(noun_chunks_cfg)["get_noun_chunks"]
        if lex_attr_getters is None:
            lex_attrs_cfg = {"lex_attr_getters": config["nlp"]["lex_attr_getters"]}
            lex_attr_getters = registry.make_from_config(lex_attrs_cfg)["lex_attr_getters"]
@@ -468,10 +470,8 @@ cdef class Vocab:
            vocab_data=vocab_data,
            lemmatizer=lemmatizer,
            writing_system=writing_system,
-            tag_map=tag_map,
+            get_noun_chunks=get_noun_chunks
        )
-        if morph_rules is not None:
-            vocab.morphology.load_morph_exceptions(morph_rules)
        if vocab.vectors.name is None and vectors_name:
            vocab.vectors.name = vectors_name
        return vocab