Mirror of https://github.com/explosion/spaCy.git (synced 2025-07-12 17:22:25 +03:00)

Commit 79f9d3ea2a: Merge branch 'master' into fix/enum-python-types
@@ -17,7 +17,6 @@ requests>=2.13.0,<3.0.0
tqdm>=4.38.0,<5.0.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0
jinja2
langcodes>=3.2.0,<4.0.0
# Official Python utilities
setuptools
packaging>=20.0
@@ -65,7 +65,6 @@ install_requires =
    # Official Python utilities
    setuptools
    packaging>=20.0
    langcodes>=3.2.0,<4.0.0

[options.entry_points]
console_scripts =
spacy/lang/ht/__init__.py (new file, 52 lines)
@@ -0,0 +1,52 @@
from typing import Callable, Optional

from thinc.api import Model

from ...language import BaseDefaults, Language
from .lemmatizer import HaitianCreoleLemmatizer
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP


class HaitianCreoleDefaults(BaseDefaults):
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES
    lex_attr_getters = LEX_ATTRS
    syntax_iterators = SYNTAX_ITERATORS
    stop_words = STOP_WORDS
    tag_map = TAG_MAP


class HaitianCreole(Language):
    lang = "ht"
    Defaults = HaitianCreoleDefaults


@HaitianCreole.factory(
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={
        "model": None,
        "mode": "rule",
        "overwrite": False,
        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
    },
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
    nlp: Language,
    model: Optional[Model],
    name: str,
    mode: str,
    overwrite: bool,
    scorer: Optional[Callable],
):
    return HaitianCreoleLemmatizer(
        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
    )


__all__ = ["HaitianCreole"]
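A small usage sketch (not part of the diff), assuming spaCy is installed from this branch so that "ht" resolves to the HaitianCreole class defined above:

    import spacy

    nlp = spacy.blank("ht")
    doc = nlp("Kote ou ye?")
    print(nlp.lang, [t.text for t in doc])  # ht ['Kote', 'ou', 'ye', '?']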
spacy/lang/ht/examples.py (new file, 18 lines)
@@ -0,0 +1,18 @@
"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.ht.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Apple ap panse achte yon demaraj nan Wayòm Ini pou $1 milya dola",
    "Machin otonòm fè responsablite asirans lan ale sou men fabrikan yo",
    "San Francisco ap konsidere entèdi robo ki livre sou twotwa yo",
    "Lond se yon gwo vil nan Wayòm Ini",
    "Kote ou ye?",
    "Kilès ki prezidan Lafrans?",
    "Ki kapital Etazini?",
    "Kile Barack Obama te fèt?",
]
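A quick sketch (not part of the diff) piping these example sentences through a blank "ht" pipeline, as the module docstring suggests:

    import spacy
    from spacy.lang.ht.examples import sentences

    nlp = spacy.blank("ht")
    for doc in nlp.pipe(sentences[:3]):
        print(doc[0].text, len(doc))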
spacy/lang/ht/lemmatizer.py (new file, 51 lines)
@@ -0,0 +1,51 @@
from typing import List, Tuple

from ...pipeline import Lemmatizer
from ...tokens import Token
from ...lookups import Lookups


class HaitianCreoleLemmatizer(Lemmatizer):
    """
    Minimal Haitian Creole lemmatizer.
    Returns a word's base form based on rules and lookup,
    or defaults to the original form.
    """

    def is_base_form(self, token: Token) -> bool:
        morph = token.morph.to_dict()
        upos = token.pos_.lower()

        # Consider unmarked forms to be base
        if upos in {"noun", "verb", "adj", "adv"}:
            if not morph:
                return True
            if upos == "noun" and morph.get("Number") == "Sing":
                return True
            if upos == "verb" and morph.get("VerbForm") == "Inf":
                return True
            if upos == "adj" and morph.get("Degree") == "Pos":
                return True
        return False

    def rule_lemmatize(self, token: Token) -> List[str]:
        string = token.text.lower()
        pos = token.pos_.lower()
        cache_key = (token.orth, token.pos)
        if cache_key in self.cache:
            return self.cache[cache_key]

        forms = []

        # fallback rule: just return lowercased form
        forms.append(string)

        self.cache[cache_key] = forms
        return forms

    @classmethod
    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
        if mode == "rule":
            required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
            return (required, [])
        return super().get_lookups_config(mode)
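A hedged sketch (not from the diff) of the rule-mode fallback above, instantiating the lemmatizer directly on a manually POS-tagged token; the exact behavior in a full pipeline may differ:

    import spacy
    from spacy.tokens import Doc
    from spacy.lang.ht.lemmatizer import HaitianCreoleLemmatizer

    nlp = spacy.blank("ht")
    doc = Doc(nlp.vocab, words=["Pale"], pos=["VERB"])
    lemmatizer = HaitianCreoleLemmatizer(nlp.vocab, model=None, mode="rule")
    print(lemmatizer.is_base_form(doc[0]))    # True: no morph features are set
    print(lemmatizer.rule_lemmatize(doc[0]))  # ['pale']: lowercased fallback form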
spacy/lang/ht/lex_attrs.py (new file, 78 lines)
@@ -0,0 +1,78 @@
from ...attrs import LIKE_NUM, NORM

# Cardinal numbers in Creole
_num_words = set(
    """
zewo youn en de twa kat senk sis sèt uit nèf dis
onz douz trèz katoz kenz sèz disèt dizwit diznèf
vent trant karant sinkant swasant swasann-dis
san mil milyon milya
""".split()
)

# Ordinal numbers in Creole (some are French-influenced, some simplified)
_ordinal_words = set(
    """
premye dezyèm twazyèm katryèm senkyèm sizyèm sètvyèm uitvyèm nèvyèm dizyèm
onzèm douzyèm trèzyèm katozyèm kenzèm sèzyèm disetyèm dizwityèm diznèvyèm
ventyèm trantyèm karantyèm sinkantyèm swasantyèm
swasann-disyèm santyèm milyèm milyonnyèm milyadyèm
""".split()
)

NORM_MAP = {
    "'m": "mwen",
    "'w": "ou",
    "'l": "li",
    "'n": "nou",
    "'y": "yo",
    "’m": "mwen",
    "’w": "ou",
    "’l": "li",
    "’n": "nou",
    "’y": "yo",
    "m": "mwen",
    "n": "nou",
    "l": "li",
    "y": "yo",
    "w": "ou",
    "t": "te",
    "k": "ki",
    "p": "pa",
    "M": "Mwen",
    "N": "Nou",
    "L": "Li",
    "Y": "Yo",
    "W": "Ou",
    "T": "Te",
    "K": "Ki",
    "P": "Pa",
}


def like_num(text):
    text = text.strip().lower()
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    if text in _num_words:
        return True
    if text in _ordinal_words:
        return True
    # Handle things like "3yèm", "10yèm", "25yèm", etc.
    if text.endswith("yèm") and text[:-3].isdigit():
        return True
    return False


def norm_custom(text):
    return NORM_MAP.get(text, text.lower())


LEX_ATTRS = {
    LIKE_NUM: like_num,
    NORM: norm_custom,
}
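A small illustration (not from the diff) of the two lexical attribute getters defined above:

    from spacy.lang.ht.lex_attrs import like_num, norm_custom

    print(like_num("milya"))     # True  - listed cardinal number word
    print(like_num("3yèm"))      # True  - digits plus ordinal suffix "yèm"
    print(like_num("dog"))       # False
    print(norm_custom("'m"))     # "mwen" - apostrophe form mapped by NORM_MAP
    print(norm_custom("Kreyòl")) # "kreyòl" - unmapped tokens are lowercased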
spacy/lang/ht/punctuation.py (new file, 43 lines)
@@ -0,0 +1,43 @@
from ..char_classes import (
    ALPHA,
    ALPHA_LOWER,
    ALPHA_UPPER,
    CONCAT_QUOTES,
    HYPHENS,
    LIST_PUNCT,
    LIST_QUOTES,
    LIST_ELLIPSES,
    LIST_ICONS,
    merge_chars,
)

ELISION = "'’".replace(" ", "")

_prefixes_elision = "m n l y t k w"
_prefixes_elision += " " + _prefixes_elision.upper()

TOKENIZER_PREFIXES = LIST_PUNCT + LIST_QUOTES + [
    r"(?:({pe})[{el}])(?=[{a}])".format(
        a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
    )
]

TOKENIZER_SUFFIXES = LIST_PUNCT + LIST_QUOTES + LIST_ELLIPSES + [
    r"(?<=[0-9])%",  # numbers like 10%
    r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers
    r"(?<=[{a}])['’]".format(a=ALPHA),  # apostrophes after letters
    r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions
    r"(?<=[{a}0-9])\)",  # right parenthesis after letter/number
    r"(?<=[{a}])\.(?=\s|$)".format(a=ALPHA),  # period after letter if space or end of string
    r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis
]

TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [
    r"(?<=[0-9])[+\-\*^](?=[0-9-])",
    r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
        al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
    ),
    r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
    r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
]
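A rough illustration (not part of the diff, and only a sketch) of how these pattern lists are expected to behave once compiled with spaCy's tokenizer utilities:

    from spacy.lang.ht.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
    from spacy.util import compile_prefix_regex, compile_suffix_regex

    prefix_re = compile_prefix_regex(TOKENIZER_PREFIXES)
    suffix_re = compile_suffix_regex(TOKENIZER_SUFFIXES)

    print(prefix_re.search("m'ap").group())         # "m'" - elided pronoun split off
    print(suffix_re.search("(kounye a).").group())  # "." - punctuation after ")"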
spacy/lang/ht/stop_words.py (new file, 50 lines)
@@ -0,0 +1,50 @@
STOP_WORDS = set(
    """
a ak an ankò ant apre ap atò avan avanlè
byen bò byenke

chak

de depi deja deja

e en epi èske

fò fòk

gen genyen

ki kisa kilès kote koukou konsa konbyen konn konnen kounye kouman

la l laa le lè li lye lò

m m' mwen

nan nap nou n'

ou oumenm

pa paske pami pandan pito pou pral preske pwiske

se selman si sou sòt

ta tap tankou te toujou tou tan tout toutotan twòp tèl

w w' wi wè

y y' yo yon yonn

non o oh eh

sa san si swa si

men mèsi oswa osinon
""".split()
)

# Add common contractions, with and without apostrophe variants
contractions = ["m'", "n'", "w'", "y'", "l'", "t'", "k'"]
for apostrophe in ["'", "’", "‘"]:
    for word in contractions:
        STOP_WORDS.add(word.replace("'", apostrophe))
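A quick check sketch (not part of the diff), assuming the "ht" language above is registered; the stop list is exposed through the usual is_stop flag:

    import spacy

    nlp = spacy.blank("ht")
    doc = nlp("mwen pa konnen")
    print([(t.text, t.is_stop) for t in doc])  # all three words are in STOP_WORDS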
spacy/lang/ht/syntax_iterators.py (new file, 74 lines)
@@ -0,0 +1,74 @@
from typing import Iterator, Tuple, Union

from ...errors import Errors
from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span


def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
    """
    Detect base noun phrases from a dependency parse for Haitian Creole.
    Works on both Doc and Span objects.
    """

    # Core nominal dependencies common in Haitian Creole
    labels = [
        "nsubj",
        "obj",
        "obl",
        "nmod",
        "appos",
        "ROOT",
    ]

    # Modifiers to optionally include in chunk (to the right)
    post_modifiers = ["compound", "flat", "flat:name", "fixed"]

    doc = doclike.doc
    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)

    np_deps = {doc.vocab.strings.add(label) for label in labels}
    np_mods = {doc.vocab.strings.add(mod) for mod in post_modifiers}
    conj_label = doc.vocab.strings.add("conj")
    np_label = doc.vocab.strings.add("NP")
    adp_pos = doc.vocab.strings.add("ADP")
    cc_pos = doc.vocab.strings.add("CCONJ")

    prev_end = -1
    for i, word in enumerate(doclike):
        if word.pos not in (NOUN, PROPN, PRON):
            continue
        if word.left_edge.i <= prev_end:
            continue

        if word.dep in np_deps:
            right_end = word
            # expand to include known modifiers to the right
            for child in word.rights:
                if child.dep in np_mods:
                    right_end = child.right_edge
                elif child.pos == NOUN:
                    right_end = child.right_edge

            left_index = word.left_edge.i
            # Skip prepositions at the start
            if word.left_edge.pos == adp_pos:
                left_index += 1

            prev_end = right_end.i
            yield left_index, right_end.i + 1, np_label

        elif word.dep == conj_label:
            head = word.head
            while head.dep == conj_label and head.head.i < head.i:
                head = head.head
            if head.dep in np_deps:
                left_index = word.left_edge.i
                if word.left_edge.pos == cc_pos:
                    left_index += 1
                prev_end = word.i
                yield left_index, word.i + 1, np_label


SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
spacy/lang/ht/tag_map.py (new file, 21 lines)
@@ -0,0 +1,21 @@
from spacy.symbols import NOUN, VERB, AUX, ADJ, ADV, PRON, DET, ADP, SCONJ, CCONJ, PART, INTJ, NUM, PROPN, PUNCT, SYM, X

TAG_MAP = {
    "NOUN": {"pos": NOUN},
    "VERB": {"pos": VERB},
    "AUX": {"pos": AUX},
    "ADJ": {"pos": ADJ},
    "ADV": {"pos": ADV},
    "PRON": {"pos": PRON},
    "DET": {"pos": DET},
    "ADP": {"pos": ADP},
    "SCONJ": {"pos": SCONJ},
    "CCONJ": {"pos": CCONJ},
    "PART": {"pos": PART},
    "INTJ": {"pos": INTJ},
    "NUM": {"pos": NUM},
    "PROPN": {"pos": PROPN},
    "PUNCT": {"pos": PUNCT},
    "SYM": {"pos": SYM},
    "X": {"pos": X},
}
spacy/lang/ht/tokenizer_exceptions.py (new file, 121 lines)
@@ -0,0 +1,121 @@
from spacy.symbols import ORTH, NORM


def make_variants(base, first_norm, second_orth, second_norm):
    return {
        base: [
            {ORTH: base.split("'")[0] + "'", NORM: first_norm},
            {ORTH: second_orth, NORM: second_norm},
        ],
        base.capitalize(): [
            {ORTH: base.split("'")[0].capitalize() + "'", NORM: first_norm.capitalize()},
            {ORTH: second_orth, NORM: second_norm},
        ]
    }


TOKENIZER_EXCEPTIONS = {
    "Dr.": [{ORTH: "Dr."}]
}

# Apostrophe forms
TOKENIZER_EXCEPTIONS.update(make_variants("m'ap", "mwen", "ap", "ap"))
TOKENIZER_EXCEPTIONS.update(make_variants("n'ap", "nou", "ap", "ap"))
TOKENIZER_EXCEPTIONS.update(make_variants("l'ap", "li", "ap", "ap"))
TOKENIZER_EXCEPTIONS.update(make_variants("y'ap", "yo", "ap", "ap"))
TOKENIZER_EXCEPTIONS.update(make_variants("m'te", "mwen", "te", "te"))
TOKENIZER_EXCEPTIONS.update(make_variants("m'pral", "mwen", "pral", "pral"))
TOKENIZER_EXCEPTIONS.update(make_variants("w'ap", "ou", "ap", "ap"))
TOKENIZER_EXCEPTIONS.update(make_variants("k'ap", "ki", "ap", "ap"))
TOKENIZER_EXCEPTIONS.update(make_variants("p'ap", "pa", "ap", "ap"))
TOKENIZER_EXCEPTIONS.update(make_variants("t'ap", "te", "ap", "ap"))

# Non-apostrophe contractions (with capitalized variants)
TOKENIZER_EXCEPTIONS.update({
    "map": [
        {ORTH: "m", NORM: "mwen"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "Map": [
        {ORTH: "M", NORM: "Mwen"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "lem": [
        {ORTH: "le", NORM: "le"},
        {ORTH: "m", NORM: "mwen"},
    ],
    "Lem": [
        {ORTH: "Le", NORM: "Le"},
        {ORTH: "m", NORM: "mwen"},
    ],
    "lew": [
        {ORTH: "le", NORM: "le"},
        {ORTH: "w", NORM: "ou"},
    ],
    "Lew": [
        {ORTH: "Le", NORM: "Le"},
        {ORTH: "w", NORM: "ou"},
    ],
    "nap": [
        {ORTH: "n", NORM: "nou"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "Nap": [
        {ORTH: "N", NORM: "Nou"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "lap": [
        {ORTH: "l", NORM: "li"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "Lap": [
        {ORTH: "L", NORM: "Li"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "yap": [
        {ORTH: "y", NORM: "yo"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "Yap": [
        {ORTH: "Y", NORM: "Yo"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "mte": [
        {ORTH: "m", NORM: "mwen"},
        {ORTH: "te", NORM: "te"},
    ],
    "Mte": [
        {ORTH: "M", NORM: "Mwen"},
        {ORTH: "te", NORM: "te"},
    ],
    "mpral": [
        {ORTH: "m", NORM: "mwen"},
        {ORTH: "pral", NORM: "pral"},
    ],
    "Mpral": [
        {ORTH: "M", NORM: "Mwen"},
        {ORTH: "pral", NORM: "pral"},
    ],
    "wap": [
        {ORTH: "w", NORM: "ou"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "Wap": [
        {ORTH: "W", NORM: "Ou"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "kap": [
        {ORTH: "k", NORM: "ki"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "Kap": [
        {ORTH: "K", NORM: "Ki"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "tap": [
        {ORTH: "t", NORM: "te"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "Tap": [
        {ORTH: "T", NORM: "Te"},
        {ORTH: "ap", NORM: "ap"},
    ],
})
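An illustration (not from the diff) of what the make_variants helper above produces for one entry:

    from spacy.lang.ht.tokenizer_exceptions import make_variants

    special_cases = make_variants("m'ap", "mwen", "ap", "ap")
    # Two keys, "m'ap" and "M'ap"; each maps to two sub-tokens:
    #   [{ORTH: "m'", NORM: "mwen"}, {ORTH: "ap", NORM: "ap"}]
    #   [{ORTH: "M'", NORM: "Mwen"}, {ORTH: "ap", NORM: "ap"}]
    print(sorted(special_cases.keys()))  # ["M'ap", "m'ap"]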
@@ -141,7 +141,7 @@ class Language:
    Defaults (class): Settings, data and factory methods for creating the `nlp`
        object and processing pipeline.
-   lang (str): IETF language code, such as 'en'.
+   lang (str): Two-letter ISO 639-1 or three-letter ISO 639-3 language codes, such as 'en' and 'eng'.

    DOCS: https://spacy.io/api/language
    """
@@ -212,6 +212,16 @@ def hr_tokenizer():
    return get_lang_class("hr")().tokenizer


@pytest.fixture(scope="session")
def ht_tokenizer():
    return get_lang_class("ht")().tokenizer


@pytest.fixture(scope="session")
def ht_vocab():
    return get_lang_class("ht")().vocab


@pytest.fixture
def hu_tokenizer():
    return get_lang_class("hu")().tokenizer
@@ -49,7 +49,7 @@ def doc_not_parsed(en_tokenizer):
def test_issue1537():
    """Test that Span.as_doc() doesn't segfault."""
    string = "The sky is blue . The man is pink . The dog is purple ."
-    doc = Doc(Vocab(), words=string.split())
+    doc = Doc(Vocab(), words=list(string.split()))
    doc[0].sent_start = True
    for word in doc[1:]:
        if word.nbor(-1).text == ".":
@@ -225,6 +225,21 @@ def test_spans_span_sent(doc, doc_not_parsed):
    assert doc_not_parsed[10:14].sent == doc_not_parsed[5:]


def test_issue13769():
    # Test issue 13769: Incorrect output of span.sents when final token is a sentence outside of the span.
    doc = Doc(
        Vocab(),
        words=list("This is a sentence . This is another sentence . Third".split()),
    )
    doc[0].is_sent_start = True
    doc[5].is_sent_start = True
    doc[10].is_sent_start = True
    doc.ents = [("ENTITY", 7, 9)]  # "another sentence" phrase in the second sentence
    entity = doc.ents[0]
    ent_sents = list(entity.sents)
    assert len(ent_sents) == 1


@pytest.mark.parametrize(
    "start,end,expected_sentence",
    [
spacy/tests/lang/ht/__init__.py (new file, empty)
spacy/tests/lang/ht/test_exceptions.py (new file, 32 lines)
@@ -0,0 +1,32 @@
import pytest


def test_ht_tokenizer_handles_basic_contraction(ht_tokenizer):
    text = "m'ap ri"
    tokens = ht_tokenizer(text)
    assert len(tokens) == 3
    assert tokens[0].text == "m'"
    assert tokens[1].text == "ap"
    assert tokens[2].text == "ri"

    text = "mwen di'w non!"
    tokens = ht_tokenizer(text)
    assert len(tokens) == 5
    assert tokens[0].text == "mwen"
    assert tokens[1].text == "di"
    assert tokens[2].text == "'w"
    assert tokens[3].text == "non"
    assert tokens[4].text == "!"


@pytest.mark.parametrize("text", ["Dr."])
def test_ht_tokenizer_handles_basic_abbreviation(ht_tokenizer, text):
    tokens = ht_tokenizer(text)
    assert len(tokens) == 1
    assert tokens[0].text == text


def test_ht_tokenizer_full_sentence(ht_tokenizer):
    text = "Si'm ka vini, m'ap pale ak li."
    tokens = [t.text for t in ht_tokenizer(text)]
    assert tokens == ["Si", "'m", "ka", "vini", ",", "m'", "ap", "pale", "ak", "li", "."]
spacy/tests/lang/ht/test_noun_chunks.py (new file, 44 lines)
@@ -0,0 +1,44 @@
import pytest
from spacy.tokens import Doc


@pytest.fixture
def doc(ht_vocab):
    words = ["Pitit", "gen", "gwo", "pwoblèm", "ak", "kontwòl"]
    heads = [1, 1, 5, 5, 3, 3]
    deps = ["nsubj", "ROOT", "amod", "obj", "case", "nmod"]
    pos = ["NOUN", "VERB", "ADJ", "NOUN", "ADP", "NOUN"]
    return Doc(ht_vocab, words=words, heads=heads, deps=deps, pos=pos)


def test_noun_chunks_is_parsed(ht_tokenizer):
    """Test that noun_chunks raises ValueError for 'ht' language if Doc is not parsed."""
    doc = ht_tokenizer("Sa a se yon fraz")
    with pytest.raises(ValueError):
        list(doc.noun_chunks)


def test_ht_noun_chunks_not_nested(doc, ht_vocab):
    """Test that each token only appears in one noun chunk at most"""
    word_occurred = {}
    chunks = list(doc.noun_chunks)
    assert len(chunks) > 1
    for chunk in chunks:
        for word in chunk:
            word_occurred.setdefault(word.text, 0)
            word_occurred[word.text] += 1
    assert len(word_occurred) > 0
    for word, freq in word_occurred.items():
        assert freq == 1, (word, [chunk.text for chunk in doc.noun_chunks])


def test_noun_chunks_span(doc, ht_tokenizer):
    """Test that the span.noun_chunks property works correctly"""
    doc_chunks = list(doc.noun_chunks)
    span = doc[0:3]
    span_chunks = list(span.noun_chunks)
    assert 0 < len(span_chunks) < len(doc_chunks)
    for chunk in span_chunks:
        assert chunk in doc_chunks
        assert chunk.start >= 0
        assert chunk.end <= 3
spacy/tests/lang/ht/test_prefix_suffix_infix.py (new file, 130 lines)
@@ -0,0 +1,130 @@
import pytest


@pytest.mark.parametrize("text", ["(ka)"])
def test_ht_tokenizer_splits_no_special(ht_tokenizer, text):
    tokens = ht_tokenizer(text)
    assert len(tokens) == 3


@pytest.mark.parametrize("text", ["m'ap"])
def test_ht_tokenizer_splits_no_punct(ht_tokenizer, text):
    tokens = ht_tokenizer(text)
    assert len(tokens) == 2


@pytest.mark.parametrize("text", ["(m'ap"])
def test_ht_tokenizer_splits_prefix_punct(ht_tokenizer, text):
    tokens = ht_tokenizer(text)
    assert len(tokens) == 3


@pytest.mark.parametrize("text", ["m'ap)"])
def test_ht_tokenizer_splits_suffix_punct(ht_tokenizer, text):
    tokens = ht_tokenizer(text)
    assert len(tokens) == 3


@pytest.mark.parametrize("text", ["(m'ap)"])
def test_ht_tokenizer_splits_even_wrap(ht_tokenizer, text):
    tokens = ht_tokenizer(text)
    assert len(tokens) == 4


@pytest.mark.parametrize("text", ["(m'ap?)"])
def test_ht_tokenizer_splits_uneven_wrap(ht_tokenizer, text):
    tokens = ht_tokenizer(text)
    assert len(tokens) == 5


@pytest.mark.parametrize("text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)])
def test_ht_tokenizer_splits_prefix_interact(ht_tokenizer, text, length):
    tokens = ht_tokenizer(text)
    assert len(tokens) == length


@pytest.mark.parametrize("text", ["Ozetazini.)"])
def test_ht_tokenizer_splits_suffix_interact(ht_tokenizer, text):
    tokens = ht_tokenizer(text)
    assert len(tokens) == 3


@pytest.mark.parametrize("text", ["(Ozetazini.)"])
def test_ht_tokenizer_splits_even_wrap_interact(ht_tokenizer, text):
    tokens = ht_tokenizer(text)
    assert len(tokens) == 4


@pytest.mark.parametrize("text", ["(Ozetazini?)"])
def test_ht_tokenizer_splits_uneven_wrap_interact(ht_tokenizer, text):
    tokens = ht_tokenizer(text)
    assert len(tokens) == 4


@pytest.mark.parametrize("text", ["pi-bon"])
def test_ht_tokenizer_splits_hyphens(ht_tokenizer, text):
    tokens = ht_tokenizer(text)
    assert len(tokens) == 3


@pytest.mark.parametrize("text", ["0.1-13.5", "0.0-0.1", "103.27-300"])
def test_ht_tokenizer_splits_numeric_range(ht_tokenizer, text):
    tokens = ht_tokenizer(text)
    assert len(tokens) == 3


@pytest.mark.parametrize("text", ["pi.Bon", "Bon.Jour"])
def test_ht_tokenizer_splits_period_infix(ht_tokenizer, text):
    tokens = ht_tokenizer(text)
    assert len(tokens) == 3


@pytest.mark.parametrize("text", ["Bonjou,moun", "youn,de"])
def test_ht_tokenizer_splits_comma_infix(ht_tokenizer, text):
    tokens = ht_tokenizer(text)
    assert len(tokens) == 3
    assert tokens[0].text == text.split(",")[0]
    assert tokens[1].text == ","
    assert tokens[2].text == text.split(",")[1]


@pytest.mark.parametrize("text", ["pi...Bon", "pi...bon"])
def test_ht_tokenizer_splits_ellipsis_infix(ht_tokenizer, text):
    tokens = ht_tokenizer(text)
    assert len(tokens) == 3


def test_ht_tokenizer_splits_double_hyphen_infix(ht_tokenizer):
    tokens = ht_tokenizer("Pa vrè--men ou konnen--mwen renmen w.")
    assert tokens[0].text == "Pa"
    assert tokens[1].text == "vrè"
    assert tokens[2].text == "--"
    assert tokens[3].text == "men"
    assert tokens[4].text == "ou"
    assert tokens[5].text == "konnen"
    assert tokens[6].text == "--"
    assert tokens[7].text == "mwen"
    assert tokens[8].text == "renmen"
    assert tokens[9].text == "w"
    assert tokens[10].text == "."


def test_ht_tokenizer_splits_period_abbr(ht_tokenizer):
    text = "Jodi a se Madi.Mr."
    tokens = ht_tokenizer(text)
    assert len(tokens) == 7
    assert tokens[0].text == "Jodi"
    assert tokens[1].text == "a"
    assert tokens[2].text == "se"
    assert tokens[3].text == "Madi"
    assert tokens[4].text == "."
    assert tokens[5].text == "Mr"
    assert tokens[6].text == "."


def test_ht_tokenizer_splits_paren_period(ht_tokenizer):
    tokens = ht_tokenizer("M ap teste sa (pou kounye a).")
    words = [t.text for t in tokens]
    assert "a" in words
    assert ")" in words
    assert "." in words
spacy/tests/lang/ht/test_text.py (new file, 79 lines)
@@ -0,0 +1,79 @@
import pytest

from spacy.lang.ht.lex_attrs import like_num, norm_custom


def test_ht_tokenizer_handles_long_text(ht_tokenizer):
    text = """Onè ap fèt pou ansyen lidè Pati Travayè Britanik

Moun atravè lemond ap voye onè pou ansyen lidè
Pati Travayè a, John Smith, ki mouri pi bonè jodi a apre li te fè yon gwo kriz kadyak a laj 55 an.

Nan Washington, Depatman Deta Etazini pibliye yon deklarasyon ki eksprime "regre lanmò twò bonè" avoka ak palmantè eskoze a.

"Misye Smith, pandan tout karyè li ki te make ak distenksyon"""
    tokens = ht_tokenizer(text)
    assert len(tokens) == 84


@pytest.mark.parametrize(
    "text,length",
    [
        ("Map manje gato a pandan map gade televizyon lem lakay mwen.", 15),
        ("M'ap vini, eske wap la avek lajan'm? Si oui, di'l non pou fre'w.", 22),
        ("M ap teste sa (pou kounye a).", 10),
    ],
)
def test_ht_tokenizer_handles_cnts(ht_tokenizer, text, length):
    tokens = ht_tokenizer(text)
    assert len(tokens) == length


@pytest.mark.parametrize(
    "text,match",
    [
        ("10", True),
        ("1", True),
        ("10,000", True),
        ("10,00", True),
        ("999.0", True),
        ("en", True),
        ("de", True),
        ("milya", True),
        ("dog", False),
        (",", False),
        ("1/2", True),
    ],
)
def test_lex_attrs_like_number(ht_tokenizer, text, match):
    tokens = ht_tokenizer(text)
    assert len(tokens) == 1
    assert tokens[0].like_num == match


@pytest.mark.parametrize(
    "word", ["ventyèm", "Milyonnyèm", "3yèm", "Santyèm", "25yèm", "52yèm"]
)
def test_ht_lex_attrs_like_number_for_ordinal(word):
    assert like_num(word)


@pytest.mark.parametrize("word", ["onz"])
def test_ht_lex_attrs_capitals(word):
    assert like_num(word)
    assert like_num(word.upper())


@pytest.mark.parametrize(
    "word, expected", [
        ("'m", "mwen"),
        ("'n", "nou"),
        ("'l", "li"),
        ("'y", "yo"),
        ("'w", "ou"),
    ]
)
def test_ht_lex_attrs_norm_custom(word, expected):
    assert norm_custom(word) == expected
@@ -656,17 +656,12 @@ def test_spacy_blank():
@pytest.mark.parametrize(
    "lang,target",
    [
        ("en", "en"),
        ("fra", "fr"),
        ("fre", "fr"),
        ("iw", "he"),
        ("mo", "ro"),
        ("scc", "sr"),
        ("mul", "xx"),
        ("no", "nb"),
        ("pt-BR", "pt"),
        ("xx", "xx"),
        ("zh-Hans", "zh"),
        ("zh-Hant", None),
        ("zxx", None),
    ],
)
@@ -686,11 +681,9 @@ def test_language_matching(lang, target):
        ("fre", "fr"),
        ("iw", "he"),
        ("mo", "ro"),
        ("scc", "sr"),
        ("mul", "xx"),
        ("no", "nb"),
        ("pt-BR", "pt"),
        ("xx", "xx"),
        ("zh-Hans", "zh"),
    ],
)
def test_blank_languages(lang, target):
@@ -479,10 +479,11 @@ cdef class Span:
                        break
                elif i == self.doc.length - 1:
                    yield Span(self.doc, start, self.doc.length)
-
-            # Ensure that trailing parts of the Span instance are included in last element of .sents.
-            if start == self.doc.length - 1:
-                yield Span(self.doc, start, self.doc.length)
+            else:
+                # Ensure that trailing parts of the Span instance are included in last element of .sents.
+                # We only want to do this if we didn't break above
+                if start == self.doc.length - 1:
+                    yield Span(self.doc, start, self.doc.length)

    @property
    def ents(self):
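The fix relies on Python's for/else: the else branch of a for loop runs only when the loop was not exited via break, so the trailing part of the span is yielded exactly once. A minimal illustration of that control flow (not spaCy code):

    def tail_demo(values, limit):
        for v in values:
            if v >= limit:
                break
        else:
            print("loop finished without break; handle the trailing part here")

    tail_demo([1, 2, 3], limit=10)  # prints the message
    tail_demo([1, 2, 3], limit=2)   # breaks early; prints nothing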
spacy/util.py (143 changed lines)
@@ -5,7 +5,6 @@ import inspect
import itertools
import logging
import os
import pkgutil
import re
import shlex
import shutil
@@ -40,7 +39,6 @@ from typing import (
)

import catalogue
import langcodes
import numpy
import srsly
import thinc
@@ -89,6 +87,83 @@ LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "grc", "id", "lb", "mk", "pt"
# Default order of sections in the config file. Not all sections needs to exist,
# and additional sections are added at the end, in alphabetical order.
CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"]

LANG_ALIASES = {
    "af": ["afr"],
    "am": ["amh"],
    "ar": ["ara"],
    "az": ["aze"],
    "bg": ["bul"],
    "bn": ["ben"],
    "bo": ["bod", "tib"],
    "ca": ["cat"],
    "cs": ["ces", "cze"],
    "da": ["dan"],
    "de": ["deu", "ger"],
    "el": ["ell", "gre"],
    "en": ["eng"],
    "es": ["spa"],
    "et": ["est"],
    "eu": ["eus", "baq"],
    "fa": ["fas", "per"],
    "fi": ["fin"],
    "fo": ["fao"],
    "fr": ["fra", "fre"],
    "ga": ["gle"],
    "gd": ["gla"],
    "gu": ["guj"],
    "he": ["heb", "iw"],  # "iw" is the obsolete ISO 639-1 code for Hebrew
    "hi": ["hin"],
    "hr": ["hrv", "scr"],  # "scr" is the deprecated ISO 639-2/B for Croatian
    "hu": ["hun"],
    "hy": ["hye"],
    "id": ["ind", "in"],  # "in" is the obsolete ISO 639-1 code for Indonesian
    "is": ["isl", "ice"],
    "it": ["ita"],
    "ja": ["jpn"],
    "kn": ["kan"],
    "ko": ["kor"],
    "ky": ["kir"],
    "la": ["lat"],
    "lb": ["ltz"],
    "lg": ["lug"],
    "lt": ["lit"],
    "lv": ["lav"],
    "mk": ["mkd", "mac"],
    "ml": ["mal"],
    "mr": ["mar"],
    "ms": ["msa", "may"],
    "nb": ["nob"],
    "ne": ["nep"],
    "nl": ["nld", "dut"],
    "nn": ["nno"],
    "pl": ["pol"],
    "pt": ["por"],
    "ro": ["ron", "rom", "mo", "mol"],  # "mo" and "mol" are deprecated codes for Moldavian
    "ru": ["rus"],
    "sa": ["san"],
    "si": ["sin"],
    "sk": ["slk", "slo"],
    "sl": ["slv"],
    "sq": ["sqi", "alb"],
    "sr": ["srp", "scc"],  # "scc" is the deprecated ISO 639-2/B code for Serbian
    "sv": ["swe"],
    "ta": ["tam"],
    "te": ["tel"],
    "th": ["tha"],
    "ti": ["tir"],
    "tl": ["tgl"],
    "tn": ["tsn"],
    "tr": ["tur"],
    "tt": ["tat"],
    "uk": ["ukr"],
    "ur": ["urd"],
    "vi": ["viw"],
    "yo": ["yor"],
    "zh": ["zho", "chi"],

    "xx": ["mul"],
}
# fmt: on

logger = logging.getLogger("spacy")
@@ -305,63 +380,39 @@ def lang_class_is_loaded(lang: str) -> bool:
def find_matching_language(lang: str) -> Optional[str]:
    """
    Given an IETF language code, find a supported spaCy language that is a
    close match for it (according to Unicode CLDR language-matching rules).
    This allows for language aliases, ISO 639-2 codes, more detailed language
    tags, and close matches.
    Given a two-letter ISO 639-1 or three-letter ISO 639-3 language code,
    find a supported spaCy language.

    Returns the language code if a matching language is available, or None
    if there is no matching language.

    >>> find_matching_language('en')
    'en'
    >>> find_matching_language('pt-BR')  # Brazilian Portuguese
    'pt'
    >>> find_matching_language('fra')  # an ISO 639-2 code for French
    >>> find_matching_language('fra')  # ISO 639-3 code for French
    'fr'
    >>> find_matching_language('iw')  # obsolete alias for Hebrew
    >>> find_matching_language('fre')  # ISO 639-2/B code for French
    'fr'
    >>> find_matching_language('iw')  # Obsolete ISO 639-1 code for Hebrew
    'he'
    >>> find_matching_language('no')  # Norwegian
    'nb'
    >>> find_matching_language('mo')  # old code for ro-MD
    >>> find_matching_language('mo')  # Deprecated code for Moldavian
    'ro'
    >>> find_matching_language('zh-Hans')  # Simplified Chinese
    'zh'
    >>> find_matching_language('scc')  # Deprecated ISO 639-2/B code for Serbian
    'sr'
    >>> find_matching_language('zxx')
    None
    """
    import spacy.lang  # noqa: F401

    if lang == "xx":
        return "xx"
    # Check aliases
    for lang_code, aliases in LANG_ALIASES.items():
        if lang in aliases:
            return lang_code

    # Find out which language modules we have
    possible_languages = []
    for modinfo in pkgutil.iter_modules(spacy.lang.__path__):  # type: ignore[attr-defined]
        code = modinfo.name
        if code == "xx":
            # Temporarily make 'xx' into a valid language code
            possible_languages.append("mul")
        elif langcodes.tag_is_valid(code):
            possible_languages.append(code)

    # Distances from 1-9 allow near misses like Bosnian -> Croatian and
    # Norwegian -> Norwegian Bokmål. A distance of 10 would include several
    # more possibilities, like variants of Chinese like 'wuu', but text that
    # is labeled that way is probably trying to be distinct from 'zh' and
    # shouldn't automatically match.
    match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9)
    if match == "mul":
        # Convert 'mul' back to spaCy's 'xx'
        return "xx"
    else:
        return match
    return None


def get_lang_class(lang: str) -> Type["Language"]:
    """Import and load a Language class.

-   lang (str): IETF language code, such as 'en'.
+   lang (str): Two-letter ISO 639-1 or three-letter ISO 639-3 language code, such as 'en' and 'eng'.
    RETURNS (Language): Language class.
    """
    # Check if language is registered / entry point is available
@@ -372,13 +423,9 @@ def get_lang_class(lang: str) -> Type["Language"]:
    try:
        module = importlib.import_module(f".lang.{lang}", "spacy")
    except ImportError as err:
-        # Find a matching language. For example, if the language 'no' is
-        # requested, we can use language-matching to load `spacy.lang.nb`.
-        try:
-            match = find_matching_language(lang)
-        except langcodes.tag_parser.LanguageTagError:
-            # proceed to raising an import error
-            match = None
+        # Find a matching language. For example, if the language 'eng' is
+        # requested, we can use language-matching to load `spacy.lang.en`.
+        match = find_matching_language(lang)

        if match:
            lang = match
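A usage sketch (not part of the diff) of the alias-based resolution, assuming spaCy is installed from this branch:

    import spacy
    from spacy.util import find_matching_language

    print(find_matching_language("fra"))  # "fr"
    print(find_matching_language("zxx"))  # None
    nlp = spacy.blank("eng")              # get_lang_class falls back to spacy.lang.en
    print(nlp.lang)                       # "en"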
@@ -230,7 +230,7 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr
| Name               | Description |
| ------------------ | ----------- |
- | `lang`            | Pipeline language [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as `en`. ~~str (positional)~~ |
+ | `lang`            | Pipeline language. Two-letter [ISO 639-1 code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3 code](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes), such as `en` and `eng`. ~~str (positional)~~ |
| `vectors_loc`      | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ |
| `output_dir`       | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
| `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
@@ -1078,7 +1078,7 @@ details.
| Name             | Description |
| ---------------- | ----------- |
| `Defaults`       | Settings, data and factory methods for creating the `nlp` object and processing pipeline. ~~Defaults~~ |
- | `lang`           | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en' for English. ~~str~~ |
+ | `lang`           | Two-letter [ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes) language codes, such as 'en' and 'eng' for English. ~~str~~ |
| `default_config` | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](%%GITHUB_SPACY/spacy/default_config.cfg). ~~Config~~ |

## Defaults {id="defaults"}
@@ -561,7 +561,7 @@ overlaps with will be returned.
| `orth_`  | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. ~~str~~ |
| `label`  | The hash value of the span's label. ~~int~~ |
| `label_` | The span's label. ~~str~~ |
- | `lemma_` | The span's lemma. Equivalent to `"".join(token.text_with_ws for token in span)`. ~~str~~ |
+ | `lemma_` | The span's lemma. Equivalent to `"".join(token.lemma_ + token.whitespace_ for token in span).strip()`. ~~str~~ |
| `kb_id`  | The hash value of the knowledge base ID referred to by the span. ~~int~~ |
| `kb_id_` | The knowledge base ID referred to by the span. ~~str~~ |
| `ent_id` | The hash value of the named entity the root token is an instance of. ~~int~~ |
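A quick sketch (not part of the diff) checking the corrected equivalence, with lemmas set manually on a small Doc:

    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    doc = Doc(Vocab(), words=["dogs", "ran"], lemmas=["dog", "run"])
    span = doc[0:2]
    assert span.lemma_ == "".join(t.lemma_ + t.whitespace_ for t in span).strip()
    print(span.lemma_)  # "dog run"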
@@ -86,7 +86,7 @@ Create a blank pipeline of a given language class. This function is the twin of
| Name                                | Description |
| ----------------------------------- | ----------- |
- | `name`                             | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en', of the language class to load. ~~str~~ |
+ | `name`                             | Two-letter [ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes) language codes, such as 'en' and 'eng', of the language class to load. ~~str~~ |
| _keyword-only_                      | |
| `vocab`                             | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
| `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
@@ -51,7 +51,7 @@ modified later.
| `strings`                           | The string store. A new string store is created if one is not provided. Defaults to `None`. ~~Optional[StringStore]~~ |
| `shape`                             | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. ~~Tuple[int, int]~~ |
| `data`                              | The vector data. ~~numpy.ndarray[ndim=2, dtype=float32]~~ |
- | `keys`                             | A sequence of keys aligned with the data. ~~Iterable[Union[str, int]]~~ |
+ | `keys`                             | An iterable of keys aligned with the data. ~~Iterable[Union[str, int]]~~ |
| `name`                              | A name to identify the vectors table. ~~str~~ |
| `mode` <Tag variant="new">3.2</Tag> | Vectors mode: `"default"` or [`"floret"`](https://github.com/explosion/floret) (default: `"default"`). ~~str~~ |
| `minn` <Tag variant="new">3.2</Tag> | The floret char ngram minn (default: `0`). ~~int~~ |
@@ -283,7 +283,7 @@ Serialize the current state to a binary string.
| -------------- | ----------- |
| _keyword-only_ | |
| `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
- | **RETURNS**    | The serialized form of the `Vocab` object. ~~Vocab~~ |
+ | **RETURNS**    | The serialized form of the `Vocab` object. ~~bytes~~ |

## Vocab.from_bytes {id="from_bytes",tag="method"}