Reformat with black 25

This commit is contained in:
Matthew Honnibal 2025-11-04 15:19:09 +01:00
parent 68679d6f85
commit 54f54fc4cc
93 changed files with 277 additions and 285 deletions
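Almost every hunk below is one of a few mechanical changes in Black's 2025 stable style: stub bodies consisting only of `...` are hugged onto the `def` line, conditional expressions that have to wrap are enclosed in their own parentheses, redundant parentheses around `for`-loop tuple targets are dropped, and a lone wrapped argument gets a trailing comma. A rough sketch of the two most common patterns follows; the names `to_int` and `label_for` are invented for illustration, and the output is approximate rather than byte-exact Black output.

from typing import Literal, Union, overload


# Black < 25 kept each stub body on its own line:
#     def to_int(value: str, strict: Literal[True]) -> int:
#         ...
# Black 25 collapses the ellipsis onto the signature, as in the overload hunks below.
@overload
def to_int(value: str, strict: Literal[False] = ...) -> Union[int, str]: ...
@overload
def to_int(value: str, strict: Literal[True]) -> int: ...
def to_int(value: str, strict: bool = False) -> Union[int, str]:
    try:
        return int(value)
    except ValueError:
        if strict:
            raise
        return value


def label_for(name: str, count: int, show_counts: bool) -> str:
    # Black < 25 let a wrapped conditional expression hang off the assignment;
    # Black 25 wraps the whole expression in its own parentheses instead.
    description = (
        f"{name} (appears {count} times in the training corpus)"
        if show_counts
        else f"{name} (counts suppressed)"
    )
    return description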

View File

@@ -225,13 +225,11 @@ def get_git_version(
 @overload
-def string_to_list(value: str, intify: Literal[False] = ...) -> List[str]:
-    ...
+def string_to_list(value: str, intify: Literal[False] = ...) -> List[str]: ...
 @overload
-def string_to_list(value: str, intify: Literal[True]) -> List[int]:
-    ...
+def string_to_list(value: str, intify: Literal[True]) -> List[int]: ...
 def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]:

View File

@@ -968,16 +968,14 @@ def _compile_gold(
 @overload
-def _format_labels(labels: Iterable[str], counts: Literal[False] = False) -> str:
-    ...
+def _format_labels(labels: Iterable[str], counts: Literal[False] = False) -> str: ...
 @overload
 def _format_labels(
     labels: Iterable[Tuple[str, int]],
     counts: Literal[True],
-) -> str:
-    ...
+) -> str: ...
 def _format_labels(

View File

@@ -157,9 +157,11 @@ def find_threshold(
                 exits=1,
             )
         return {
-            keys[0]: filter_config(config[keys[0]], keys[1:], full_key)
-            if len(keys) > 1
-            else config[keys[0]]
+            keys[0]: (
+                filter_config(config[keys[0]], keys[1:], full_key)
+                if len(keys) > 1
+                else config[keys[0]]
+            )
         }

     # Evaluate with varying threshold values.
@@ -216,12 +218,14 @@ def find_threshold(
     if len(set(scores.values())) == 1:
         wasabi.msg.warn(
             title="All scores are identical. Verify that all settings are correct.",
-            text=""
-            if (
-                not isinstance(pipe, MultiLabel_TextCategorizer)
-                or scores_key in ("cats_macro_f", "cats_micro_f")
-            )
-            else "Use `cats_macro_f` or `cats_micro_f` when optimizing the threshold for `textcat_multilabel`.",
+            text=(
+                ""
+                if (
+                    not isinstance(pipe, MultiLabel_TextCategorizer)
+                    or scores_key in ("cats_macro_f", "cats_micro_f")
+                )
+                else "Use `cats_macro_f` or `cats_micro_f` when optimizing the threshold for `textcat_multilabel`."
+            ),
         )
     else:

View File

@@ -195,9 +195,11 @@ def init_config(
         "Pipeline": ", ".join(pipeline),
         "Optimize for": optimize,
         "Hardware": variables["hardware"].upper(),
-        "Transformer": template_vars.transformer.get("name")  # type: ignore[attr-defined]
-        if template_vars.use_transformer  # type: ignore[attr-defined]
-        else None,
+        "Transformer": (
+            template_vars.transformer.get("name")  # type: ignore[attr-defined]
+            if template_vars.use_transformer  # type: ignore[attr-defined]
+            else None
+        ),
     }
     msg.info("Generated config template specific for your use case")
     for label, value in use_case.items():

View File

@@ -1,4 +1,5 @@
 """Helpers for Python and platform compatibility."""
+
 import sys
 from thinc.util import copy_array

View File

@@ -4,6 +4,7 @@ spaCy's built in visualization suite for dependencies and named entities.
 DOCS: https://spacy.io/api/top-level#displacy
 USAGE: https://spacy.io/usage/visualizers
 """
+
 import warnings
 from typing import Any, Callable, Dict, Iterable, Optional, Union

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"አፕል የዩኬን ጅምር ድርጅት በ 1 ቢሊዮን ዶላር ለመግዛት አስቧል።", "አፕል የዩኬን ጅምር ድርጅት በ 1 ቢሊዮን ዶላር ለመግዛት አስቧል።",
"የራስ ገዝ መኪኖች የኢንሹራንስ ኃላፊነትን ወደ አምራቾች ያዛውራሉ", "የራስ ገዝ መኪኖች የኢንሹራንስ ኃላፊነትን ወደ አምራቾች ያዛውራሉ",

View File

@ -4,7 +4,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"Bu bir cümlədir.", "Bu bir cümlədir.",
"Necəsən?", "Necəsən?",

View File

@@ -3,6 +3,7 @@ References:
 https://github.com/Alir3z4/stop-words - Original list, serves as a base.
 https://postvai.com/books/stop-dumi.pdf - Additions to the original list in order to improve it.
 """
+
 STOP_WORDS = set(
     """
 а автентичен аз ако ала

View File

@ -5,5 +5,4 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = ["তুই খুব ভালো", "আজ আমরা ডাক্তার দেখতে যাবো", "আমি জানি না "] sentences = ["তুই খুব ভালো", "আজ আমরা ডাক্তার দেখতে যাবো", "আমি জানি না "]

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"དོན་དུ་རྒྱ་མཚོ་བླ་མ་ཞེས་བྱ་ཞིང༌།", "དོན་དུ་རྒྱ་མཚོ་བླ་མ་ཞེས་བྱ་ཞིང༌།",
"ཏཱ་ལའི་ཞེས་པ་ནི་སོག་སྐད་ཡིན་པ་དེ་བོད་སྐད་དུ་རྒྱ་མཚོའི་དོན་དུ་འཇུག", "ཏཱ་ལའི་ཞེས་པ་ནི་སོག་སྐད་ཡིན་པ་དེ་བོད་སྐད་དུ་རྒྱ་མཚོའི་དོན་དུ་འཇུག",

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"Apple està buscant comprar una startup del Regne Unit per mil milions de dòlars", "Apple està buscant comprar una startup del Regne Unit per mil milions de dòlars",
"Els cotxes autònoms deleguen la responsabilitat de l'assegurança als seus fabricants", "Els cotxes autònoms deleguen la responsabilitat de l'assegurança als seus fabricants",

View File

@@ -277,10 +277,10 @@ _currency = (
 # These expressions contain various unicode variations, including characters
 # used in Chinese (see #1333, #1340, #1351) unless there are cross-language
 # conflicts, spaCy's base tokenizer should handle all of those by default
-_punct = (
-    r"… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 · । ، ۔ ؛ ٪"
-)
-_quotes = r'\' " ” “ ` ´ , „ » « 「 」 『 』 【 】 《 》 〈 〉 〈 〉 ⟦ ⟧'
+_punct = r"… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 · । ، ۔ ؛ ٪"
+_quotes = (
+    r'\' " ” “ ` ´ , „ » « 「 」 『 』 【 】 《 》 〈 〉 〈 〉 ⟦ ⟧'
+)
 _hyphens = "- — -- --- —— ~"

 # Various symbols like dingbats, but also emoji

View File

@ -4,7 +4,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"Máma mele maso.", "Máma mele maso.",
"Příliš žluťoučký kůň úpěl ďábelské ódy.", "Příliš žluťoučký kůň úpěl ďábelské ódy.",

View File

@@ -2,6 +2,7 @@
 Tokenizer Exceptions.
 Source: https://forkortelse.dk/ and various others.
 """
+
 from ...symbols import NORM, ORTH
 from ...util import update_exc
 from ..tokenizer_exceptions import BASE_EXCEPTIONS

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen", "Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen",
"Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz", "Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz",

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"Z tym stwori so wuměnjenje a zakład za dalše wobdźěłanje přez analyzu tekstoweje struktury a semantisku anotaciju a z tym tež za tu předstajenu digitalnu online-wersiju.", "Z tym stwori so wuměnjenje a zakład za dalše wobdźěłanje přez analyzu tekstoweje struktury a semantisku anotaciju a z tym tež za tu předstajenu digitalnu online-wersiju.",
"Mi so tu jara derje spodoba.", "Mi so tu jara derje spodoba.",

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"Apple is looking at buying U.K. startup for $1 billion", "Apple is looking at buying U.K. startup for $1 billion",
"Autonomous cars shift insurance liability toward manufacturers", "Autonomous cars shift insurance liability toward manufacturers",

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"Apple está buscando comprar una startup del Reino Unido por mil millones de dólares.", "Apple está buscando comprar una startup del Reino Unido por mil millones de dólares.",
"Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes.", "Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes.",

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"این یک جمله نمونه می باشد.", "این یک جمله نمونه می باشد.",
"قرار ما، امروز ساعت ۲:۳۰ بعدازظهر هست!", "قرار ما، امروز ساعت ۲:۳۰ بعدازظهر هست!",

View File

@@ -100,9 +100,9 @@ conj_contraction_negations = [
     ("eivat", "eivät"),
     ("eivät", "eivät"),
 ]

-for (base_lower, base_norm) in conj_contraction_bases:
+for base_lower, base_norm in conj_contraction_bases:
     for base in [base_lower, base_lower.title()]:
-        for (suffix, suffix_norm) in conj_contraction_negations:
+        for suffix, suffix_norm in conj_contraction_negations:
             _exc[base + suffix] = [
                 {ORTH: base, NORM: base_norm},
                 {ORTH: suffix, NORM: suffix_norm},

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"Apple cherche à acheter une start-up anglaise pour 1 milliard de dollars", "Apple cherche à acheter une start-up anglaise pour 1 milliard de dollars",
"Les voitures autonomes déplacent la responsabilité de l'assurance vers les constructeurs", "Les voitures autonomes déplacent la responsabilité de l'assurance vers les constructeurs",

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"ἐρᾷ μὲν ἁγνὸς οὐρανὸς τρῶσαι χθόνα, ἔρως δὲ γαῖαν λαμβάνει γάμου τυχεῖν·", "ἐρᾷ μὲν ἁγνὸς οὐρανὸς τρῶσαι χθόνα, ἔρως δὲ γαῖαν λαμβάνει γάμου τυχεῖν·",
"εὐδαίμων Χαρίτων καὶ Μελάνιππος ἔφυ, θείας ἁγητῆρες ἐφαμερίοις φιλότατος.", "εὐδαίμων Χαρίτων καὶ Μελάνιππος ἔφυ, θείας ἁγητῆρες ἐφαμερίοις φιλότατος.",

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"લોકશાહી એ સરકારનું એક એવું તંત્ર છે જ્યાં નાગરિકો મત દ્વારા સત્તાનો ઉપયોગ કરે છે.", "લોકશાહી એ સરકારનું એક એવું તંત્ર છે જ્યાં નાગરિકો મત દ્વારા સત્તાનો ઉપયોગ કરે છે.",
"તે ગુજરાત રાજ્યના ધરમપુર શહેરમાં આવેલું હતું", "તે ગુજરાત રાજ્યના ધરમપુર શહેરમાં આવેલું હતું",

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"סין מקימה קרן של 440 מיליון דולר להשקעה בהייטק בישראל", "סין מקימה קרן של 440 מיליון דולר להשקעה בהייטק בישראל",
'רה"מ הודיע כי יחרים טקס בחסותו', 'רה"מ הודיע כי יחרים טקס בחסותו',

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"एप्पल 1 अरब डॉलर के लिए यू.के. स्टार्टअप खरीदने पर विचार कर रहा है।", "एप्पल 1 अरब डॉलर के लिए यू.के. स्टार्टअप खरीदने पर विचार कर रहा है।",
"स्वायत्त कारें निर्माताओं की ओर बीमा दायित्व रखतीं हैं।", "स्वायत्त कारें निर्माताओं की ओर बीमा दायित्व रखतीं हैं।",

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"To běšo wjelgin raźone a jo se wót luźi derje pśiwzeło. Tak som dožywiła wjelgin", "To běšo wjelgin raźone a jo se wót luźi derje pśiwzeło. Tak som dožywiła wjelgin",
"Jogo pśewóźowarce stej groniłej, až how w serbskich stronach njama Santa Claus nic pytaś.", "Jogo pśewóźowarce stej groniłej, až how w serbskich stronach njama Santa Claus nic pytaś.",

View File

@@ -22,10 +22,12 @@ class HaitianCreoleDefaults(BaseDefaults):
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
+
 class HaitianCreole(Language):
     lang = "ht"
     Defaults = HaitianCreoleDefaults
+
 @HaitianCreole.factory(
     "lemmatizer",
     assigns=["token.lemma"],
@@ -49,4 +51,5 @@ def make_lemmatizer(
         nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
     )
+
 __all__ = ["HaitianCreole"]

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"Apple ap panse achte yon demaraj nan Wayòm Ini pou $1 milya dola", "Apple ap panse achte yon demaraj nan Wayòm Ini pou $1 milya dola",
"Machin otonòm fè responsablite asirans lan ale sou men fabrikan yo", "Machin otonòm fè responsablite asirans lan ale sou men fabrikan yo",

View File

@@ -49,6 +49,7 @@ NORM_MAP = {
     "P": "Pa",
 }
+
 def like_num(text):
     text = text.strip().lower()
     if text.startswith(("+", "-", "±", "~")):
@@ -69,9 +70,11 @@ def like_num(text):
             return True
     return False
+
 def norm_custom(text):
     return NORM_MAP.get(text, text.lower())
+
 LEX_ATTRS = {
     LIKE_NUM: like_num,
     NORM: norm_custom,

View File

@@ -16,28 +16,43 @@ ELISION = "'".replace(" ", "")
 _prefixes_elision = "m n l y t k w"
 _prefixes_elision += " " + _prefixes_elision.upper()

-TOKENIZER_PREFIXES = LIST_PUNCT + LIST_QUOTES + [
-    r"(?:({pe})[{el}])(?=[{a}])".format(
-        a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
-    )
-]
+TOKENIZER_PREFIXES = (
+    LIST_PUNCT
+    + LIST_QUOTES
+    + [
+        r"(?:({pe})[{el}])(?=[{a}])".format(
+            a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
+        )
+    ]
+)

-TOKENIZER_SUFFIXES = LIST_PUNCT + LIST_QUOTES + LIST_ELLIPSES + [
-    r"(?<=[0-9])%",  # numbers like 10%
-    r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers
-    r"(?<=[{a}])[']".format(a=ALPHA),  # apostrophes after letters
-    r"(?<=[{a}])['][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions
-    r"(?<=[{a}0-9])\)",  # right parenthesis after letter/number
-    r"(?<=[{a}])\.(?=\s|$)".format(a=ALPHA),  # period after letter if space or end of string
-    r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis
-]
+TOKENIZER_SUFFIXES = (
+    LIST_PUNCT
+    + LIST_QUOTES
+    + LIST_ELLIPSES
+    + [
+        r"(?<=[0-9])%",  # numbers like 10%
+        r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers
+        r"(?<=[{a}])[']".format(a=ALPHA),  # apostrophes after letters
+        r"(?<=[{a}])['][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions
+        r"(?<=[{a}0-9])\)",  # right parenthesis after letter/number
+        r"(?<=[{a}])\.(?=\s|$)".format(
+            a=ALPHA
+        ),  # period after letter if space or end of string
+        r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis
+    ]
+)

-TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [
-    r"(?<=[0-9])[+\-\*^](?=[0-9-])",
-    r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
-        al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
-    ),
-    r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
-    r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
-    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
-]
+TOKENIZER_INFIXES = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+        ),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
+        r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
+    ]
+)

View File

@@ -39,8 +39,7 @@ sa san si swa si
 men mèsi oswa osinon

-"""
-    .split()
+""".split()
 )

 # Add common contractions, with and without apostrophe variants

View File

@@ -1,4 +1,22 @@
-from spacy.symbols import NOUN, VERB, AUX, ADJ, ADV, PRON, DET, ADP, SCONJ, CCONJ, PART, INTJ, NUM, PROPN, PUNCT, SYM, X
+from spacy.symbols import (
+    NOUN,
+    VERB,
+    AUX,
+    ADJ,
+    ADV,
+    PRON,
+    DET,
+    ADP,
+    SCONJ,
+    CCONJ,
+    PART,
+    INTJ,
+    NUM,
+    PROPN,
+    PUNCT,
+    SYM,
+    X,
+)

 TAG_MAP = {
     "NOUN": {"pos": NOUN},

View File

@@ -1,5 +1,6 @@
 from spacy.symbols import ORTH, NORM

+
 def make_variants(base, first_norm, second_orth, second_norm):
     return {
         base: [
@@ -7,14 +8,16 @@ def make_variants(base, first_norm, second_orth, second_norm):
             {ORTH: second_orth, NORM: second_norm},
         ],
         base.capitalize(): [
-            {ORTH: base.split("'")[0].capitalize() + "'", NORM: first_norm.capitalize()},
+            {
+                ORTH: base.split("'")[0].capitalize() + "'",
+                NORM: first_norm.capitalize(),
+            },
             {ORTH: second_orth, NORM: second_norm},
-        ]
+        ],
     }

-TOKENIZER_EXCEPTIONS = {
-    "Dr.": [{ORTH: "Dr."}]
-}
+
+TOKENIZER_EXCEPTIONS = {"Dr.": [{ORTH: "Dr."}]}

 # Apostrophe forms
 TOKENIZER_EXCEPTIONS.update(make_variants("m'ap", "mwen", "ap", "ap"))
@@ -29,93 +32,95 @@ TOKENIZER_EXCEPTIONS.update(make_variants("p'ap", "pa", "ap", "ap"))
 TOKENIZER_EXCEPTIONS.update(make_variants("t'ap", "te", "ap", "ap"))

 # Non-apostrophe contractions (with capitalized variants)
-TOKENIZER_EXCEPTIONS.update({
+TOKENIZER_EXCEPTIONS.update(
+    {
         "map": [
             {ORTH: "m", NORM: "mwen"},
             {ORTH: "ap", NORM: "ap"},
         ],
         "Map": [
             {ORTH: "M", NORM: "Mwen"},
             {ORTH: "ap", NORM: "ap"},
         ],
         "lem": [
             {ORTH: "le", NORM: "le"},
             {ORTH: "m", NORM: "mwen"},
         ],
         "Lem": [
             {ORTH: "Le", NORM: "Le"},
             {ORTH: "m", NORM: "mwen"},
         ],
         "lew": [
             {ORTH: "le", NORM: "le"},
             {ORTH: "w", NORM: "ou"},
         ],
         "Lew": [
             {ORTH: "Le", NORM: "Le"},
             {ORTH: "w", NORM: "ou"},
         ],
         "nap": [
             {ORTH: "n", NORM: "nou"},
             {ORTH: "ap", NORM: "ap"},
         ],
         "Nap": [
             {ORTH: "N", NORM: "Nou"},
             {ORTH: "ap", NORM: "ap"},
         ],
         "lap": [
             {ORTH: "l", NORM: "li"},
             {ORTH: "ap", NORM: "ap"},
         ],
         "Lap": [
             {ORTH: "L", NORM: "Li"},
             {ORTH: "ap", NORM: "ap"},
         ],
         "yap": [
             {ORTH: "y", NORM: "yo"},
             {ORTH: "ap", NORM: "ap"},
         ],
         "Yap": [
             {ORTH: "Y", NORM: "Yo"},
             {ORTH: "ap", NORM: "ap"},
         ],
         "mte": [
             {ORTH: "m", NORM: "mwen"},
             {ORTH: "te", NORM: "te"},
         ],
         "Mte": [
             {ORTH: "M", NORM: "Mwen"},
             {ORTH: "te", NORM: "te"},
         ],
         "mpral": [
             {ORTH: "m", NORM: "mwen"},
             {ORTH: "pral", NORM: "pral"},
         ],
         "Mpral": [
             {ORTH: "M", NORM: "Mwen"},
             {ORTH: "pral", NORM: "pral"},
         ],
         "wap": [
             {ORTH: "w", NORM: "ou"},
             {ORTH: "ap", NORM: "ap"},
         ],
         "Wap": [
             {ORTH: "W", NORM: "Ou"},
             {ORTH: "ap", NORM: "ap"},
         ],
         "kap": [
             {ORTH: "k", NORM: "ki"},
             {ORTH: "ap", NORM: "ap"},
         ],
         "Kap": [
             {ORTH: "K", NORM: "Ki"},
             {ORTH: "ap", NORM: "ap"},
         ],
         "tap": [
             {ORTH: "t", NORM: "te"},
             {ORTH: "ap", NORM: "ap"},
         ],
         "Tap": [
             {ORTH: "T", NORM: "Te"},
             {ORTH: "ap", NORM: "ap"},
         ],
-})
+    }
+)

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"Az Apple egy brit startup vásárlását tervezi 1 milliárd dollár értékben.", "Az Apple egy brit startup vásárlását tervezi 1 milliárd dollár értékben.",
"San Francisco vezetése mérlegeli a járdát használó szállító robotok betiltását.", "San Francisco vezetése mérlegeli a járdát használó szállító robotok betiltását.",

View File

@@ -11,7 +11,7 @@ from ..char_classes import (
 )

 # removing ° from the special icons to keep e.g. 99° as one token
-_concat_icons = CONCAT_ICONS.replace("\u00B0", "")
+_concat_icons = CONCAT_ICONS.replace("\u00b0", "")

 _currency = r"\$¢£€¥฿"
 _quotes = CONCAT_QUOTES.replace("'", "")

View File

@ -4,7 +4,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"Լոնդոնը Միացյալ Թագավորության մեծ քաղաք է։", "Լոնդոնը Միացյալ Թագավորության մեծ քաղաք է։",
"Ո՞վ է Ֆրանսիայի նախագահը։", "Ո՞վ է Ֆրանսիայի նախագահը։",

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"Indonesia merupakan negara kepulauan yang kaya akan budaya.", "Indonesia merupakan negara kepulauan yang kaya akan budaya.",
"Berapa banyak warga yang dibutuhkan saat kerja bakti?", "Berapa banyak warga yang dibutuhkan saat kerja bakti?",

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"Apple vuole comprare una startup del Regno Unito per un miliardo di dollari", "Apple vuole comprare una startup del Regno Unito per un miliardo di dollari",
"Le automobili a guida autonoma spostano la responsabilità assicurativa verso i produttori", "Le automobili a guida autonoma spostano la responsabilità assicurativa verso i produttori",

View File

@@ -102,9 +102,9 @@ class JapaneseTokenizer(DummyTokenizer):
                 token.dictionary_form(),  # lemma
                 token.normalized_form(),
                 token.reading_form(),
-                sub_tokens_list[idx]
-                if sub_tokens_list
-                else None,  # user_data['sub_tokens']
+                (
+                    sub_tokens_list[idx] if sub_tokens_list else None
+                ),  # user_data['sub_tokens']
             )
             for idx, token in enumerate(sudachipy_tokens)
             if len(token.surface()) > 0

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"アップルがイギリスの新興企業を10億ドルで購入を検討", "アップルがイギリスの新興企業を10億ドルで購入を検討",
"自動運転車の損害賠償責任、自動車メーカーに一定の負担を求める", "自動運転車の損害賠償責任、自動車メーカーに一定の負担を求める",

View File

@@ -25,7 +25,9 @@ TAG_MAP = {
     # Universal Dependencies Mapping: (Some of the entries in this mapping are updated to v2.6 in the list below)
     # http://universaldependencies.org/ja/overview/morphology.html
     # http://universaldependencies.org/ja/pos/all.html
-    "記号-一般": {POS: NOUN},  # this includes characters used to represent sounds like ドレミ
+    "記号-一般": {
+        POS: NOUN
+    },  # this includes characters used to represent sounds like ドレミ
     "記号-文字": {
         POS: NOUN
     },  # this is for Greek and Latin characters having some meanings, or used as symbols, as in math
@@ -72,7 +74,9 @@ TAG_MAP = {
     "名詞-固有名詞-地名-国": {POS: PROPN},  # country name
     "名詞-助動詞語幹": {POS: AUX},
     "名詞-数詞": {POS: NUM},  # includes Chinese numerals
-    "名詞-普通名詞-サ変可能": {POS: NOUN},  # XXX: sometimes VERB in UDv2; suru-verb noun
+    "名詞-普通名詞-サ変可能": {
+        POS: NOUN
+    },  # XXX: sometimes VERB in UDv2; suru-verb noun
     "名詞-普通名詞-サ変形状詞可能": {POS: NOUN},
     "名詞-普通名詞-一般": {POS: NOUN},
     "名詞-普通名詞-形状詞可能": {POS: NOUN},  # XXX: sometimes ADJ in UDv2

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"ಆಪಲ್ ಒಂದು ಯು.ಕೆ. ಸ್ಟಾರ್ಟ್ಅಪ್ ಅನ್ನು ೧ ಶತಕೋಟಿ ಡಾಲರ್ಗಳಿಗೆ ಖರೀದಿಸಲು ನೋಡುತ್ತಿದೆ.", "ಆಪಲ್ ಒಂದು ಯು.ಕೆ. ಸ್ಟಾರ್ಟ್ಅಪ್ ಅನ್ನು ೧ ಶತಕೋಟಿ ಡಾಲರ್ಗಳಿಗೆ ಖರೀದಿಸಲು ನೋಡುತ್ತಿದೆ.",
"ಸ್ವಾಯತ್ತ ಕಾರುಗಳು ವಿಮಾ ಹೊಣೆಗಾರಿಕೆಯನ್ನು ತಯಾರಕರ ಕಡೆಗೆ ಬದಲಾಯಿಸುತ್ತವೆ.", "ಸ್ವಾಯತ್ತ ಕಾರುಗಳು ವಿಮಾ ಹೊಣೆಗಾರಿಕೆಯನ್ನು ತಯಾರಕರ ಕಡೆಗೆ ಬದಲಾಯಿಸುತ್ತವೆ.",

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"Sciusciâ e sciorbî no se peu.", "Sciusciâ e sciorbî no se peu.",
"Graçie di çetroin, che me son arrivæ.", "Graçie di çetroin, che me son arrivæ.",

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"Jaunikis pirmąją vestuvinę naktį iškeitė į areštinės gultą", "Jaunikis pirmąją vestuvinę naktį iškeitė į areštinės gultą",
"Bepiločiai automobiliai išnaikins vairavimo mokyklas, autoservisus ir eismo nelaimes", "Bepiločiai automobiliai išnaikins vairavimo mokyklas, autoservisus ir eismo nelaimes",

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"അനാവശ്യമായി കണ്ണിലും മൂക്കിലും വായിലും സ്പർശിക്കാതിരിക്കുക", "അനാവശ്യമായി കണ്ണിലും മൂക്കിലും വായിലും സ്പർശിക്കാതിരിക്കുക",
"പൊതുരംഗത്ത് മലയാള ഭാഷയുടെ സമഗ്രപുരോഗതി ലക്ഷ്യമാക്കി പ്രവർത്തിക്കുന്ന സംഘടനയായ മലയാളഐക്യവേദിയുടെ വിദ്യാർത്ഥിക്കൂട്ടായ്മയാണ് വിദ്യാർത്ഥി മലയാളവേദി", "പൊതുരംഗത്ത് മലയാള ഭാഷയുടെ സമഗ്രപുരോഗതി ലക്ഷ്യമാക്കി പ്രവർത്തിക്കുന്ന സംഘടനയായ മലയാളഐക്യവേദിയുടെ വിദ്യാർത്ഥിക്കൂട്ടായ്മയാണ് വിദ്യാർത്ഥി മലയാളവേദി",

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"Malaysia ialah sebuah negara yang terletak di Asia Tenggara.", "Malaysia ialah sebuah negara yang terletak di Asia Tenggara.",
"Berapa banyak pelajar yang akan menghadiri majlis perpisahan sekolah?", "Berapa banyak pelajar yang akan menghadiri majlis perpisahan sekolah?",

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar.", "Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar.",
"Selvkjørende biler flytter forsikringsansvaret over på produsentene.", "Selvkjørende biler flytter forsikringsansvaret over på produsentene.",

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"एप्पलले अमेरिकी स्टार्टअप १ अर्ब डलरमा किन्ने सोच्दै छ", "एप्पलले अमेरिकी स्टार्टअप १ अर्ब डलरमा किन्ने सोच्दै छ",
"स्वायत्त कारहरूले बीमा दायित्व निर्माताहरु तिर बदल्छन्", "स्वायत्त कारहरूले बीमा दायित्व निर्माताहरु तिर बदल्छन्",

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"Apple overweegt om voor 1 miljard een U.K. startup te kopen", "Apple overweegt om voor 1 miljard een U.K. startup te kopen",
"Autonome auto's verschuiven de verzekeringverantwoordelijkheid naar producenten", "Autonome auto's verschuiven de verzekeringverantwoordelijkheid naar producenten",

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
# sentences taken from Omsetjingsminne frå Nynorsk pressekontor 2022 (https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-80/) # sentences taken from Omsetjingsminne frå Nynorsk pressekontor 2022 (https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-80/)
sentences = [ sentences = [
"Konseptet går ut på at alle tre omgangar tel, alle hopparar må stille i kvalifiseringa og poengsummen skal telje.", "Konseptet går ut på at alle tre omgangar tel, alle hopparar må stille i kvalifiseringa og poengsummen skal telje.",

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"Poczuł przyjemną woń mocnej kawy.", "Poczuł przyjemną woń mocnej kawy.",
"Istnieje wiele dróg oddziaływania substancji psychoaktywnej na układ nerwowy.", "Istnieje wiele dróg oddziaływania substancji psychoaktywnej na układ nerwowy.",

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares", "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares",
"Carros autônomos empurram a responsabilidade do seguro para os fabricantes." "Carros autônomos empurram a responsabilidade do seguro para os fabricantes."

View File

@ -7,7 +7,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"Apple plănuiește să cumpere o companie britanică pentru un miliard de dolari", "Apple plănuiește să cumpere o companie britanică pentru un miliard de dolari",
"Municipalitatea din San Francisco ia în calcul interzicerea roboților curieri pe trotuar", "Municipalitatea din San Francisco ia în calcul interzicerea roboților curieri pe trotuar",

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
# Translations from English: # Translations from English:
"Apple рассматривает возможность покупки стартапа из Соединённого Королевства за $1 млрд", "Apple рассматривает возможность покупки стартапа из Соединённого Королевства за $1 млрд",

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"अभ्यावहति कल्याणं विविधं वाक् सुभाषिता ।", "अभ्यावहति कल्याणं विविधं वाक् सुभाषिता ।",
"मनसि व्याकुले चक्षुः पश्यन्नपि न पश्यति ।", "मनसि व्याकुले चक्षुः पश्यन्नपि न पश्यति ।",

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"මෙය වාක්‍යයකි.", "මෙය වාක්‍යයකි.",
"ඔබ කවුද?", "ඔබ කවුද?",

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"Ardevop, s.r.o. je malá startup firma na území SR.", "Ardevop, s.r.o. je malá startup firma na území SR.",
"Samojazdiace autá presúvajú poistnú zodpovednosť na výrobcov automobilov.", "Samojazdiace autá presúvajú poistnú zodpovednosť na výrobcov automobilov.",

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"Apple načrtuje nakup britanskega startupa za 1 bilijon dolarjev", "Apple načrtuje nakup britanskega startupa za 1 bilijon dolarjev",
"France Prešeren je umrl 8. februarja 1849 v Kranju", "France Prešeren je umrl 8. februarja 1849 v Kranju",

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"Apple po shqyrton blerjen e nje shoqërie të U.K. për 1 miliard dollarë", "Apple po shqyrton blerjen e nje shoqërie të U.K. për 1 miliard dollarë",
"Makinat autonome ndryshojnë përgjegjësinë e sigurimit ndaj prodhuesve", "Makinat autonome ndryshojnë përgjegjësinë e sigurimit ndaj prodhuesve",

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
# Translations from English # Translations from English
"Apple планира куповину америчког стартапа за $1 милијарду.", "Apple планира куповину америчког стартапа за $1 милијарду.",

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"Apple överväger att köpa brittisk startup för 1 miljard dollar.", "Apple överväger att köpa brittisk startup för 1 miljard dollar.",
"Självkörande bilar förskjuter försäkringsansvar mot tillverkare.", "Självkörande bilar förskjuter försäkringsansvar mot tillverkare.",

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"கிறிஸ்துமஸ் மற்றும் இனிய புத்தாண்டு வாழ்த்துக்கள்", "கிறிஸ்துமஸ் மற்றும் இனிய புத்தாண்டு வாழ்த்துக்கள்",
"எனக்கு என் குழந்தைப் பருவம் நினைவிருக்கிறது", "எனக்கு என் குழந்தைப் பருவம் நினைவிருக்கிறது",

View File

@ -7,7 +7,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"ఆపిల్ 1 బిలియన్ డాలర్స్ కి యూ.కె. స్టార్ట్అప్ ని కొనాలని అనుకుంటుంది.", "ఆపిల్ 1 బిలియన్ డాలర్స్ కి యూ.కె. స్టార్ట్అప్ ని కొనాలని అనుకుంటుంది.",
"ఆటోనోమోస్ కార్లు భీమా బాధ్యతను తయారీదారులపైకి మళ్లిస్తాయి.", "ఆటోనోమోస్ కార్లు భీమా బాధ్యతను తయారీదారులపైకి మళ్లిస్తాయి.",

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"አፕል ብዩኬ ትርከብ ንግድ ብ1 ቢሊዮን ዶላር ንምግዛዕ ሐሲባ።", "አፕል ብዩኬ ትርከብ ንግድ ብ1 ቢሊዮን ዶላር ንምግዛዕ ሐሲባ።",
"ፈላማይ ክታበት ኮቪድ 19 ተጀሚሩ፤ሓዱሽ ተስፋ ሂቡ ኣሎ", "ፈላማይ ክታበት ኮቪድ 19 ተጀሚሩ፤ሓዱሽ ተስፋ ሂቡ ኣሎ",

View File

@ -4,7 +4,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"Apple e nyaka go reka JSE ka tlhwatlhwa ta R1 billion", "Apple e nyaka go reka JSE ka tlhwatlhwa ta R1 billion",
"Johannesburg ke toropo e kgolo mo Afrika Borwa.", "Johannesburg ke toropo e kgolo mo Afrika Borwa.",

View File

@ -4,7 +4,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"Neredesin?", "Neredesin?",
"Neredesiniz?", "Neredesiniz?",

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"Ніч на середу буде морозною.", "Ніч на середу буде морозною.",
"Чим кращі книги ти читав, тим гірше спиш.", # Serhiy Zhadan "Чим кращі книги ти читав, тим гірше спиш.", # Serhiy Zhadan

View File

@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"اردو ہے جس کا نام ہم جانتے ہیں داغ", "اردو ہے جس کا نام ہم جانتے ہیں داغ",
"سارے جہاں میں دھوم ہماری زباں کی ہے", "سارے جہاں میں دھوم ہماری زباں کی ہے",

View File

@ -4,7 +4,6 @@ Example sentences to test spaCy and its language models.
>>> docs = nlp.pipe(sentences) >>> docs = nlp.pipe(sentences)
""" """
sentences = [ sentences = [
"Đây là đâu, tôi là ai?", "Đây là đâu, tôi là ai?",
"Căn phòng có nhiều cửa sổ nên nó khá sáng", "Căn phòng có nhiều cửa sổ nên nó khá sáng",

View File

@@ -1519,8 +1519,7 @@ class Language:
         disable: Iterable[str] = ...,
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = ...,
         n_process: int = ...,
-    ) -> Iterator[Doc]:
-        ...
+    ) -> Iterator[Doc]: ...

     @overload
     def pipe(  # noqa: F811
@@ -1532,8 +1531,7 @@ class Language:
         disable: Iterable[str] = ...,
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = ...,
         n_process: int = ...,
-    ) -> Iterator[Tuple[Doc, _AnyContext]]:
-        ...
+    ) -> Iterator[Tuple[Doc, _AnyContext]]: ...

     def pipe(  # noqa: F811
         self,
@@ -1641,7 +1639,7 @@ class Language:
         batch_size: int,
     ) -> Iterator[Doc]:
         def prepare_input(
-            texts: Iterable[Union[str, Doc]]
+            texts: Iterable[Union[str, Doc]],
         ) -> Iterable[Tuple[Union[str, bytes], _AnyContext]]:
             # Serialize Doc inputs to bytes to avoid incurring pickling
             # overhead when they are passed to child processes. Also yield
@@ -1943,9 +1941,9 @@ class Language:
                 )
             if "_sourced_vectors_hashes" not in nlp.meta:
                 nlp.meta["_sourced_vectors_hashes"] = {}
-            nlp.meta["_sourced_vectors_hashes"][
-                pipe_name
-            ] = source_nlp_vectors_hashes[model]
+            nlp.meta["_sourced_vectors_hashes"][pipe_name] = (
+                source_nlp_vectors_hashes[model]
+            )
             # Delete from cache if listeners were replaced
             if listeners_replaced:
                 del source_nlps[model]

View File

@@ -51,9 +51,7 @@ class DependencyMatcher:
         ] = ...
     ) -> None: ...
     def has_key(self, key: Union[str, int]) -> bool: ...
-    def get(
-        self, key: Union[str, int], default: Optional[Any] = ...
-    ) -> Tuple[
+    def get(self, key: Union[str, int], default: Optional[Any] = ...) -> Tuple[
         Optional[
             Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any]
         ],

View File

@@ -7,7 +7,7 @@ from ..tokens import Doc

 def FeatureExtractor(
-    columns: Union[List[str], List[int], List[Union[int, str]]]
+    columns: Union[List[str], List[int], List[Union[int, str]]],
 ) -> Model[List[Doc], List[Ints2d]]:
     return Model("extract_features", forward, attrs={"columns": columns})

View File

@@ -122,7 +122,7 @@ def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
     return get_candidates


-def create_candidates_batch() -> Callable[
-    [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
-]:
+def create_candidates_batch() -> (
+    Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]
+):
     return get_candidates_batch

View File

@@ -93,7 +93,7 @@ class EditTreeLemmatizer(TrainablePipe):
         truths = []
         for eg in examples:
             eg_truths = []
-            for (predicted, gold_lemma) in zip(
+            for predicted, gold_lemma in zip(
                 eg.predicted, eg.get_aligned("LEMMA", as_string=True)
             ):
                 if gold_lemma is None or gold_lemma == "":

View File

@@ -80,8 +80,7 @@ DEFAULT_SPANCAT_SINGLELABEL_MODEL = Config().from_str(
 @runtime_checkable
 class Suggester(Protocol):
-    def __call__(self, docs: Iterable[Doc], *, ops: Optional[Ops] = None) -> Ragged:
-        ...
+    def __call__(self, docs: Iterable[Doc], *, ops: Optional[Ops] = None) -> Ragged: ...


 def ngram_suggester(

View File

@@ -6,6 +6,7 @@ remain in their original locations, but decoration is moved here.
 Component definitions and registrations are in spacy/pipeline/factories.py
 """
+
 # Global flag to track if registry has been populated
 REGISTRY_POPULATED = False

View File

@@ -141,7 +141,8 @@ def test_issue3869(sentence):
 @pytest.mark.issue(3962)
 def test_issue3962(en_vocab):
     """Ensure that as_doc does not result in out-of-bound access of tokens.
-    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
+    This is achieved by setting the head to itself if it would lie out of the span otherwise.
+    """
     # fmt: off
     words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."]
     heads = [1, 7, 1, 2, 7, 7, 7, 7, 9, 7, 7]
@@ -180,7 +181,8 @@ def test_issue3962(en_vocab):
 @pytest.mark.issue(3962)
 def test_issue3962_long(en_vocab):
     """Ensure that as_doc does not result in out-of-bound access of tokens.
-    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
+    This is achieved by setting the head to itself if it would lie out of the span otherwise.
+    """
     # fmt: off
     words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."]
     heads = [1, 1, 1, 2, 1, 7, 7, 7, 9, 7, 7]

View File

@@ -29,4 +29,16 @@ def test_ht_tokenizer_handles_basic_abbreviation(ht_tokenizer, text):
 def test_ht_tokenizer_full_sentence(ht_tokenizer):
     text = "Si'm ka vini, m'ap pale ak li."
     tokens = [t.text for t in ht_tokenizer(text)]
-    assert tokens == ["Si", "'m", "ka", "vini", ",", "m'", "ap", "pale", "ak", "li", "."]
+    assert tokens == [
+        "Si",
+        "'m",
+        "ka",
+        "vini",
+        ",",
+        "m'",
+        "ap",
+        "pale",
+        "ak",
+        "li",
+        ".",
+    ]

View File

@@ -37,7 +37,9 @@ def test_ht_tokenizer_splits_uneven_wrap(ht_tokenizer, text):
     assert len(tokens) == 5

-@pytest.mark.parametrize("text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)])
+@pytest.mark.parametrize(
+    "text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)]
+)
 def test_ht_tokenizer_splits_prefix_interact(ht_tokenizer, text, length):
     tokens = ht_tokenizer(text)
     assert len(tokens) == length

View File

@@ -16,7 +16,6 @@ Nan Washington, Depatman Deta Etazini pibliye yon deklarasyon ki eksprime "regre
     assert len(tokens) == 84

 @pytest.mark.parametrize(
     "text,length",
     [
@@ -66,14 +65,14 @@ def test_ht_lex_attrs_capitals(word):
 @pytest.mark.parametrize(
-    "word, expected", [
+    "word, expected",
+    [
         ("'m", "mwen"),
         ("'n", "nou"),
         ("'l", "li"),
         ("'y", "yo"),
         ("'w", "ou"),
-    ]
+    ],
 )
 def test_ht_lex_attrs_norm_custom(word, expected):
     assert norm_custom(word) == expected

View File

@@ -304,9 +304,11 @@ TESTS.extend([x for i, x in enumerate(EXTRA_TESTS) if i % 10 == 0])
 SLOW_TESTS = [x for i, x in enumerate(EXTRA_TESTS) if i % 10 != 0]
 TESTS.extend(
     [
-        pytest.param(x[0], x[1], marks=pytest.mark.slow())
-        if not isinstance(x[0], tuple)
-        else x
+        (
+            pytest.param(x[0], x[1], marks=pytest.mark.slow())
+            if not isinstance(x[0], tuple)
+            else x
+        )
         for x in SLOW_TESTS
     ]
 )

View File

@@ -544,7 +544,7 @@ def test_greedy_matching_longest(doc, text, pattern, longest):
     matcher = Matcher(doc.vocab)
     matcher.add("RULE", [pattern], greedy="LONGEST")
     matches = matcher(doc)
-    for (key, s, e) in matches:
+    for key, s, e in matches:
         assert doc[s:e].text == longest

View File

@@ -496,15 +496,15 @@ def test_el_pipe_configuration(nlp):
         return [get_lowercased_candidates(kb, span) for span in spans]

     @registry.misc("spacy.LowercaseCandidateGenerator.v1")
-    def create_candidates() -> Callable[
-        [InMemoryLookupKB, "Span"], Iterable[Candidate]
-    ]:
+    def create_candidates() -> (
+        Callable[[InMemoryLookupKB, "Span"], Iterable[Candidate]]
+    ):
         return get_lowercased_candidates

     @registry.misc("spacy.LowercaseCandidateBatchGenerator.v1")
-    def create_candidates_batch() -> Callable[
-        [InMemoryLookupKB, Iterable["Span"]], Iterable[Iterable[Candidate]]
-    ]:
+    def create_candidates_batch() -> (
+        Callable[[InMemoryLookupKB, Iterable["Span"]], Iterable[Iterable[Candidate]]]
+    ):
         return get_lowercased_candidates_batch

     # replace the pipe with a new one with with a different candidate generator

View File

@@ -279,20 +279,17 @@ def test_pipe_factories_wrong_formats():
     with pytest.raises(ValueError):
         # Decorator is not called
         @Language.component
-        def component(foo: int, bar: str):
-            ...
+        def component(foo: int, bar: str): ...

     with pytest.raises(ValueError):
         # Decorator is not called
         @Language.factory
-        def factory1(foo: int, bar: str):
-            ...
+        def factory1(foo: int, bar: str): ...

     with pytest.raises(ValueError):
         # Factory function is missing "nlp" and "name" arguments
         @Language.factory("test_pipe_factories_missing_args")
-        def factory2(foo: int, bar: str):
-            ...
+        def factory2(foo: int, bar: str): ...


 def test_pipe_factory_meta_config_cleanup():
@@ -329,8 +326,7 @@ def test_pipe_factories_empty_dict_default():
     name = "test_pipe_factories_empty_dict_default"

     @Language.factory(name, default_config={"foo": {}})
-    def factory(nlp: Language, name: str, foo: dict):
-        ...
+    def factory(nlp: Language, name: str, foo: dict): ...

     nlp = Language()
     nlp.create_pipe(name)
@@ -549,11 +545,9 @@ def test_pipe_factories_from_source_config():
 class PipeFactoriesIdempotent:
-    def __init__(self, nlp, name):
-        ...
+    def __init__(self, nlp, name): ...

-    def __call__(self, doc):
-        ...
+    def __call__(self, doc): ...


 @pytest.mark.parametrize(

View File

@@ -874,7 +874,8 @@ def test_textcat_eval_missing(multi_label: bool, spring_p: float):
 def test_textcat_loss(multi_label: bool, expected_loss: float):
     """
     multi-label: the missing 'spring' in gold_doc_2 doesn't incur an increase in loss
-    exclusive labels: the missing 'spring' in gold_doc_2 is interpreted as 0.0 and adds to the loss"""
+    exclusive labels: the missing 'spring' in gold_doc_2 is interpreted as 0.0 and adds to the loss
+    """
     train_examples = []
     nlp = English()

View File

@@ -890,7 +890,7 @@ def test_cli_find_threshold(capsys):
         return docs

     def init_nlp(
-        components: Tuple[Tuple[str, Dict[str, Any]], ...] = ()
+        components: Tuple[Tuple[str, Dict[str, Any]], ...] = (),
     ) -> Tuple[Language, List[Example]]:
         new_nlp = English()
         new_nlp.add_pipe(  # type: ignore

View File

@@ -57,9 +57,7 @@ class Doc:
         force: bool = ...,
     ) -> None: ...
     @classmethod
-    def get_extension(
-        cls, name: str
-    ) -> Tuple[
+    def get_extension(cls, name: str) -> Tuple[
         Optional[Any],
         Optional[DocMethod],
         Optional[Callable[[Doc], Any]],
@@ -68,9 +66,7 @@ class Doc:
     @classmethod
     def has_extension(cls, name: str) -> bool: ...
     @classmethod
-    def remove_extension(
-        cls, name: str
-    ) -> Tuple[
+    def remove_extension(cls, name: str) -> Tuple[
         Optional[Any],
         Optional[DocMethod],
         Optional[Callable[[Doc], Any]],

View File

@@ -23,9 +23,7 @@ class Span:
         force: bool = ...,
     ) -> None: ...
     @classmethod
-    def get_extension(
-        cls, name: str
-    ) -> Tuple[
+    def get_extension(cls, name: str) -> Tuple[
         Optional[Any],
         Optional[SpanMethod],
         Optional[Callable[[Span], Any]],
@@ -34,9 +32,7 @@ class Span:
     @classmethod
     def has_extension(cls, name: str) -> bool: ...
     @classmethod
-    def remove_extension(
-        cls, name: str
-    ) -> Tuple[
+    def remove_extension(cls, name: str) -> Tuple[
         Optional[Any],
         Optional[SpanMethod],
         Optional[Callable[[Span], Any]],

View File

@@ -27,9 +27,7 @@ class Token:
         force: bool = ...,
     ) -> None: ...
     @classmethod
-    def get_extension(
-        cls, name: str
-    ) -> Tuple[
+    def get_extension(cls, name: str) -> Tuple[
         Optional[Any],
         Optional[TokenMethod],
         Optional[Callable[[Token], Any]],
@@ -38,9 +36,7 @@ class Token:
     @classmethod
     def has_extension(cls, name: str) -> bool: ...
     @classmethod
-    def remove_extension(
-        cls, name: str
-    ) -> Tuple[
+    def remove_extension(cls, name: str) -> Tuple[
         Optional[Any],
         Optional[TokenMethod],
         Optional[Callable[[Token], Any]],

View File

@@ -354,7 +354,7 @@ def update_meta(

 def create_before_to_disk_callback(
-    callback: Optional[Callable[["Language"], "Language"]]
+    callback: Optional[Callable[["Language"], "Language"]],
 ) -> Callable[["Language"], "Language"]:
     from ..language import Language  # noqa: F811

View File

@@ -30,11 +30,9 @@ class TrainableComponent(Protocol):
         drop: float = 0.0,
         sgd: Optional[Optimizer] = None,
         losses: Optional[Dict[str, float]] = None
-    ) -> Dict[str, float]:
-        ...
+    ) -> Dict[str, float]: ...

-    def finish_update(self, sgd: Optimizer) -> None:
-        ...
+    def finish_update(self, sgd: Optimizer) -> None: ...


 @runtime_checkable
@@ -44,8 +42,7 @@ class InitializableComponent(Protocol):
         get_examples: Callable[[], Iterable["Example"]],
         nlp: "Language",
         **kwargs: Any
-    ):
-        ...
+    ): ...


 @runtime_checkable
@@ -55,11 +52,8 @@ class ListenedToComponent(Protocol):
     listener_map: Dict[str, Sequence[Model]]
     listening_components: List[str]

-    def add_listener(self, listener: Model, component_name: str) -> None:
-        ...
+    def add_listener(self, listener: Model, component_name: str) -> None: ...

-    def remove_listener(self, listener: Model, component_name: str) -> bool:
-        ...
+    def remove_listener(self, listener: Model, component_name: str) -> bool: ...

-    def find_listeners(self, component) -> None:
-        ...
+    def find_listeners(self, component) -> None: ...

View File

@@ -657,7 +657,7 @@ def load_model_from_config(

 def get_sourced_components(
-    config: Union[Dict[str, Any], Config]
+    config: Union[Dict[str, Any], Config],
 ) -> Dict[str, Dict[str, Any]]:
     """RETURNS (List[str]): All sourced components in the original config,
     e.g. {"source": "en_core_web_sm"}. If the config contains a key