mirror of https://github.com/explosion/spaCy.git (synced 2025-11-17 16:26:09 +03:00)

Reformat with black 25

parent 68679d6f85
commit 54f54fc4cc
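Note for readers skimming the diff: every hunk below is mechanical, applying the stable-style changes that landed in Black's 24.x and 25.x releases. `...` stub bodies move onto the signature line, conditional expressions and long return-type hints that must wrap get their own parentheses, a lone parameter split onto its own line gains a trailing comma, module docstrings are followed by exactly one blank line, redundant parentheses around `for`-loop tuple targets are dropped, and hex digits in unicode escapes are lowercased. A minimal sketch of the main patterns, with illustrative names only (none of these are spaCy APIs):

    from typing import Iterable, List

    # 1. Stub bodies: the ellipsis now sits on the signature line.
    def to_int(value: str) -> int: ...

    # 2. Conditional expressions that have to wrap are parenthesized
    #    instead of hanging off the assignment or keyword argument.
    def describe(n: int) -> str:
        return (
            "a rather small number"
            if n < 10
            else "a rather large number"
        )

    # 3. A single parameter that is split onto its own line now ends
    #    with a trailing comma.
    def prepare(
        texts: Iterable[str],
    ) -> List[str]:
        return [t.strip() for t in texts]

    # 4. Hex digits in unicode escapes are lowercased: "\u00B0" becomes "\u00b0".
    DEGREE_SIGN = "\u00b0"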
@@ -225,13 +225,11 @@ def get_git_version(


 @overload
-def string_to_list(value: str, intify: Literal[False] = ...) -> List[str]:
-    ...
+def string_to_list(value: str, intify: Literal[False] = ...) -> List[str]: ...


 @overload
-def string_to_list(value: str, intify: Literal[True]) -> List[int]:
-    ...
+def string_to_list(value: str, intify: Literal[True]) -> List[int]: ...


 def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]:
@@ -968,16 +968,14 @@ def _compile_gold(


 @overload
-def _format_labels(labels: Iterable[str], counts: Literal[False] = False) -> str:
-    ...
+def _format_labels(labels: Iterable[str], counts: Literal[False] = False) -> str: ...


 @overload
 def _format_labels(
     labels: Iterable[Tuple[str, int]],
     counts: Literal[True],
-) -> str:
-    ...
+) -> str: ...


 def _format_labels(
@@ -157,9 +157,11 @@ def find_threshold(
             exits=1,
         )
     return {
-        keys[0]: filter_config(config[keys[0]], keys[1:], full_key)
-        if len(keys) > 1
-        else config[keys[0]]
+        keys[0]: (
+            filter_config(config[keys[0]], keys[1:], full_key)
+            if len(keys) > 1
+            else config[keys[0]]
+        )
     }

     # Evaluate with varying threshold values.
@@ -216,12 +218,14 @@ def find_threshold(
         if len(set(scores.values())) == 1:
             wasabi.msg.warn(
                 title="All scores are identical. Verify that all settings are correct.",
-                text=""
-                if (
-                    not isinstance(pipe, MultiLabel_TextCategorizer)
-                    or scores_key in ("cats_macro_f", "cats_micro_f")
-                )
-                else "Use `cats_macro_f` or `cats_micro_f` when optimizing the threshold for `textcat_multilabel`.",
+                text=(
+                    ""
+                    if (
+                        not isinstance(pipe, MultiLabel_TextCategorizer)
+                        or scores_key in ("cats_macro_f", "cats_micro_f")
+                    )
+                    else "Use `cats_macro_f` or `cats_micro_f` when optimizing the threshold for `textcat_multilabel`."
+                ),
             )

         else:
@@ -195,9 +195,11 @@ def init_config(
         "Pipeline": ", ".join(pipeline),
         "Optimize for": optimize,
         "Hardware": variables["hardware"].upper(),
-        "Transformer": template_vars.transformer.get("name")  # type: ignore[attr-defined]
-        if template_vars.use_transformer  # type: ignore[attr-defined]
-        else None,
+        "Transformer": (
+            template_vars.transformer.get("name")  # type: ignore[attr-defined]
+            if template_vars.use_transformer  # type: ignore[attr-defined]
+            else None
+        ),
     }
     msg.info("Generated config template specific for your use case")
     for label, value in use_case.items():
@@ -1,4 +1,5 @@
 """Helpers for Python and platform compatibility."""
+
 import sys

 from thinc.util import copy_array
@@ -4,6 +4,7 @@ spaCy's built in visualization suite for dependencies and named entities.
 DOCS: https://spacy.io/api/top-level#displacy
 USAGE: https://spacy.io/usage/visualizers
 """
+
 import warnings
 from typing import Any, Callable, Dict, Iterable, Optional, Union
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "አፕል የዩኬን ጅምር ድርጅት በ 1 ቢሊዮን ዶላር ለመግዛት አስቧል።",
     "የራስ ገዝ መኪኖች የኢንሹራንስ ኃላፊነትን ወደ አምራቾች ያዛውራሉ",
@@ -4,7 +4,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "Bu bir cümlədir.",
     "Necəsən?",
@@ -3,6 +3,7 @@ References:
     https://github.com/Alir3z4/stop-words - Original list, serves as a base.
     https://postvai.com/books/stop-dumi.pdf - Additions to the original list in order to improve it.
 """
+
 STOP_WORDS = set(
     """
 а автентичен аз ако ала
@@ -5,5 +5,4 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = ["তুই খুব ভালো", "আজ আমরা ডাক্তার দেখতে যাবো", "আমি জানি না "]
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "དོན་དུ་རྒྱ་མཚོ་བླ་མ་ཞེས་བྱ་ཞིང༌།",
     "ཏཱ་ལའི་ཞེས་པ་ནི་སོག་སྐད་ཡིན་པ་དེ་བོད་སྐད་དུ་རྒྱ་མཚོའི་དོན་དུ་འཇུག",
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "Apple està buscant comprar una startup del Regne Unit per mil milions de dòlars",
     "Els cotxes autònoms deleguen la responsabilitat de l'assegurança als seus fabricants",
@@ -277,10 +277,10 @@ _currency = (
 # These expressions contain various unicode variations, including characters
 # used in Chinese (see #1333, #1340, #1351) – unless there are cross-language
 # conflicts, spaCy's base tokenizer should handle all of those by default
-_punct = (
-    r"… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · । ، ۔ ؛ ٪"
-)
+_punct = r"… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · । ، ۔ ؛ ٪"
-_quotes = (
-    r'\' " ” “ ` ‘ ´ ’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉 〈 〉 ⟦ ⟧'
-)
+_quotes = r'\' " ” “ ` ‘ ´ ’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉 〈 〉 ⟦ ⟧'
 _hyphens = "- – — -- --- —— ~"

 # Various symbols like dingbats, but also emoji
@@ -4,7 +4,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "Máma mele maso.",
     "Příliš žluťoučký kůň úpěl ďábelské ódy.",
@@ -2,6 +2,7 @@
 Tokenizer Exceptions.
 Source: https://forkortelse.dk/ and various others.
 """
+
 from ...symbols import NORM, ORTH
 from ...util import update_exc
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen",
     "Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz",
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "Z tym stwori so wuměnjenje a zakład za dalše wobdźěłanje přez analyzu tekstoweje struktury a semantisku anotaciju a z tym tež za tu předstajenu digitalnu online-wersiju.",
     "Mi so tu jara derje spodoba.",
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "Apple is looking at buying U.K. startup for $1 billion",
     "Autonomous cars shift insurance liability toward manufacturers",
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "Apple está buscando comprar una startup del Reino Unido por mil millones de dólares.",
     "Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes.",
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "این یک جمله نمونه می باشد.",
     "قرار ما، امروز ساعت ۲:۳۰ بعدازظهر هست!",
@@ -100,9 +100,9 @@ conj_contraction_negations = [
     ("eivat", "eivät"),
     ("eivät", "eivät"),
 ]
-for (base_lower, base_norm) in conj_contraction_bases:
+for base_lower, base_norm in conj_contraction_bases:
     for base in [base_lower, base_lower.title()]:
-        for (suffix, suffix_norm) in conj_contraction_negations:
+        for suffix, suffix_norm in conj_contraction_negations:
             _exc[base + suffix] = [
                 {ORTH: base, NORM: base_norm},
                 {ORTH: suffix, NORM: suffix_norm},
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "Apple cherche à acheter une start-up anglaise pour 1 milliard de dollars",
     "Les voitures autonomes déplacent la responsabilité de l'assurance vers les constructeurs",
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "ἐρᾷ μὲν ἁγνὸς οὐρανὸς τρῶσαι χθόνα, ἔρως δὲ γαῖαν λαμβάνει γάμου τυχεῖν·",
     "εὐδαίμων Χαρίτων καὶ Μελάνιππος ἔφυ, θείας ἁγητῆρες ἐφαμερίοις φιλότατος.",
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "લોકશાહી એ સરકારનું એક એવું તંત્ર છે જ્યાં નાગરિકો મત દ્વારા સત્તાનો ઉપયોગ કરે છે.",
     "તે ગુજરાત રાજ્યના ધરમપુર શહેરમાં આવેલું હતું",
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "סין מקימה קרן של 440 מיליון דולר להשקעה בהייטק בישראל",
     'רה"מ הודיע כי יחרים טקס בחסותו',
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "एप्पल 1 अरब डॉलर के लिए यू.के. स्टार्टअप खरीदने पर विचार कर रहा है।",
     "स्वायत्त कारें निर्माताओं की ओर बीमा दायित्व रखतीं हैं।",
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "To běšo wjelgin raźone a jo se wót luźi derje pśiwzeło. Tak som dožywiła wjelgin",
     "Jogo pśewóźowarce stej groniłej, až how w serbskich stronach njama Santa Claus nic pytaś.",
@@ -22,10 +22,12 @@ class HaitianCreoleDefaults(BaseDefaults):
     stop_words = STOP_WORDS
     tag_map = TAG_MAP

+
 class HaitianCreole(Language):
     lang = "ht"
     Defaults = HaitianCreoleDefaults

+
 @HaitianCreole.factory(
     "lemmatizer",
     assigns=["token.lemma"],
@@ -49,4 +51,5 @@ def make_lemmatizer(
         nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
     )

+
 __all__ = ["HaitianCreole"]
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "Apple ap panse achte yon demaraj nan Wayòm Ini pou $1 milya dola",
     "Machin otonòm fè responsablite asirans lan ale sou men fabrikan yo",
@@ -49,6 +49,7 @@ NORM_MAP = {
     "P": "Pa",
 }

+
 def like_num(text):
     text = text.strip().lower()
     if text.startswith(("+", "-", "±", "~")):
@@ -69,9 +70,11 @@ def like_num(text):
             return True
     return False

+
 def norm_custom(text):
     return NORM_MAP.get(text, text.lower())

+
 LEX_ATTRS = {
     LIKE_NUM: like_num,
     NORM: norm_custom,
@@ -16,28 +16,43 @@ ELISION = "'’".replace(" ", "")
 _prefixes_elision = "m n l y t k w"
 _prefixes_elision += " " + _prefixes_elision.upper()

-TOKENIZER_PREFIXES = LIST_PUNCT + LIST_QUOTES + [
-    r"(?:({pe})[{el}])(?=[{a}])".format(
-        a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
-    )
-]
+TOKENIZER_PREFIXES = (
+    LIST_PUNCT
+    + LIST_QUOTES
+    + [
+        r"(?:({pe})[{el}])(?=[{a}])".format(
+            a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
+        )
+    ]
+)

-TOKENIZER_SUFFIXES = LIST_PUNCT + LIST_QUOTES + LIST_ELLIPSES + [
-    r"(?<=[0-9])%",  # numbers like 10%
-    r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers
-    r"(?<=[{a}])['’]".format(a=ALPHA),  # apostrophes after letters
-    r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions
-    r"(?<=[{a}0-9])\)",  # right parenthesis after letter/number
-    r"(?<=[{a}])\.(?=\s|$)".format(a=ALPHA),  # period after letter if space or end of string
-    r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis
-]
+TOKENIZER_SUFFIXES = (
+    LIST_PUNCT
+    + LIST_QUOTES
+    + LIST_ELLIPSES
+    + [
+        r"(?<=[0-9])%",  # numbers like 10%
+        r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers
+        r"(?<=[{a}])['’]".format(a=ALPHA),  # apostrophes after letters
+        r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions
+        r"(?<=[{a}0-9])\)",  # right parenthesis after letter/number
+        r"(?<=[{a}])\.(?=\s|$)".format(
+            a=ALPHA
+        ),  # period after letter if space or end of string
+        r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis
+    ]
+)

-TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [
-    r"(?<=[0-9])[+\-\*^](?=[0-9-])",
-    r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
-        al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
-    ),
-    r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
-    r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
-    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
-]
+TOKENIZER_INFIXES = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+        ),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
+        r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
+    ]
+)
@@ -39,8 +39,7 @@ sa san si swa si

 men mèsi oswa osinon

-"""
-.split()
+""".split()
 )

 # Add common contractions, with and without apostrophe variants
@@ -1,4 +1,22 @@
-from spacy.symbols import NOUN, VERB, AUX, ADJ, ADV, PRON, DET, ADP, SCONJ, CCONJ, PART, INTJ, NUM, PROPN, PUNCT, SYM, X
+from spacy.symbols import (
+    NOUN,
+    VERB,
+    AUX,
+    ADJ,
+    ADV,
+    PRON,
+    DET,
+    ADP,
+    SCONJ,
+    CCONJ,
+    PART,
+    INTJ,
+    NUM,
+    PROPN,
+    PUNCT,
+    SYM,
+    X,
+)

 TAG_MAP = {
     "NOUN": {"pos": NOUN},
@@ -1,5 +1,6 @@
 from spacy.symbols import ORTH, NORM

+
 def make_variants(base, first_norm, second_orth, second_norm):
     return {
         base: [
@@ -7,14 +8,16 @@ def make_variants(base, first_norm, second_orth, second_norm):
             {ORTH: second_orth, NORM: second_norm},
         ],
         base.capitalize(): [
-            {ORTH: base.split("'")[0].capitalize() + "'", NORM: first_norm.capitalize()},
+            {
+                ORTH: base.split("'")[0].capitalize() + "'",
+                NORM: first_norm.capitalize(),
+            },
             {ORTH: second_orth, NORM: second_norm},
-        ]
+        ],
     }

-TOKENIZER_EXCEPTIONS = {
-    "Dr.": [{ORTH: "Dr."}]
-}
+
+TOKENIZER_EXCEPTIONS = {"Dr.": [{ORTH: "Dr."}]}

 # Apostrophe forms
 TOKENIZER_EXCEPTIONS.update(make_variants("m'ap", "mwen", "ap", "ap"))
@@ -29,93 +32,95 @@ TOKENIZER_EXCEPTIONS.update(make_variants("p'ap", "pa", "ap", "ap"))
 TOKENIZER_EXCEPTIONS.update(make_variants("t'ap", "te", "ap", "ap"))

 # Non-apostrophe contractions (with capitalized variants)
-TOKENIZER_EXCEPTIONS.update({
-    "map": [
-        {ORTH: "m", NORM: "mwen"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Map": [
-        {ORTH: "M", NORM: "Mwen"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "lem": [
-        {ORTH: "le", NORM: "le"},
-        {ORTH: "m", NORM: "mwen"},
-    ],
-    "Lem": [
-        {ORTH: "Le", NORM: "Le"},
-        {ORTH: "m", NORM: "mwen"},
-    ],
-    "lew": [
-        {ORTH: "le", NORM: "le"},
-        {ORTH: "w", NORM: "ou"},
-    ],
-    "Lew": [
-        {ORTH: "Le", NORM: "Le"},
-        {ORTH: "w", NORM: "ou"},
-    ],
-    "nap": [
-        {ORTH: "n", NORM: "nou"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Nap": [
-        {ORTH: "N", NORM: "Nou"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "lap": [
-        {ORTH: "l", NORM: "li"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Lap": [
-        {ORTH: "L", NORM: "Li"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "yap": [
-        {ORTH: "y", NORM: "yo"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Yap": [
-        {ORTH: "Y", NORM: "Yo"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "mte": [
-        {ORTH: "m", NORM: "mwen"},
-        {ORTH: "te", NORM: "te"},
-    ],
-    "Mte": [
-        {ORTH: "M", NORM: "Mwen"},
-        {ORTH: "te", NORM: "te"},
-    ],
-    "mpral": [
-        {ORTH: "m", NORM: "mwen"},
-        {ORTH: "pral", NORM: "pral"},
-    ],
-    "Mpral": [
-        {ORTH: "M", NORM: "Mwen"},
-        {ORTH: "pral", NORM: "pral"},
-    ],
-    "wap": [
-        {ORTH: "w", NORM: "ou"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Wap": [
-        {ORTH: "W", NORM: "Ou"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "kap": [
-        {ORTH: "k", NORM: "ki"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Kap": [
-        {ORTH: "K", NORM: "Ki"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "tap": [
-        {ORTH: "t", NORM: "te"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Tap": [
-        {ORTH: "T", NORM: "Te"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-})
+TOKENIZER_EXCEPTIONS.update(
+    {
+        "map": [
+            {ORTH: "m", NORM: "mwen"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "Map": [
+            {ORTH: "M", NORM: "Mwen"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "lem": [
+            {ORTH: "le", NORM: "le"},
+            {ORTH: "m", NORM: "mwen"},
+        ],
+        "Lem": [
+            {ORTH: "Le", NORM: "Le"},
+            {ORTH: "m", NORM: "mwen"},
+        ],
+        "lew": [
+            {ORTH: "le", NORM: "le"},
+            {ORTH: "w", NORM: "ou"},
+        ],
+        "Lew": [
+            {ORTH: "Le", NORM: "Le"},
+            {ORTH: "w", NORM: "ou"},
+        ],
+        "nap": [
+            {ORTH: "n", NORM: "nou"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "Nap": [
+            {ORTH: "N", NORM: "Nou"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "lap": [
+            {ORTH: "l", NORM: "li"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "Lap": [
+            {ORTH: "L", NORM: "Li"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "yap": [
+            {ORTH: "y", NORM: "yo"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "Yap": [
+            {ORTH: "Y", NORM: "Yo"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "mte": [
+            {ORTH: "m", NORM: "mwen"},
+            {ORTH: "te", NORM: "te"},
+        ],
+        "Mte": [
+            {ORTH: "M", NORM: "Mwen"},
+            {ORTH: "te", NORM: "te"},
+        ],
+        "mpral": [
+            {ORTH: "m", NORM: "mwen"},
+            {ORTH: "pral", NORM: "pral"},
+        ],
+        "Mpral": [
+            {ORTH: "M", NORM: "Mwen"},
+            {ORTH: "pral", NORM: "pral"},
+        ],
+        "wap": [
+            {ORTH: "w", NORM: "ou"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "Wap": [
+            {ORTH: "W", NORM: "Ou"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "kap": [
+            {ORTH: "k", NORM: "ki"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "Kap": [
+            {ORTH: "K", NORM: "Ki"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "tap": [
+            {ORTH: "t", NORM: "te"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "Tap": [
+            {ORTH: "T", NORM: "Te"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+    }
+)
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "Az Apple egy brit startup vásárlását tervezi 1 milliárd dollár értékben.",
     "San Francisco vezetése mérlegeli a járdát használó szállító robotok betiltását.",
@@ -11,7 +11,7 @@ from ..char_classes import (
 )

 # removing ° from the special icons to keep e.g. 99° as one token
-_concat_icons = CONCAT_ICONS.replace("\u00B0", "")
+_concat_icons = CONCAT_ICONS.replace("\u00b0", "")

 _currency = r"\$¢£€¥฿"
 _quotes = CONCAT_QUOTES.replace("'", "")
@@ -4,7 +4,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "Լոնդոնը Միացյալ Թագավորության մեծ քաղաք է։",
     "Ո՞վ է Ֆրանսիայի նախագահը։",
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "Indonesia merupakan negara kepulauan yang kaya akan budaya.",
     "Berapa banyak warga yang dibutuhkan saat kerja bakti?",
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "Apple vuole comprare una startup del Regno Unito per un miliardo di dollari",
     "Le automobili a guida autonoma spostano la responsabilità assicurativa verso i produttori",
@@ -102,9 +102,9 @@ class JapaneseTokenizer(DummyTokenizer):
                     token.dictionary_form(),  # lemma
                     token.normalized_form(),
                     token.reading_form(),
-                    sub_tokens_list[idx]
-                    if sub_tokens_list
-                    else None,  # user_data['sub_tokens']
+                    (
+                        sub_tokens_list[idx] if sub_tokens_list else None
+                    ),  # user_data['sub_tokens']
                 )
                 for idx, token in enumerate(sudachipy_tokens)
                 if len(token.surface()) > 0
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "アップルがイギリスの新興企業を10億ドルで購入を検討",
     "自動運転車の損害賠償責任、自動車メーカーに一定の負担を求める",
@@ -25,7 +25,9 @@ TAG_MAP = {
     # Universal Dependencies Mapping: (Some of the entries in this mapping are updated to v2.6 in the list below)
    # http://universaldependencies.org/ja/overview/morphology.html
    # http://universaldependencies.org/ja/pos/all.html
-    "記号-一般": {POS: NOUN},  # this includes characters used to represent sounds like ドレミ
+    "記号-一般": {
+        POS: NOUN
+    },  # this includes characters used to represent sounds like ドレミ
     "記号-文字": {
         POS: NOUN
     },  # this is for Greek and Latin characters having some meanings, or used as symbols, as in math
@@ -72,7 +74,9 @@ TAG_MAP = {
     "名詞-固有名詞-地名-国": {POS: PROPN},  # country name
     "名詞-助動詞語幹": {POS: AUX},
     "名詞-数詞": {POS: NUM},  # includes Chinese numerals
-    "名詞-普通名詞-サ変可能": {POS: NOUN},  # XXX: sometimes VERB in UDv2; suru-verb noun
+    "名詞-普通名詞-サ変可能": {
+        POS: NOUN
+    },  # XXX: sometimes VERB in UDv2; suru-verb noun
     "名詞-普通名詞-サ変形状詞可能": {POS: NOUN},
     "名詞-普通名詞-一般": {POS: NOUN},
     "名詞-普通名詞-形状詞可能": {POS: NOUN},  # XXX: sometimes ADJ in UDv2
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "ಆಪಲ್ ಒಂದು ಯು.ಕೆ. ಸ್ಟಾರ್ಟ್ಅಪ್ ಅನ್ನು ೧ ಶತಕೋಟಿ ಡಾಲರ್ಗಳಿಗೆ ಖರೀದಿಸಲು ನೋಡುತ್ತಿದೆ.",
     "ಸ್ವಾಯತ್ತ ಕಾರುಗಳು ವಿಮಾ ಹೊಣೆಗಾರಿಕೆಯನ್ನು ತಯಾರಕರ ಕಡೆಗೆ ಬದಲಾಯಿಸುತ್ತವೆ.",
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "Sciusciâ e sciorbî no se peu.",
     "Graçie di çetroin, che me son arrivæ.",
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "Jaunikis pirmąją vestuvinę naktį iškeitė į areštinės gultą",
     "Bepiločiai automobiliai išnaikins vairavimo mokyklas, autoservisus ir eismo nelaimes",
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "അനാവശ്യമായി കണ്ണിലും മൂക്കിലും വായിലും സ്പർശിക്കാതിരിക്കുക",
     "പൊതുരംഗത്ത് മലയാള ഭാഷയുടെ സമഗ്രപുരോഗതി ലക്ഷ്യമാക്കി പ്രവർത്തിക്കുന്ന സംഘടനയായ മലയാളഐക്യവേദിയുടെ വിദ്യാർത്ഥിക്കൂട്ടായ്മയാണ് വിദ്യാർത്ഥി മലയാളവേദി",
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "Malaysia ialah sebuah negara yang terletak di Asia Tenggara.",
     "Berapa banyak pelajar yang akan menghadiri majlis perpisahan sekolah?",
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar.",
     "Selvkjørende biler flytter forsikringsansvaret over på produsentene.",
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "एप्पलले अमेरिकी स्टार्टअप १ अर्ब डलरमा किन्ने सोच्दै छ",
     "स्वायत्त कारहरूले बीमा दायित्व निर्माताहरु तिर बदल्छन्",
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "Apple overweegt om voor 1 miljard een U.K. startup te kopen",
     "Autonome auto's verschuiven de verzekeringverantwoordelijkheid naar producenten",
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 # sentences taken from Omsetjingsminne frå Nynorsk pressekontor 2022 (https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-80/)
 sentences = [
     "Konseptet går ut på at alle tre omgangar tel, alle hopparar må stille i kvalifiseringa og poengsummen skal telje.",
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "Poczuł przyjemną woń mocnej kawy.",
     "Istnieje wiele dróg oddziaływania substancji psychoaktywnej na układ nerwowy.",
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares",
     "Carros autônomos empurram a responsabilidade do seguro para os fabricantes."
@@ -7,7 +7,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "Apple plănuiește să cumpere o companie britanică pentru un miliard de dolari",
     "Municipalitatea din San Francisco ia în calcul interzicerea roboților curieri pe trotuar",
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     # Translations from English:
     "Apple рассматривает возможность покупки стартапа из Соединённого Королевства за $1 млрд",
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "अभ्यावहति कल्याणं विविधं वाक् सुभाषिता ।",
     "मनसि व्याकुले चक्षुः पश्यन्नपि न पश्यति ।",
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "මෙය වාක්යයකි.",
     "ඔබ කවුද?",
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "Ardevop, s.r.o. je malá startup firma na území SR.",
     "Samojazdiace autá presúvajú poistnú zodpovednosť na výrobcov automobilov.",
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "Apple načrtuje nakup britanskega startupa za 1 bilijon dolarjev",
     "France Prešeren je umrl 8. februarja 1849 v Kranju",
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "Apple po shqyrton blerjen e nje shoqërie të U.K. për 1 miliard dollarë",
     "Makinat autonome ndryshojnë përgjegjësinë e sigurimit ndaj prodhuesve",
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     # Translations from English
     "Apple планира куповину америчког стартапа за $1 милијарду.",
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "Apple överväger att köpa brittisk startup för 1 miljard dollar.",
     "Självkörande bilar förskjuter försäkringsansvar mot tillverkare.",
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "கிறிஸ்துமஸ் மற்றும் இனிய புத்தாண்டு வாழ்த்துக்கள்",
     "எனக்கு என் குழந்தைப் பருவம் நினைவிருக்கிறது",
@@ -7,7 +7,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "ఆపిల్ 1 బిలియన్ డాలర్స్ కి యూ.కె. స్టార్ట్అప్ ని కొనాలని అనుకుంటుంది.",
     "ఆటోనోమోస్ కార్లు భీమా బాధ్యతను తయారీదారులపైకి మళ్లిస్తాయి.",
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "አፕል ብዩኬ ትርከብ ንግድ ብ1 ቢሊዮን ዶላር ንምግዛዕ ሐሲባ።",
     "ፈላማይ ክታበት ኮቪድ 19 ተጀሚሩ፤ሓዱሽ ተስፋ ሂቡ ኣሎ",
@@ -4,7 +4,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "Apple e nyaka go reka JSE ka tlhwatlhwa ta R1 billion",
     "Johannesburg ke toropo e kgolo mo Afrika Borwa.",
@@ -4,7 +4,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "Neredesin?",
     "Neredesiniz?",
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "Ніч на середу буде морозною.",
     "Чим кращі книги ти читав, тим гірше спиш.",  # Serhiy Zhadan
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "اردو ہے جس کا نام ہم جانتے ہیں داغ",
     "سارے جہاں میں دھوم ہماری زباں کی ہے",
@@ -4,7 +4,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "Đây là đâu, tôi là ai?",
     "Căn phòng có nhiều cửa sổ nên nó khá sáng",
@@ -1519,8 +1519,7 @@ class Language:
         disable: Iterable[str] = ...,
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = ...,
         n_process: int = ...,
-    ) -> Iterator[Doc]:
-        ...
+    ) -> Iterator[Doc]: ...

     @overload
     def pipe(  # noqa: F811
@@ -1532,8 +1531,7 @@ class Language:
         disable: Iterable[str] = ...,
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = ...,
         n_process: int = ...,
-    ) -> Iterator[Tuple[Doc, _AnyContext]]:
-        ...
+    ) -> Iterator[Tuple[Doc, _AnyContext]]: ...

     def pipe(  # noqa: F811
         self,
@@ -1641,7 +1639,7 @@ class Language:
         batch_size: int,
     ) -> Iterator[Doc]:
         def prepare_input(
-            texts: Iterable[Union[str, Doc]]
+            texts: Iterable[Union[str, Doc]],
         ) -> Iterable[Tuple[Union[str, bytes], _AnyContext]]:
             # Serialize Doc inputs to bytes to avoid incurring pickling
             # overhead when they are passed to child processes. Also yield
@@ -1943,9 +1941,9 @@ class Language:
                 )
             if "_sourced_vectors_hashes" not in nlp.meta:
                 nlp.meta["_sourced_vectors_hashes"] = {}
-            nlp.meta["_sourced_vectors_hashes"][
-                pipe_name
-            ] = source_nlp_vectors_hashes[model]
+            nlp.meta["_sourced_vectors_hashes"][pipe_name] = (
+                source_nlp_vectors_hashes[model]
+            )
             # Delete from cache if listeners were replaced
             if listeners_replaced:
                 del source_nlps[model]
@@ -51,9 +51,7 @@ class DependencyMatcher:
         ] = ...
     ) -> None: ...
     def has_key(self, key: Union[str, int]) -> bool: ...
-    def get(
-        self, key: Union[str, int], default: Optional[Any] = ...
-    ) -> Tuple[
+    def get(self, key: Union[str, int], default: Optional[Any] = ...) -> Tuple[
         Optional[
             Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any]
         ],
@@ -7,7 +7,7 @@ from ..tokens import Doc


 def FeatureExtractor(
-    columns: Union[List[str], List[int], List[Union[int, str]]]
+    columns: Union[List[str], List[int], List[Union[int, str]]],
 ) -> Model[List[Doc], List[Ints2d]]:
     return Model("extract_features", forward, attrs={"columns": columns})
@@ -122,7 +122,7 @@ def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
     return get_candidates


-def create_candidates_batch() -> Callable[
-    [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
-]:
+def create_candidates_batch() -> (
+    Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]
+):
     return get_candidates_batch
@@ -93,7 +93,7 @@ class EditTreeLemmatizer(TrainablePipe):
         truths = []
         for eg in examples:
             eg_truths = []
-            for (predicted, gold_lemma) in zip(
+            for predicted, gold_lemma in zip(
                 eg.predicted, eg.get_aligned("LEMMA", as_string=True)
             ):
                 if gold_lemma is None or gold_lemma == "":
@@ -80,8 +80,7 @@ DEFAULT_SPANCAT_SINGLELABEL_MODEL = Config().from_str(

 @runtime_checkable
 class Suggester(Protocol):
-    def __call__(self, docs: Iterable[Doc], *, ops: Optional[Ops] = None) -> Ragged:
-        ...
+    def __call__(self, docs: Iterable[Doc], *, ops: Optional[Ops] = None) -> Ragged: ...


 def ngram_suggester(
@@ -6,6 +6,7 @@ remain in their original locations, but decoration is moved here.

 Component definitions and registrations are in spacy/pipeline/factories.py
 """
+
 # Global flag to track if registry has been populated
 REGISTRY_POPULATED = False
@@ -141,7 +141,8 @@ def test_issue3869(sentence):
 @pytest.mark.issue(3962)
 def test_issue3962(en_vocab):
     """Ensure that as_doc does not result in out-of-bound access of tokens.
-    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
+    This is achieved by setting the head to itself if it would lie out of the span otherwise.
+    """
     # fmt: off
     words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."]
     heads = [1, 7, 1, 2, 7, 7, 7, 7, 9, 7, 7]
@@ -180,7 +181,8 @@ def test_issue3962(en_vocab):
 @pytest.mark.issue(3962)
 def test_issue3962_long(en_vocab):
     """Ensure that as_doc does not result in out-of-bound access of tokens.
-    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
+    This is achieved by setting the head to itself if it would lie out of the span otherwise.
+    """
     # fmt: off
     words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."]
     heads = [1, 1, 1, 2, 1, 7, 7, 7, 9, 7, 7]
|
@ -29,4 +29,16 @@ def test_ht_tokenizer_handles_basic_abbreviation(ht_tokenizer, text):
|
|||
def test_ht_tokenizer_full_sentence(ht_tokenizer):
|
||||
text = "Si'm ka vini, m'ap pale ak li."
|
||||
tokens = [t.text for t in ht_tokenizer(text)]
|
||||
assert tokens == ["Si", "'m", "ka", "vini", ",", "m'", "ap", "pale", "ak", "li", "."]
|
||||
assert tokens == [
|
||||
"Si",
|
||||
"'m",
|
||||
"ka",
|
||||
"vini",
|
||||
",",
|
||||
"m'",
|
||||
"ap",
|
||||
"pale",
|
||||
"ak",
|
||||
"li",
|
||||
".",
|
||||
]
|
||||
|
|
|
|||
|
|
@ -37,7 +37,9 @@ def test_ht_tokenizer_splits_uneven_wrap(ht_tokenizer, text):
|
|||
assert len(tokens) == 5
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)])
|
||||
@pytest.mark.parametrize(
|
||||
"text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)]
|
||||
)
|
||||
def test_ht_tokenizer_splits_prefix_interact(ht_tokenizer, text, length):
|
||||
tokens = ht_tokenizer(text)
|
||||
assert len(tokens) == length
|
||||
|
|
|
|||
|
|
@ -16,7 +16,6 @@ Nan Washington, Depatman Deta Etazini pibliye yon deklarasyon ki eksprime "regre
|
|||
assert len(tokens) == 84
|
||||
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text,length",
|
||||
[
|
||||
|
|
@@ -66,14 +65,14 @@ def test_ht_lex_attrs_capitals(word):


 @pytest.mark.parametrize(
-    "word, expected", [
+    "word, expected",
+    [
         ("'m", "mwen"),
         ("'n", "nou"),
         ("'l", "li"),
         ("'y", "yo"),
         ("'w", "ou"),
-    ]
+    ],
 )
 def test_ht_lex_attrs_norm_custom(word, expected):
     assert norm_custom(word) == expected
|
@ -304,9 +304,11 @@ TESTS.extend([x for i, x in enumerate(EXTRA_TESTS) if i % 10 == 0])
|
|||
SLOW_TESTS = [x for i, x in enumerate(EXTRA_TESTS) if i % 10 != 0]
|
||||
TESTS.extend(
|
||||
[
|
||||
pytest.param(x[0], x[1], marks=pytest.mark.slow())
|
||||
if not isinstance(x[0], tuple)
|
||||
else x
|
||||
(
|
||||
pytest.param(x[0], x[1], marks=pytest.mark.slow())
|
||||
if not isinstance(x[0], tuple)
|
||||
else x
|
||||
)
|
||||
for x in SLOW_TESTS
|
||||
]
|
||||
)
|
||||
|
|
|
|||
|
|
@ -544,7 +544,7 @@ def test_greedy_matching_longest(doc, text, pattern, longest):
|
|||
matcher = Matcher(doc.vocab)
|
||||
matcher.add("RULE", [pattern], greedy="LONGEST")
|
||||
matches = matcher(doc)
|
||||
for (key, s, e) in matches:
|
||||
for key, s, e in matches:
|
||||
assert doc[s:e].text == longest
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -496,15 +496,15 @@ def test_el_pipe_configuration(nlp):
|
|||
return [get_lowercased_candidates(kb, span) for span in spans]
|
||||
|
||||
@registry.misc("spacy.LowercaseCandidateGenerator.v1")
|
||||
def create_candidates() -> Callable[
|
||||
[InMemoryLookupKB, "Span"], Iterable[Candidate]
|
||||
]:
|
||||
def create_candidates() -> (
|
||||
Callable[[InMemoryLookupKB, "Span"], Iterable[Candidate]]
|
||||
):
|
||||
return get_lowercased_candidates
|
||||
|
||||
@registry.misc("spacy.LowercaseCandidateBatchGenerator.v1")
|
||||
def create_candidates_batch() -> Callable[
|
||||
[InMemoryLookupKB, Iterable["Span"]], Iterable[Iterable[Candidate]]
|
||||
]:
|
||||
def create_candidates_batch() -> (
|
||||
Callable[[InMemoryLookupKB, Iterable["Span"]], Iterable[Iterable[Candidate]]]
|
||||
):
|
||||
return get_lowercased_candidates_batch
|
||||
|
||||
# replace the pipe with a new one with with a different candidate generator
|
||||
|
|
|
|||
|
|
@ -279,20 +279,17 @@ def test_pipe_factories_wrong_formats():
|
|||
with pytest.raises(ValueError):
|
||||
# Decorator is not called
|
||||
@Language.component
|
||||
def component(foo: int, bar: str):
|
||||
...
|
||||
def component(foo: int, bar: str): ...
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
# Decorator is not called
|
||||
@Language.factory
|
||||
def factory1(foo: int, bar: str):
|
||||
...
|
||||
def factory1(foo: int, bar: str): ...
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
# Factory function is missing "nlp" and "name" arguments
|
||||
@Language.factory("test_pipe_factories_missing_args")
|
||||
def factory2(foo: int, bar: str):
|
||||
...
|
||||
def factory2(foo: int, bar: str): ...
|
||||
|
||||
|
||||
def test_pipe_factory_meta_config_cleanup():
|
||||
|
|
@ -329,8 +326,7 @@ def test_pipe_factories_empty_dict_default():
|
|||
name = "test_pipe_factories_empty_dict_default"
|
||||
|
||||
@Language.factory(name, default_config={"foo": {}})
|
||||
def factory(nlp: Language, name: str, foo: dict):
|
||||
...
|
||||
def factory(nlp: Language, name: str, foo: dict): ...
|
||||
|
||||
nlp = Language()
|
||||
nlp.create_pipe(name)
|
||||
|
|
@@ -549,11 +545,9 @@ def test_pipe_factories_from_source_config():


 class PipeFactoriesIdempotent:
-    def __init__(self, nlp, name):
-        ...
+    def __init__(self, nlp, name): ...

-    def __call__(self, doc):
-        ...
+    def __call__(self, doc): ...


 @pytest.mark.parametrize(
@@ -874,7 +874,8 @@ def test_textcat_eval_missing(multi_label: bool, spring_p: float):
 def test_textcat_loss(multi_label: bool, expected_loss: float):
     """
     multi-label: the missing 'spring' in gold_doc_2 doesn't incur an increase in loss
-    exclusive labels: the missing 'spring' in gold_doc_2 is interpreted as 0.0 and adds to the loss"""
+    exclusive labels: the missing 'spring' in gold_doc_2 is interpreted as 0.0 and adds to the loss
+    """
     train_examples = []
     nlp = English()
@@ -890,7 +890,7 @@ def test_cli_find_threshold(capsys):
         return docs

     def init_nlp(
-        components: Tuple[Tuple[str, Dict[str, Any]], ...] = ()
+        components: Tuple[Tuple[str, Dict[str, Any]], ...] = (),
     ) -> Tuple[Language, List[Example]]:
         new_nlp = English()
         new_nlp.add_pipe(  # type: ignore
@@ -57,9 +57,7 @@ class Doc:
         force: bool = ...,
     ) -> None: ...
     @classmethod
-    def get_extension(
-        cls, name: str
-    ) -> Tuple[
+    def get_extension(cls, name: str) -> Tuple[
         Optional[Any],
         Optional[DocMethod],
         Optional[Callable[[Doc], Any]],
@@ -68,9 +66,7 @@ class Doc:
     @classmethod
     def has_extension(cls, name: str) -> bool: ...
     @classmethod
-    def remove_extension(
-        cls, name: str
-    ) -> Tuple[
+    def remove_extension(cls, name: str) -> Tuple[
         Optional[Any],
         Optional[DocMethod],
         Optional[Callable[[Doc], Any]],
@@ -23,9 +23,7 @@ class Span:
         force: bool = ...,
     ) -> None: ...
     @classmethod
-    def get_extension(
-        cls, name: str
-    ) -> Tuple[
+    def get_extension(cls, name: str) -> Tuple[
         Optional[Any],
         Optional[SpanMethod],
         Optional[Callable[[Span], Any]],
@@ -34,9 +32,7 @@ class Span:
     @classmethod
     def has_extension(cls, name: str) -> bool: ...
     @classmethod
-    def remove_extension(
-        cls, name: str
-    ) -> Tuple[
+    def remove_extension(cls, name: str) -> Tuple[
         Optional[Any],
         Optional[SpanMethod],
         Optional[Callable[[Span], Any]],
@@ -27,9 +27,7 @@ class Token:
         force: bool = ...,
     ) -> None: ...
     @classmethod
-    def get_extension(
-        cls, name: str
-    ) -> Tuple[
+    def get_extension(cls, name: str) -> Tuple[
         Optional[Any],
         Optional[TokenMethod],
         Optional[Callable[[Token], Any]],
@@ -38,9 +36,7 @@ class Token:
     @classmethod
     def has_extension(cls, name: str) -> bool: ...
     @classmethod
-    def remove_extension(
-        cls, name: str
-    ) -> Tuple[
+    def remove_extension(cls, name: str) -> Tuple[
         Optional[Any],
         Optional[TokenMethod],
         Optional[Callable[[Token], Any]],
@@ -354,7 +354,7 @@ def update_meta(


 def create_before_to_disk_callback(
-    callback: Optional[Callable[["Language"], "Language"]]
+    callback: Optional[Callable[["Language"], "Language"]],
 ) -> Callable[["Language"], "Language"]:
     from ..language import Language  # noqa: F811
spacy/ty.py (18 changed lines)
@@ -30,11 +30,9 @@ class TrainableComponent(Protocol):
         drop: float = 0.0,
         sgd: Optional[Optimizer] = None,
         losses: Optional[Dict[str, float]] = None
-    ) -> Dict[str, float]:
-        ...
+    ) -> Dict[str, float]: ...

-    def finish_update(self, sgd: Optimizer) -> None:
-        ...
+    def finish_update(self, sgd: Optimizer) -> None: ...


 @runtime_checkable
@@ -44,8 +42,7 @@ class InitializableComponent(Protocol):
         get_examples: Callable[[], Iterable["Example"]],
         nlp: "Language",
         **kwargs: Any
-    ):
-        ...
+    ): ...


 @runtime_checkable
@@ -55,11 +52,8 @@ class ListenedToComponent(Protocol):
     listener_map: Dict[str, Sequence[Model]]
     listening_components: List[str]

-    def add_listener(self, listener: Model, component_name: str) -> None:
-        ...
+    def add_listener(self, listener: Model, component_name: str) -> None: ...

-    def remove_listener(self, listener: Model, component_name: str) -> bool:
-        ...
+    def remove_listener(self, listener: Model, component_name: str) -> bool: ...

-    def find_listeners(self, component) -> None:
-        ...
+    def find_listeners(self, component) -> None: ...
@@ -657,7 +657,7 @@ def load_model_from_config(


 def get_sourced_components(
-    config: Union[Dict[str, Any], Config]
+    config: Union[Dict[str, Any], Config],
 ) -> Dict[str, Dict[str, Any]]:
     """RETURNS (List[str]): All sourced components in the original config,
     e.g. {"source": "en_core_web_sm"}. If the config contains a key
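To reproduce a reformat like this one locally, the usual invocation is just `black` over the package directory, e.g. `pip install "black==25.*"` followed by `black spacy/`. The exact version pin is an assumption; the commit message only says "black 25".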