Mirror of https://github.com/explosion/spaCy.git
Tidy up and auto-format
commit e6accb3a9e
parent 817b0db521
@@ -105,12 +105,15 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
 
 
 def get_markdown(
-    data: Dict[str, Any], title: Optional[str] = None, exclude: Optional[List[str]] = None
+    data: Dict[str, Any],
+    title: Optional[str] = None,
+    exclude: Optional[List[str]] = None,
 ) -> str:
     """Get data in GitHub-flavoured Markdown format for issues etc.
 
-    data (dict or list of tuples): Label/value pairs.
-    title (str / None): Title, will be rendered as headline 2.
+    data (Dict[str, Any]): Label/value pairs.
+    title (str): Optional title, will be rendered as headline 2.
+    exclude (List[str]): Names of keys to exclude.
     RETURNS (str): The Markdown string.
     """
     md = MarkdownRenderer()
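A quick usage sketch of the reformatted signature, for orientation only: the module path is assumed from the surrounding info_model context, and the data values are illustrative.

    from spacy.cli.info import get_markdown  # assumed location of the helper

    data = {"spaCy version": "3.0.0", "Location": "/opt/spacy"}
    # Renders label/value pairs as GitHub-flavoured Markdown, with an optional
    # "## ..." headline and with any keys listed in `exclude` left out.
    md = get_markdown(data, title="Info about spaCy", exclude=["Location"])
    print(md)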
@@ -29,7 +29,9 @@ class Spanish(Language):
     default_config={"model": None, "mode": "rule", "overwrite": False},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool):
+def make_lemmatizer(
+    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+):
     return SpanishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
@@ -47,6 +47,7 @@ for exc_data in [ # "etc." abbreviations
     {ORTH: "көч.", NORM: "көчөсү"},
     {ORTH: "м-н", NORM: "менен"},
     {ORTH: "б-ча", NORM: "боюнча"},
-]: _exc[exc_data[ORTH]] = [exc_data]
+]:
+    _exc[exc_data[ORTH]] = [exc_data]
 
 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
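For context, a minimal sketch of what one of these exception entries does at tokenization time, assuming the Kyrgyz language class is exposed as spacy.lang.ky.Kyrgyz; the expected token texts mirror the ABBREV_TESTS data further down in this commit.

    from spacy.lang.ky import Kyrgyz  # assumed import path for the language class

    nlp = Kyrgyz()
    doc = nlp("Маселе б-ча эртең келет")
    # The exception {ORTH: "б-ча", NORM: "боюнча"} keeps "б-ча" as a single token
    # while normalising it to "боюнча".
    assert [t.text for t in doc] == ["Маселе", "б-ча", "эртең", "келет"]
    assert doc[1].norm_ == "боюнча"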
@@ -9,7 +9,6 @@ class MacedonianLemmatizer(Lemmatizer):
     def rule_lemmatize(self, token: Token) -> List[str]:
         string = token.text
         univ_pos = token.pos_.lower()
-        morphology = token.morph.to_dict()
 
         if univ_pos in ("", "eol", "space"):
             return [string.lower()]
@@ -1686,7 +1686,10 @@ class Language:
         return nlp
 
     def replace_listeners(
-        self, tok2vec_name: str, pipe_name: str, listeners: Iterable[str],
+        self,
+        tok2vec_name: str,
+        pipe_name: str,
+        listeners: Iterable[str],
     ) -> None:
         """Find listener layers (connecting to a token-to-vector embedding
         component) of a given pipeline component model and replace
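As a point of reference, the reformatted method is typically called as below; the pipeline, component names, and listener path are illustrative and depend on the pipeline's config.

    import spacy

    nlp = spacy.load("en_core_web_sm")  # illustrative trained pipeline with a shared tok2vec
    # Give "ner" its own copy of the shared "tok2vec" layer by replacing the
    # listener at the given path inside the ner model.
    nlp.replace_listeners("tok2vec", "ner", ["model.tok2vec"])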
@@ -135,7 +135,6 @@ class AttributeRuler(Pipe):
            ) from None
        set_token_attrs(span[index], attrs)
 
-
    def load_from_tag_map(
        self, tag_map: Dict[str, Dict[Union[int, str], Union[int, str]]]
    ) -> None:

@@ -217,7 +217,6 @@ class EntityLinker(TrainablePipe):
            return losses
        validate_examples(examples, "EntityLinker.update")
        sentence_docs = []
-        docs = [eg.predicted for eg in examples]
        for eg in examples:
            sentences = [s for s in eg.reference.sents]
            kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
@@ -1,10 +1,9 @@
-import srsly
-from thinc.api import Config
 from typing import Dict, Any
+import srsly
 
 from ..language import Language
 from ..matcher import Matcher
 from ..tokens import Doc
-from ..util import filter_spans
 from .. import util

@@ -64,7 +63,7 @@ def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc:
     merger = Matcher(doc.vocab)
     merger.add("SUBTOK", [[{"DEP": label, "op": "+"}]])
     matches = merger(doc)
-    spans = filter_spans([doc[start : end + 1] for _, start, end in matches])
+    spans = util.filter_spans([doc[start : end + 1] for _, start, end in matches])
     with doc.retokenize() as retokenizer:
         for span in spans:
             retokenizer.merge(span)
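merge_subtokens is also usable as a pipeline component; a brief sketch of how it is typically applied to a parsed Doc, assuming the component is registered under the name "merge_subtokens" as in spaCy's built-in components, with an illustrative model name.

    import spacy

    nlp = spacy.load("en_core_web_sm")  # illustrative pipeline with a parser
    # Merge runs of tokens that the parser labelled with the "subtok" dependency
    # back into single tokens, using the Matcher + retokenizer logic shown above.
    nlp.add_pipe("merge_subtokens", after="parser")
    doc = nlp("Some text to parse")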
@@ -77,15 +76,9 @@ def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc:
     retokenizes=True,
 )
 def make_token_splitter(
-    nlp: Language,
-    name: str,
-    *,
-    min_length=0,
-    split_length=0,
+    nlp: Language, name: str, *, min_length=0, split_length=0,
 ):
-    return TokenSplitter(
-        min_length=min_length, split_length=split_length
-    )
+    return TokenSplitter(min_length=min_length, split_length=split_length)
 
 
 class TokenSplitter:
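make_token_splitter backs the factory whose decorator closes just above it; a short usage sketch, assuming the factory is registered as "token_splitter" as in spaCy's built-in components, with illustrative threshold values.

    import spacy

    nlp = spacy.blank("en")
    # Split any token longer than min_length characters into split_length-sized
    # pieces; the config keys match the make_token_splitter parameters above.
    nlp.add_pipe("token_splitter", config={"min_length": 20, "split_length": 5})
    doc = nlp("averyverylongtokenthatkeepsgoing and a short one")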
@@ -1,5 +1,4 @@
-from typing import Optional, List, Dict, Any, Callable, Iterable, Iterator, Union
-from typing import Tuple
+from typing import Optional, List, Dict, Any, Callable, Iterable, Union, Tuple
 from thinc.api import Model
 from pathlib import Path

@@ -641,7 +641,8 @@ def test_doc_noun_chunks_not_implemented():
     nlp = MultiLanguage()
     doc = nlp(text)
     with pytest.raises(NotImplementedError):
-        chunks = list(doc.noun_chunks)
+        _ = list(doc.noun_chunks)  # noqa: F841
 
+
 def test_span_groups(en_tokenizer):
     doc = en_tokenizer("Some text about Colombia and the Czech Republic")

@@ -18,9 +18,7 @@ PUNC_INSIDE_WORDS_TESTS = [
     ('То"кой', 'То " кой'.split()),
 ]
 
-MIXED_ORDINAL_NUMS_TESTS = [
-    ("Эртең 22-январь...", "Эртең 22 - январь ...".split())
-]
+MIXED_ORDINAL_NUMS_TESTS = [("Эртең 22-январь...", "Эртең 22 - январь ...".split())]
 
 ABBREV_TESTS = [
     ("Маселе б-ча эртең келет", "Маселе б-ча эртең келет".split()),

@@ -2,7 +2,6 @@ import pytest
 import pickle
 import re
 import copy
-import logging
 from mock import Mock
 from spacy.matcher import DependencyMatcher
 from spacy.tokens import Doc

@@ -343,6 +342,5 @@ def test_dependency_matcher_long_matches(en_vocab, doc):
     ]
 
     matcher = DependencyMatcher(en_vocab)
-    logger = logging.getLogger("spacy")
     with pytest.raises(ValueError):
         matcher.add("pattern", [pattern])

@@ -322,4 +322,4 @@ def test_phrase_matcher_deprecated(en_vocab):
 
 @pytest.mark.parametrize("attr", ["SENT_START", "IS_SENT_START"])
 def test_phrase_matcher_sent_start(en_vocab, attr):
-    matcher = PhraseMatcher(en_vocab, attr=attr)
+    _ = PhraseMatcher(en_vocab, attr=attr)  # noqa: F841

@@ -6,7 +6,6 @@ from spacy.pipeline.tok2vec import Tok2Vec, Tok2VecListener
 from spacy.vocab import Vocab
 from spacy.tokens import Doc
 from spacy.training import Example
-from spacy.training.initialize import init_nlp
 from spacy import util
 from spacy.lang.en import English
 from thinc.api import Config
@@ -1,6 +1,21 @@
-import pytest
+from spacy.util import filter_spans
 from pydantic import ValidationError
 from spacy.schemas import TokenPattern, TokenPatternSchema
+import pytest
 
 
+def test_issue6207(en_tokenizer):
+    doc = en_tokenizer("zero one two three four five six")
+
+    # Make spans
+    s1 = doc[:4]
+    s2 = doc[3:6]  # overlaps with s1
+    s3 = doc[5:7]  # overlaps with s2, not s1
+
+    result = filter_spans((s1, s2, s3))
+    assert s1 in result
+    assert s2 not in result
+    assert s3 in result
+
+
 def test_issue6258():
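The moved test leans on the behaviour of spacy.util.filter_spans: overlapping spans are reduced by keeping the longest span (the first one on ties). A self-contained restatement of the example, with spacy.blank("en") standing in for the en_tokenizer fixture.

    import spacy
    from spacy.util import filter_spans

    nlp = spacy.blank("en")
    doc = nlp("zero one two three four five six")
    s1, s2, s3 = doc[:4], doc[3:6], doc[5:7]
    # s2 overlaps the longer s1 and is dropped; s3 only overlapped s2, so it stays.
    kept = filter_spans((s1, s2, s3))
    assert s1 in kept and s3 in kept and s2 not in kept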
@@ -1,15 +0,0 @@
-from spacy.util import filter_spans
-
-
-def test_issue6207(en_tokenizer):
-    doc = en_tokenizer("zero one two three four five six")
-
-    # Make spans
-    s1 = doc[:4]
-    s2 = doc[3:6]  # overlaps with s1
-    s3 = doc[5:7]  # overlaps with s2, not s1
-
-    result = filter_spans((s1, s2, s3))
-    assert s1 in result
-    assert s2 not in result
-    assert s3 in result
@@ -4,9 +4,8 @@ import numpy as np
 
 
 @pytest.mark.parametrize(
-    "sentence, start_idx,end_idx,label", [
-        ('Welcome to Mumbai, my friend',11,17,'GPE')
-    ]
+    "sentence, start_idx,end_idx,label",
+    [("Welcome to Mumbai, my friend", 11, 17, "GPE")],
 )
 def test_char_span_label(sentence, start_idx, end_idx, label):
     nlp = English()

@@ -14,10 +13,9 @@ def test_char_span_label(sentence, start_idx, end_idx, label):
     span = doc[:].char_span(start_idx, end_idx, label=label)
     assert span.label_ == label
 
 
 @pytest.mark.parametrize(
-    "sentence, start_idx,end_idx,kb_id", [
-        ('Welcome to Mumbai, my friend',11,17,5)
-    ]
+    "sentence, start_idx,end_idx,kb_id", [("Welcome to Mumbai, my friend", 11, 17, 5)]
 )
 def test_char_span_kb_id(sentence, start_idx, end_idx, kb_id):
     nlp = English()

@@ -25,10 +23,10 @@ def test_char_span_kb_id(sentence, start_idx, end_idx, kb_id):
     span = doc[:].char_span(start_idx, end_idx, kb_id=kb_id)
     assert span.kb_id == kb_id
 
 
 @pytest.mark.parametrize(
-    "sentence, start_idx,end_idx,vector", [
-        ('Welcome to Mumbai, my friend',11,17,np.array([0.1,0.2,0.3]))
-    ]
+    "sentence, start_idx,end_idx,vector",
+    [("Welcome to Mumbai, my friend", 11, 17, np.array([0.1, 0.2, 0.3]))],
 )
 def test_char_span_vector(sentence, start_idx, end_idx, vector):
     nlp = English()
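These tests exercise Span.char_span on a full-doc slice; a minimal sketch of the call being parametrized, using the same sentence and offsets as the test data (characters 11-17 cover "Mumbai"), with spacy.blank("en") standing in for English().

    import spacy

    nlp = spacy.blank("en")
    doc = nlp("Welcome to Mumbai, my friend")
    # Create a span from character offsets on the doc[:] slice and attach a label.
    span = doc[:].char_span(11, 17, label="GPE")
    assert span.text == "Mumbai" and span.label_ == "GPE"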
@@ -186,6 +186,7 @@ def test_language_pipe_error_handler():
 
 def test_language_pipe_error_handler_custom(en_vocab):
     """Test the error handling of a custom component that has no pipe method"""
+
     @Language.component("my_evil_component")
     def evil_component(doc):
         if "2" in doc.text:

@@ -194,6 +195,7 @@ def test_language_pipe_error_handler_custom(en_vocab):
 
     def warn_error(proc_name, proc, docs, e):
         from spacy.util import logger
+
         logger.warning(f"Trouble with component {proc_name}.")
 
     nlp = English()

@@ -217,6 +219,7 @@ def test_language_pipe_error_handler_custom(en_vocab):
 
 def test_language_pipe_error_handler_pipe(en_vocab):
     """Test the error handling of a component's pipe method"""
+
     @Language.component("my_sentences")
     def perhaps_set_sentences(doc):
         if not doc.text.startswith("4"):