Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-23 15:54:13 +03:00)
Commit 14513f82da

@@ -699,9 +699,7 @@ def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
     return count
 
 
-def _get_labels_from_model(
-    nlp: Language, factory_name: str
-) -> Set[str]:
+def _get_labels_from_model(nlp: Language, factory_name: str) -> Set[str]:
     pipe_names = [
         pipe_name
         for pipe_name in nlp.pipe_names

@@ -714,9 +712,7 @@ def _get_labels_from_model(
     return labels
 
 
-def _get_labels_from_spancat(
-    nlp: Language
-) -> Dict[str, Set[str]]:
+def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
     pipe_names = [
         pipe_name
         for pipe_name in nlp.pipe_names

@@ -6,6 +6,11 @@ can help generate the best possible configuration, given a user's requirements.
 [paths]
 train = null
 dev = null
+{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
+vectors = null
+{% else -%}
+vectors = "{{ word_vectors }}"
+{% endif -%}
 
 [system]
 {% if use_transformer -%}

@@ -421,8 +426,4 @@ compound = 1.001
 {% endif %}
 
 [initialize]
-{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
 vectors = ${paths.vectors}
-{% else -%}
-vectors = "{{ word_vectors }}"
-{% endif -%}

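Taken together, these two template hunks move the static-vectors choice into `[paths]` and let `[initialize]` interpolate it. A sketch of what the non-transformer, accuracy-optimized branch renders to, assuming `word_vectors` is set to `en_core_web_lg`:

```ini
[paths]
train = null
dev = null
vectors = "en_core_web_lg"

[initialize]
vectors = ${paths.vectors}
```
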
@@ -90,7 +90,7 @@ _eleven_to_beyond = [
     "अड़सठ",
     "उनहत्तर",
     "सत्तर",
-    "इकहत्तर"
+    "इकहत्तर",
     "बहत्तर",
     "तिहत्तर",
     "चौहत्तर",

@@ -59,7 +59,7 @@ sentences = [
     "Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?",
     "Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”.",
     "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares.",
-    "Carros autônomos empurram a responsabilidade do seguro para os fabricantes.."
+    "Carros autônomos empurram a responsabilidade do seguro para os fabricantes..",
     "São Francisco considera banir os robôs de entrega que andam pelas calçadas.",
     "Londres é a maior cidade do Reino Unido.",
     # Translations from English:

@@ -354,12 +354,15 @@ class Language:
     @property
     def pipe_labels(self) -> Dict[str, List[str]]:
         """Get the labels set by the pipeline components, if available (if
-        the component exposes a labels property).
+        the component exposes a labels property and the labels are not
+        hidden).
 
         RETURNS (Dict[str, List[str]]): Labels keyed by component name.
         """
         labels = {}
         for name, pipe in self._components:
+            if hasattr(pipe, "hide_labels") and pipe.hide_labels is True:
+                continue
             if hasattr(pipe, "labels"):
                 labels[name] = list(pipe.labels)
         return SimpleFrozenDict(labels)

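A minimal sketch of the effect (the `senter` change appears further down in this commit): the component's internal "I"/"S" labels stay visible on the pipe itself but are skipped by the aggregated `Language.pipe_labels` mapping.

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("senter")   # defines hide_labels = True (see the senter hunk below)
nlp.add_pipe("tagger")   # exposes its labels as before

print(nlp.get_pipe("senter").labels)  # ('I', 'S') - still on the pipe
print("senter" in nlp.pipe_labels)    # False - hidden from the aggregate view
print("tagger" in nlp.pipe_labels)    # True
```
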
@@ -522,7 +525,7 @@ class Language:
         requires: Iterable[str] = SimpleFrozenList(),
         retokenizes: bool = False,
         func: Optional["Pipe"] = None,
-    ) -> Callable:
+    ) -> Callable[..., Any]:
         """Register a new pipeline component. Can be used for stateless function
         components that don't require a separate factory. Can be used as a
         decorator on a function or classmethod, or called as a function with the

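For reference, the usage the tightened `Callable[..., Any]` annotation covers is the standard stateless-component decorator:

```python
import spacy
from spacy.language import Language

# Stateless function component registered via Language.component.
@Language.component("debug_tokens")
def debug_tokens(doc):
    print([t.text for t in doc])
    return doc

nlp = spacy.blank("en")
nlp.add_pipe("debug_tokens")
nlp("Hello world")
```
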
spacy/matcher/dependencymatcher.pyi (new file, 66 lines)

@@ -0,0 +1,66 @@
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from .matcher import Matcher
+from ..vocab import Vocab
+from ..tokens.doc import Doc
+from ..tokens.span import Span
+
+class DependencyMatcher:
+    """Match dependency parse tree based on pattern rules."""
+
+    _patterns: Dict[str, List[Any]]
+    _raw_patterns: Dict[str, List[Any]]
+    _tokens_to_key: Dict[str, List[Any]]
+    _root: Dict[str, List[Any]]
+    _tree: Dict[str, List[Any]]
+    _callbacks: Dict[
+        Any, Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any]
+    ]
+    _ops: Dict[str, Any]
+    vocab: Vocab
+    _matcher: Matcher
+    def __init__(self, vocab: Vocab, *, validate: bool = ...) -> None: ...
+    def __reduce__(
+        self,
+    ) -> Tuple[
+        Callable[
+            [Vocab, Dict[str, Any], Dict[str, Callable[..., Any]]], DependencyMatcher
+        ],
+        Tuple[
+            Vocab,
+            Dict[str, List[Any]],
+            Dict[
+                str,
+                Callable[
+                    [DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any
+                ],
+            ],
+        ],
+        None,
+        None,
+    ]: ...
+    def __len__(self) -> int: ...
+    def __contains__(self, key: Union[str, int]) -> bool: ...
+    def add(
+        self,
+        key: Union[str, int],
+        patterns: List[List[Dict[str, Any]]],
+        *,
+        on_match: Optional[
+            Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any]
+        ] = ...
+    ) -> None: ...
+    def has_key(self, key: Union[str, int]) -> bool: ...
+    def get(
+        self, key: Union[str, int], default: Optional[Any] = ...
+    ) -> Tuple[
+        Optional[
+            Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any]
+        ],
+        List[List[Dict[str, Any]]],
+    ]: ...
+    def remove(self, key: Union[str, int]) -> None: ...
+    def __call__(self, doclike: Union[Doc, Span]) -> List[Tuple[int, List[int]]]: ...
+
+def unpickle_matcher(
+    vocab: Vocab, patterns: Dict[str, Any], callbacks: Dict[str, Callable[..., Any]]
+) -> DependencyMatcher: ...

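A short usage sketch of the API the new stub describes. It assumes the `en_core_web_sm` model is installed; any pipeline with a dependency parser works, and the exact matches depend on the parse.

```python
import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.load("en_core_web_sm")
matcher = DependencyMatcher(nlp.vocab)

# One pattern: the verb "founded" plus its direct object.
pattern = [
    {"RIGHT_ID": "verb", "RIGHT_ATTRS": {"ORTH": "founded"}},
    {
        "LEFT_ID": "verb",
        "REL_OP": ">",
        "RIGHT_ID": "object",
        "RIGHT_ATTRS": {"DEP": "dobj"},
    },
]
matcher.add("FOUNDED", [pattern])

doc = nlp("Smith founded a healthcare company.")
# __call__ returns List[Tuple[int, List[int]]]: (match_id, token_ids)
for match_id, token_ids in matcher(doc):
    print([doc[i].text for i in token_ids])  # e.g. ['founded', 'company']
```
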
@@ -1,4 +1,6 @@
-from typing import Any, List, Dict, Tuple, Optional, Callable, Union, Iterator, Iterable
+from typing import Any, List, Dict, Tuple, Optional, Callable, Union
+from typing import Iterator, Iterable, overload
+from ..compat import Literal
 from ..vocab import Vocab
 from ..tokens import Doc, Span
 

@@ -31,12 +33,22 @@ class Matcher:
     ) -> Union[
         Iterator[Tuple[Tuple[Doc, Any], Any]], Iterator[Tuple[Doc, Any]], Iterator[Doc]
     ]: ...
+    @overload
     def __call__(
         self,
         doclike: Union[Doc, Span],
         *,
-        as_spans: bool = ...,
+        as_spans: Literal[False] = ...,
         allow_missing: bool = ...,
         with_alignments: bool = ...
-    ) -> Union[List[Tuple[int, int, int]], List[Span]]: ...
+    ) -> List[Tuple[int, int, int]]: ...
+    @overload
+    def __call__(
+        self,
+        doclike: Union[Doc, Span],
+        *,
+        as_spans: Literal[True],
+        allow_missing: bool = ...,
+        with_alignments: bool = ...
+    ) -> List[Span]: ...
     def _normalize_key(self, key: Any) -> Any: ...

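With the `Literal`-based overloads, a type checker can narrow the return type from the old `Union` based on the value passed for `as_spans`. A minimal sketch:

```python
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("HELLO", [[{"LOWER": "hello"}, {"LOWER": "world"}]])
doc = nlp("hello world")

matches = matcher(doc)               # inferred: List[Tuple[int, int, int]]
match_id, start, end = matches[0]

spans = matcher(doc, as_spans=True)  # inferred: List[Span]
assert isinstance(spans[0], Span)
```
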
@@ -1,6 +1,7 @@
-from typing import List, Tuple, Union, Optional, Callable, Any, Dict
-from . import Matcher
+from typing import List, Tuple, Union, Optional, Callable, Any, Dict, overload
+from ..compat import Literal
+from .matcher import Matcher
 from ..vocab import Vocab
 from ..tokens import Doc, Span
 

@@ -21,9 +21,17 @@ class PhraseMatcher:
         ] = ...,
     ) -> None: ...
     def remove(self, key: str) -> None: ...
+    @overload
     def __call__(
         self,
         doclike: Union[Doc, Span],
         *,
-        as_spans: bool = ...,
-    ) -> Union[List[Tuple[int, int, int]], List[Span]]: ...
+        as_spans: Literal[False] = ...,
+    ) -> List[Tuple[int, int, int]]: ...
+    @overload
+    def __call__(
+        self,
+        doclike: Union[Doc, Span],
+        *,
+        as_spans: Literal[True],
+    ) -> List[Span]: ...

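The same narrowing applies to `PhraseMatcher`; a quick sketch:

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab)
matcher.add("OBAMA", [nlp("Barack Obama")])
doc = nlp("Barack Obama visited Berlin.")

matches = matcher(doc)               # List[Tuple[int, int, int]]
spans = matcher(doc, as_spans=True)  # List[Span]
print([s.text for s in spans])       # ['Barack Obama']
```
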
@@ -26,6 +26,8 @@ class Pipe:
     @property
     def labels(self) -> Tuple[str, ...]: ...
     @property
+    def hide_labels(self) -> bool: ...
+    @property
     def label_data(self) -> Any: ...
     def _require_labels(self) -> None: ...
     def set_error_handler(

@@ -102,6 +102,10 @@ cdef class Pipe:
     def labels(self) -> Tuple[str, ...]:
        return tuple()
 
+    @property
+    def hide_labels(self) -> bool:
+        return False
+
     @property
     def label_data(self):
         """Optional JSON-serializable data that would be sufficient to recreate

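Because `Language.pipe_labels` only checks `hasattr(pipe, "hide_labels")`, any custom component can opt out the same way. A hypothetical sketch (the component name and labels here are made up):

```python
from spacy.language import Language

@Language.factory("coin_flagger")
class CoinFlagger:
    """Hypothetical component whose labels are an internal detail."""

    def __init__(self, nlp, name):
        self.name = name

    @property
    def labels(self):
        return ("HEADS", "TAILS")

    @property
    def hide_labels(self):
        # Language.pipe_labels sees this and skips the component
        return True

    def __call__(self, doc):
        return doc
```
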
@@ -99,6 +99,10 @@ class SentenceRecognizer(Tagger):
         # are 0
         return tuple(["I", "S"])
 
+    @property
+    def hide_labels(self):
+        return True
+
     @property
     def label_data(self):
         return None

@@ -413,7 +413,7 @@ class SpanCategorizer(TrainablePipe):
         self._require_labels()
         if subbatch:
             docs = [eg.x for eg in subbatch]
-            spans = self.suggester(docs)
+            spans = build_ngram_suggester(sizes=[1])(docs)
             Y = self.model.ops.alloc2f(spans.dataXd.shape[0], len(self.labels))
             self.model.initialize(X=(docs, spans), Y=Y)
         else:

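Initialization now sizes the model with a fixed unigram suggester rather than the configured one, presumably so that custom suggesters depending on other (possibly uninitialized) components can't break `initialize`. A quick sketch of what `build_ngram_suggester(sizes=[1])` produces:

```python
import spacy
from spacy.pipeline.spancat import build_ngram_suggester

nlp = spacy.blank("en")
docs = [nlp("a b c")]

suggester = build_ngram_suggester(sizes=[1])
spans = suggester(docs)           # a thinc Ragged of (start, end) rows
print(spans.dataXd.shape[0])      # 3 unigram candidates, one per token
```
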
@@ -97,3 +97,7 @@ def test_overfitting_IO():
     ]
     assert_equal(batch_deps_1, batch_deps_2)
     assert_equal(batch_deps_1, no_batch_deps)
+
+    # test internal pipe labels vs. Language.pipe_labels with hidden labels
+    assert nlp.get_pipe("senter").labels == ("I", "S")
+    assert "senter" not in nlp.pipe_labels

@@ -79,6 +79,7 @@ def test_explicit_labels():
     nlp.initialize()
     assert spancat.labels == ("PERSON", "LOC")
 
+
 # TODO figure out why this is flaky
 @pytest.mark.skip(reason="Test is unreliable for unknown reason")
 def test_doc_gc():

@@ -9,6 +9,7 @@ from spacy.tokenizer import Tokenizer
 from spacy.tokens import Doc
 from spacy.training import Example
 from spacy.util import compile_prefix_regex, compile_suffix_regex, ensure_path
+from spacy.util import compile_infix_regex
 from spacy.vocab import Vocab
 from spacy.symbols import ORTH

@@ -503,3 +504,20 @@ def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab):
     assert tokens == ["a", "10", "."]
     explain_tokens = [t[1] for t in tokenizer.explain("a10.")]
     assert tokens == explain_tokens
+
+
+def test_tokenizer_infix_prefix(en_vocab):
+    # the prefix and suffix matches overlap in the suffix lookbehind
+    infixes = ["±"]
+    suffixes = ["%"]
+    infix_re = compile_infix_regex(infixes)
+    suffix_re = compile_suffix_regex(suffixes)
+    tokenizer = Tokenizer(
+        en_vocab,
+        infix_finditer=infix_re.finditer,
+        suffix_search=suffix_re.search,
+    )
+    tokens = [t.text for t in tokenizer("±10%")]
+    assert tokens == ["±10", "%"]
+    explain_tokens = [t[1] for t in tokenizer.explain("±10%")]
+    assert tokens == explain_tokens

@@ -683,6 +683,8 @@ cdef class Tokenizer:
             infixes = infix_finditer(substring)
             offset = 0
             for match in infixes:
+                if offset == 0 and match.start() == 0:
+                    continue
                 if substring[offset : match.start()]:
                     tokens.append(("TOKEN", substring[offset : match.start()]))
                 if substring[match.start() : match.end()]:

@@ -10,7 +10,7 @@ from ..lexeme import Lexeme
 from ..vocab import Vocab
 from .underscore import Underscore
 from pathlib import Path
-import numpy
+import numpy as np
 
 class DocMethod(Protocol):
     def __call__(self: Doc, *args: Any, **kwargs: Any) -> Any: ...  # type: ignore[misc]

@@ -26,7 +26,7 @@ class Doc:
     user_hooks: Dict[str, Callable[..., Any]]
     user_token_hooks: Dict[str, Callable[..., Any]]
     user_span_hooks: Dict[str, Callable[..., Any]]
-    tensor: numpy.ndarray
+    tensor: np.ndarray[Any, np.dtype[np.float_]]
     user_data: Dict[str, Any]
     has_unknown_spaces: bool
     _context: Any

@@ -144,7 +144,7 @@ class Doc:
     ) -> Doc: ...
     def to_array(
         self, py_attr_ids: Union[int, str, List[Union[int, str]]]
-    ) -> numpy.ndarray: ...
+    ) -> np.ndarray[Any, np.dtype[np.float_]]: ...
     @staticmethod
     def from_docs(
         docs: List[Doc],

@@ -459,8 +459,8 @@ cdef class Span:
 
     @property
     def ents(self):
-        """The named entities in the span. Returns a tuple of named entity
-        `Span` objects, if the entity recognizer has been applied.
+        """The named entities that fall completely within the span. Returns
+        a tuple of `Span` objects.
 
         RETURNS (tuple): Entities in the span, one `Span` per entity.
 

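The docstring change pins down the containment semantics: an entity counts only if it lies entirely inside the span. A sketch, assuming `en_core_web_sm` is installed (the exact entities depend on the model):

```python
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Barack Obama visited Berlin in 2016.")

span = doc[0:3]                       # "Barack Obama visited"
print([e.text for e in span.ents])    # ['Barack Obama'] - fully contained

span = doc[1:3]                       # "Obama visited" - cuts the entity
print([e.text for e in span.ents])    # [] - partially overlapping entity excluded
```
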
@@ -1,17 +1,31 @@
-from typing import Dict, Any
+from typing import Dict, Any, List, Optional, Tuple, Union, TYPE_CHECKING
 import functools
 import copy
 
 from ..errors import Errors
 
+if TYPE_CHECKING:
+    from .doc import Doc
+    from .span import Span
+    from .token import Token
+
 
 class Underscore:
     mutable_types = (dict, list, set)
     doc_extensions: Dict[Any, Any] = {}
     span_extensions: Dict[Any, Any] = {}
     token_extensions: Dict[Any, Any] = {}
+    _extensions: Dict[str, Any]
+    _obj: Union["Doc", "Span", "Token"]
+    _start: Optional[int]
+    _end: Optional[int]
 
-    def __init__(self, extensions, obj, start=None, end=None):
+    def __init__(
+        self,
+        extensions: Dict[str, Any],
+        obj: Union["Doc", "Span", "Token"],
+        start: Optional[int] = None,
+        end: Optional[int] = None,
+    ):
         object.__setattr__(self, "_extensions", extensions)
         object.__setattr__(self, "_obj", obj)
         # Assumption is that for doc values, _start and _end will both be None

@@ -23,12 +37,12 @@ class Underscore:
         object.__setattr__(self, "_start", start)
         object.__setattr__(self, "_end", end)
 
-    def __dir__(self):
+    def __dir__(self) -> List[str]:
         # Hack to enable autocomplete on custom extensions
         extensions = list(self._extensions.keys())
         return ["set", "get", "has"] + extensions
 
-    def __getattr__(self, name):
+    def __getattr__(self, name: str) -> Any:
         if name not in self._extensions:
             raise AttributeError(Errors.E046.format(name=name))
         default, method, getter, setter = self._extensions[name]

@@ -56,7 +70,7 @@ class Underscore:
                 return new_default
             return default
 
-    def __setattr__(self, name, value):
+    def __setattr__(self, name: str, value: Any):
         if name not in self._extensions:
             raise AttributeError(Errors.E047.format(name=name))
         default, method, getter, setter = self._extensions[name]

@@ -65,28 +79,30 @@ class Underscore:
         else:
             self._doc.user_data[self._get_key(name)] = value
 
-    def set(self, name, value):
+    def set(self, name: str, value: Any):
         return self.__setattr__(name, value)
 
-    def get(self, name):
+    def get(self, name: str) -> Any:
         return self.__getattr__(name)
 
-    def has(self, name):
+    def has(self, name: str) -> bool:
         return name in self._extensions
 
-    def _get_key(self, name):
+    def _get_key(self, name: str) -> Tuple[str, str, Optional[int], Optional[int]]:
         return ("._.", name, self._start, self._end)
 
     @classmethod
-    def get_state(cls):
+    def get_state(cls) -> Tuple[Dict[Any, Any], Dict[Any, Any], Dict[Any, Any]]:
         return cls.token_extensions, cls.span_extensions, cls.doc_extensions
 
     @classmethod
-    def load_state(cls, state):
+    def load_state(
+        cls, state: Tuple[Dict[Any, Any], Dict[Any, Any], Dict[Any, Any]]
+    ) -> None:
         cls.token_extensions, cls.span_extensions, cls.doc_extensions = state
 
 
-def get_ext_args(**kwargs):
+def get_ext_args(**kwargs: Any):
     """Validate and convert arguments. Reused in Doc, Token and Span."""
     default = kwargs.get("default")
     getter = kwargs.get("getter")

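`Underscore` is the machinery behind the `._.` extension attributes; the new annotations spell out that it wraps a `Doc`, `Span` or `Token` plus an optional start/end slice. A minimal usage sketch:

```python
import spacy
from spacy.tokens import Doc

Doc.set_extension("sentiment", default=0.0)

nlp = spacy.blank("en")
doc = nlp("I like it.")
doc._.sentiment = 0.9
print(doc._.get("sentiment"))  # 0.9 - equivalent to doc._.sentiment
print(doc._.has("sentiment"))  # True
```
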
website/Dockerfile (new file, 16 lines)

@@ -0,0 +1,16 @@
+FROM node:11.15.0
+
+WORKDIR /spacy-io
+
+RUN npm install -g gatsby-cli@2.7.4
+
+COPY package.json .
+COPY package-lock.json .
+
+RUN npm install
+
+# This is so the installed node_modules will be up one directory
+# from where a user mounts files, so that they don't accidentally mount
+# their own node_modules from a different build
+# https://nodejs.org/api/modules.html#modules_loading_from_node_modules_folders
+WORKDIR /spacy-io/website/

@@ -554,6 +554,42 @@ extensions for your code editor. The
 [`.prettierrc`](https://github.com/explosion/spaCy/tree/master/website/.prettierrc)
 file in the root defines the settings used in this codebase.
 
+## Building & developing the site with Docker {#docker}
+
+Sometimes it's hard to get a local environment working due to rapid updates to
+node dependencies, so it may be easier to use Docker for building the docs.
+
+If you'd like to do this, **be sure you do _not_ include your local
+`node_modules` folder**, since there are some dependencies that need to be
+built for the image system. Rename it before building.
+
+```bash
+docker run -it \
+  -v $(pwd):/spacy-io/website \
+  -p 8000:8000 \
+  ghcr.io/explosion/spacy-io \
+  gatsby develop -H 0.0.0.0
+```
+
+This will allow you to access the built website at http://0.0.0.0:8000/ in
+your browser, and still edit code in your editor while having the site
+reflect those changes.
+
+**Note**: If you're working on a Mac with an M1 processor, you might see
+segfault errors from `qemu` if you use the default image. To fix this, use
+the `arm64`-tagged image in the `docker run` command
+(`ghcr.io/explosion/spacy-io:arm64`).
+
+### Building the Docker image {#docker-build}
+
+If you'd like to build the image locally, you can do so like this:
+
+```bash
+docker build -t spacy-io .
+```
+
+This will take some time, so using the prebuilt image will save you a bit of time.
+
 ## Markdown reference {#markdown}
 
 All page content and page meta lives in the `.md` files in the `/docs`

@@ -257,8 +257,8 @@ shape `(N, M)`, where `N` is the length of the document. The values will be
 
 ## Span.ents {#ents tag="property" new="2.0.13" model="ner"}
 
-The named entities in the span. Returns a tuple of named entity `Span` objects,
-if the entity recognizer has been applied.
+The named entities that fall completely within the span. Returns a tuple of
+`Span` objects.
 
 > #### Example
 >

@@ -831,6 +831,8 @@ def tokenizer_pseudo_code(
             infixes = infix_finditer(substring)
             offset = 0
             for match in infixes:
+                if offset == 0 and match.start() == 0:
+                    continue
                 tokens.append(substring[offset : match.start()])
                 tokens.append(substring[match.start() : match.end()])
                 offset = match.end()

@@ -141,7 +141,8 @@
         "website": "https://www.nr.no/~plison"
       },
       "category": ["pipeline", "standalone", "research", "training"],
-      "tags": []
+      "tags": [],
+      "spacy_version": 3
     },
     {
       "id": "numerizer",

@@ -977,6 +978,48 @@
       "category": ["pipeline"],
       "tags": ["pipeline", "danish"]
     },
+    {
+      "id": "spacy-wrap",
+      "title": "spaCy-wrap",
+      "slogan": "For wrapping fine-tuned transformers in spaCy pipelines",
+      "description": "spaCy-wrap is a wrapper library for spaCy for including fine-tuned transformers from Huggingface in your spaCy pipeline allowing inclusion of existing models within existing workflows.",
+      "github": "kennethenevoldsen/spacy-wrap",
+      "pip": "spacy_wrap",
+      "code_example": [
+        "import spacy",
+        "import spacy_wrap",
+        "",
+        "nlp = spacy.blank('en')",
+        "config = {",
+        "    'doc_extension_trf_data': 'clf_trf_data',  # document extension for the forward pass",
+        "    'doc_extension_prediction': 'sentiment',  # document extension for the prediction",
+        "    'labels': ['negative', 'neutral', 'positive'],",
+        "    'model': {",
+        "        'name': 'cardiffnlp/twitter-roberta-base-sentiment',  # the model name or path of huggingface model",
+        "    },",
+        "}",
+        "",
+        "transformer = nlp.add_pipe('classification_transformer', config=config)",
+        "transformer.model.initialize()",
+        "",
+        "doc = nlp('spaCy is a wonderful tool')",
+        "",
+        "print(doc._.clf_trf_data)",
+        "# TransformerData(wordpieces=...",
+        "print(doc._.sentiment)",
+        "# 'positive'",
+        "print(doc._.sentiment_prob)",
+        "# {'prob': array([0.004, 0.028, 0.969], dtype=float32), 'labels': ['negative', 'neutral', 'positive']}"
+      ],
+      "thumb": "https://raw.githubusercontent.com/KennethEnevoldsen/spacy-wrap/main/docs/_static/icon.png",
+      "author": "Kenneth Enevoldsen",
+      "author_links": {
+        "github": "KennethEnevoldsen",
+        "website": "https://www.kennethenevoldsen.com"
+      },
+      "category": ["pipeline", "models", "training"],
+      "tags": ["pipeline", "models", "transformers"]
+    },
     {
       "id": "textdescriptives",
       "title": "TextDescriptives",

@@ -8,10 +8,11 @@ import Title from '../components/title'
 import Grid from '../components/grid'
 import Button from '../components/button'
 import Icon from '../components/icon'
+import Tag from '../components/tag'
 import CodeBlock, { InlineCode } from '../components/code'
 import Aside from '../components/aside'
 import Sidebar from '../components/sidebar'
-import Section from '../components/section'
+import Section, { Hr } from '../components/section'
 import Main from '../components/main'
 import Footer from '../components/footer'
 import { H3, H5, Label, InlineList } from '../components/typography'

@@ -121,6 +122,18 @@ const UniverseContent = ({ content = [], categories, theme, pageContext, mdxComp
                 </Grid>
             </Section>
         )}
+        <section className="search-exclude">
+            <H3>Found a mistake or something isn't working?</H3>
+            <p>
+                If you've come across a universe project that isn't working or is
+                incompatible with the reported spaCy version, let us know by{' '}
+                <Link to="https://github.com/explosion/spaCy/discussions/new">
+                    opening a discussion thread
+                </Link>
+                .
+            </p>
+        </section>
+        <Hr />
         <section className="search-exclude">
             <H3>Submit your project</H3>
             <p>

@@ -168,11 +181,22 @@ UniverseContent.propTypes = {
     mdxComponents: PropTypes.object,
 }
 
+const SpaCyVersion = ({ version }) => {
+    const versions = !Array.isArray(version) ? [version] : version
+    return versions.map((v, i) => (
+        <>
+            <Tag tooltip={`This project is compatible with spaCy v${v}`}>spaCy v{v}</Tag>{' '}
+        </>
+    ))
+}
+
 const Project = ({ data, components }) => (
     <>
         <Title title={data.title || data.id} teaser={data.slogan} image={data.thumb}>
-            {data.github && (
+            {(data.github || data.spacy_version) && (
                 <p>
+                    {data.spacy_version && <SpaCyVersion version={data.spacy_version} />}
+                    {data.github && (
                     <Link to={`https://github.com/${data.github}`} hidden>
                         {[
                             `release/${data.github}/all.svg?style=flat-square`,

@@ -180,13 +204,18 @@ const Project = ({ data, components }) => (
                             `stars/${data.github}.svg?style=social&label=Stars`,
                         ].map((url, i) => (
                             <img
-                                style={{ borderRadius: '1em', marginRight: '0.5rem' }}
+                                style={{
+                                    borderRadius: '1em',
+                                    marginRight: '0.5rem',
+                                    verticalAlign: 'middle',
+                                }}
                                 key={i}
                                 src={`https://img.shields.io/github/${url}`}
                                 alt=""
                             />
                         ))}
                     </Link>
+                    )}
                 </p>
             )}
         </Title>

@@ -335,6 +364,7 @@ const query = graphql`
             url
             github
             description
+            spacy_version
             pip
             cran
             category