Merge pull request #10215 from explosion/master

update develop
Sofie Van Landeghem, 2022-02-06 13:45:41 +01:00, committed by GitHub
commit 14513f82da
25 changed files with 325 additions and 61 deletions

View File

@@ -699,9 +699,7 @@ def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
     return count

-def _get_labels_from_model(
-    nlp: Language, factory_name: str
-) -> Set[str]:
+def _get_labels_from_model(nlp: Language, factory_name: str) -> Set[str]:
     pipe_names = [
         pipe_name
         for pipe_name in nlp.pipe_names
@@ -714,9 +712,7 @@ def _get_labels_from_model(
     return labels

-def _get_labels_from_spancat(
-    nlp: Language
-) -> Dict[str, Set[str]]:
+def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
     pipe_names = [
         pipe_name
         for pipe_name in nlp.pipe_names

View File

@@ -6,6 +6,11 @@ can help generate the best possible configuration, given a user's requirements.
[paths]
train = null
dev = null
+{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
+vectors = null
+{% else -%}
+vectors = "{{ word_vectors }}"
+{% endif -%}

[system]
{% if use_transformer -%}
@@ -421,8 +426,4 @@ compound = 1.001
{% endif %}

[initialize]
-{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
vectors = ${paths.vectors}
-{% else -%}
-vectors = "{{ word_vectors }}"
-{% endif -%}

View File

@@ -90,7 +90,7 @@ _eleven_to_beyond = [
    "अड़सठ",
    "उनहत्तर",
    "सत्तर",
-    "इकहत्तर"
+    "इकहत्तर",
    "बहत्तर",
    "तिहत्तर",
    "चौहत्तर",

View File

@@ -59,7 +59,7 @@ sentences = [
    "Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?",
    "Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”.",
    "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares.",
-    "Carros autônomos empurram a responsabilidade do seguro para os fabricantes.."
+    "Carros autônomos empurram a responsabilidade do seguro para os fabricantes..",
    "São Francisco considera banir os robôs de entrega que andam pelas calçadas.",
    "Londres é a maior cidade do Reino Unido.",
    # Translations from English:

View File

@@ -354,12 +354,15 @@ class Language:
    @property
    def pipe_labels(self) -> Dict[str, List[str]]:
        """Get the labels set by the pipeline components, if available (if
-        the component exposes a labels property).
+        the component exposes a labels property and the labels are not
+        hidden).

        RETURNS (Dict[str, List[str]]): Labels keyed by component name.
        """
        labels = {}
        for name, pipe in self._components:
+            if hasattr(pipe, "hide_labels") and pipe.hide_labels is True:
+                continue
            if hasattr(pipe, "labels"):
                labels[name] = list(pipe.labels)
        return SimpleFrozenDict(labels)
@@ -522,7 +525,7 @@ class Language:
        requires: Iterable[str] = SimpleFrozenList(),
        retokenizes: bool = False,
        func: Optional["Pipe"] = None,
-    ) -> Callable:
+    ) -> Callable[..., Any]:
        """Register a new pipeline component. Can be used for stateless function
        components that don't require a separate factory. Can be used as a
        decorator on a function or classmethod, or called as a function with the
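
To illustrate the `hide_labels` behavior introduced above, here is a minimal sketch (assuming a spaCy build that includes this change; the tiny pipeline below is made up for illustration):

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("senter")

# The component still exposes its internal labels ...
print(nlp.get_pipe("senter").labels)  # ('I', 'S')
# ... but components with hide_labels=True are skipped by Language.pipe_labels:
print("senter" in nlp.pipe_labels)  # False
```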

View File

@@ -0,0 +1,66 @@
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from .matcher import Matcher
from ..vocab import Vocab
from ..tokens.doc import Doc
from ..tokens.span import Span
class DependencyMatcher:
"""Match dependency parse tree based on pattern rules."""
_patterns: Dict[str, List[Any]]
_raw_patterns: Dict[str, List[Any]]
_tokens_to_key: Dict[str, List[Any]]
_root: Dict[str, List[Any]]
_tree: Dict[str, List[Any]]
_callbacks: Dict[
Any, Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any]
]
_ops: Dict[str, Any]
vocab: Vocab
_matcher: Matcher
def __init__(self, vocab: Vocab, *, validate: bool = ...) -> None: ...
def __reduce__(
self,
) -> Tuple[
Callable[
[Vocab, Dict[str, Any], Dict[str, Callable[..., Any]]], DependencyMatcher
],
Tuple[
Vocab,
Dict[str, List[Any]],
Dict[
str,
Callable[
[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any
],
],
],
None,
None,
]: ...
def __len__(self) -> int: ...
def __contains__(self, key: Union[str, int]) -> bool: ...
def add(
self,
key: Union[str, int],
patterns: List[List[Dict[str, Any]]],
*,
on_match: Optional[
Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any]
] = ...
) -> None: ...
def has_key(self, key: Union[str, int]) -> bool: ...
def get(
self, key: Union[str, int], default: Optional[Any] = ...
) -> Tuple[
Optional[
Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any]
],
List[List[Dict[str, Any]]],
]: ...
def remove(self, key: Union[str, int]) -> None: ...
def __call__(self, doclike: Union[Doc, Span]) -> List[Tuple[int, List[int]]]: ...
def unpickle_matcher(
vocab: Vocab, patterns: Dict[str, Any], callbacks: Dict[str, Callable[..., Any]]
) -> DependencyMatcher: ...
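
For context, this stub describes the public `DependencyMatcher` API. A short usage sketch matching the signatures above (it assumes a trained pipeline such as `en_core_web_sm` is installed; the pattern is illustrative):

```python
import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.load("en_core_web_sm")
matcher = DependencyMatcher(nlp.vocab)

# Match a verb together with its nominal subject.
pattern = [
    {"RIGHT_ID": "verb", "RIGHT_ATTRS": {"POS": "VERB"}},
    {
        "LEFT_ID": "verb",
        "REL_OP": ">",
        "RIGHT_ID": "subject",
        "RIGHT_ATTRS": {"DEP": "nsubj"},
    },
]
matcher.add("VERB_SUBJECT", [pattern])

doc = nlp("The quick brown fox jumps over the lazy dog.")
for match_id, token_ids in matcher(doc):
    print(nlp.vocab.strings[match_id], [doc[i].text for i in token_ids])
```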

View File

@@ -1,4 +1,6 @@
-from typing import Any, List, Dict, Tuple, Optional, Callable, Union, Iterator, Iterable
+from typing import Any, List, Dict, Tuple, Optional, Callable, Union
+from typing import Iterator, Iterable, overload
+from ..compat import Literal
from ..vocab import Vocab
from ..tokens import Doc, Span
@@ -31,12 +33,22 @@ class Matcher:
    ) -> Union[
        Iterator[Tuple[Tuple[Doc, Any], Any]], Iterator[Tuple[Doc, Any]], Iterator[Doc]
    ]: ...
+    @overload
    def __call__(
        self,
        doclike: Union[Doc, Span],
        *,
-        as_spans: bool = ...,
+        as_spans: Literal[False] = ...,
        allow_missing: bool = ...,
        with_alignments: bool = ...
-    ) -> Union[List[Tuple[int, int, int]], List[Span]]: ...
+    ) -> List[Tuple[int, int, int]]: ...
+    @overload
+    def __call__(
+        self,
+        doclike: Union[Doc, Span],
+        *,
+        as_spans: Literal[True],
+        allow_missing: bool = ...,
+        with_alignments: bool = ...
+    ) -> List[Span]: ...
    def _normalize_key(self, key: Any) -> Any: ...
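
The two overloads encode that the return type depends on `as_spans`. A minimal sketch of the runtime behavior they describe (the pattern and text are made up):

```python
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("HELLO_WORLD", [[{"LOWER": "hello"}, {"LOWER": "world"}]])
doc = nlp("hello world")

matches = matcher(doc)               # List[Tuple[int, int, int]]: (match_id, start, end)
spans = matcher(doc, as_spans=True)  # List[Span]
assert isinstance(spans[0], Span) and spans[0].text == "hello world"
```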

View File

@@ -1,6 +1,6 @@
-from typing import List, Tuple, Union, Optional, Callable, Any, Dict
+from typing import List, Tuple, Union, Optional, Callable, Any, Dict, overload
+from ..compat import Literal
-from . import Matcher
+from .matcher import Matcher
from ..vocab import Vocab
from ..tokens import Doc, Span
@@ -21,9 +21,17 @@ class PhraseMatcher:
        ] = ...,
    ) -> None: ...
    def remove(self, key: str) -> None: ...
+    @overload
    def __call__(
        self,
        doclike: Union[Doc, Span],
        *,
-        as_spans: bool = ...,
+        as_spans: Literal[False] = ...,
-    ) -> Union[List[Tuple[int, int, int]], List[Span]]: ...
+    ) -> List[Tuple[int, int, int]]: ...
+    @overload
+    def __call__(
+        self,
+        doclike: Union[Doc, Span],
+        *,
+        as_spans: Literal[True],
+    ) -> List[Span]: ...
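
Likewise for `PhraseMatcher`: `as_spans` selects between offset triples and `Span` objects. A small sketch with made-up terms:

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab)
matcher.add("CITY", [nlp("New York"), nlp("London")])
doc = nlp("She moved from London to New York")

offsets = matcher(doc)               # List[Tuple[int, int, int]]
spans = matcher(doc, as_spans=True)  # List[Span], labelled with the match key
print([(span.text, span.label_) for span in spans])  # [('London', 'CITY'), ('New York', 'CITY')]
```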

View File

@@ -26,6 +26,8 @@ class Pipe:
    @property
    def labels(self) -> Tuple[str, ...]: ...
    @property
+    def hide_labels(self) -> bool: ...
+    @property
    def label_data(self) -> Any: ...
    def _require_labels(self) -> None: ...
    def set_error_handler(

View File

@@ -102,6 +102,10 @@ cdef class Pipe:
    def labels(self) -> Tuple[str, ...]:
        return tuple()

+    @property
+    def hide_labels(self) -> bool:
+        return False
+
    @property
    def label_data(self):
        """Optional JSON-serializable data that would be sufficient to recreate

View File

@@ -99,6 +99,10 @@ class SentenceRecognizer(Tagger):
        # are 0
        return tuple(["I", "S"])

+    @property
+    def hide_labels(self):
+        return True
+
    @property
    def label_data(self):
        return None

View File

@@ -413,7 +413,7 @@ class SpanCategorizer(TrainablePipe):
        self._require_labels()
        if subbatch:
            docs = [eg.x for eg in subbatch]
-            spans = self.suggester(docs)
+            spans = build_ngram_suggester(sizes=[1])(docs)
            Y = self.model.ops.alloc2f(spans.dataXd.shape[0], len(self.labels))
            self.model.initialize(X=(docs, spans), Y=Y)
        else:

View File

@@ -97,3 +97,7 @@ def test_overfitting_IO():
    ]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)
+
+    # test internal pipe labels vs. Language.pipe_labels with hidden labels
+    assert nlp.get_pipe("senter").labels == ("I", "S")
+    assert "senter" not in nlp.pipe_labels

View File

@@ -79,7 +79,8 @@ def test_explicit_labels():
    nlp.initialize()
    assert spancat.labels == ("PERSON", "LOC")

-#TODO figure out why this is flaky
+# TODO figure out why this is flaky
@pytest.mark.skip(reason="Test is unreliable for unknown reason")
def test_doc_gc():
    # If the Doc object is garbage collected, the spans won't be functional afterwards

View File

@@ -9,6 +9,7 @@ from spacy.tokenizer import Tokenizer
from spacy.tokens import Doc
from spacy.training import Example
from spacy.util import compile_prefix_regex, compile_suffix_regex, ensure_path
+from spacy.util import compile_infix_regex
from spacy.vocab import Vocab
from spacy.symbols import ORTH
@@ -503,3 +504,20 @@ def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab):
    assert tokens == ["a", "10", "."]
    explain_tokens = [t[1] for t in tokenizer.explain("a10.")]
    assert tokens == explain_tokens
+
+
+def test_tokenizer_infix_prefix(en_vocab):
+    # an infix match at the start of the remaining substring should not split off an empty token
+    infixes = ["±"]
+    suffixes = ["%"]
+    infix_re = compile_infix_regex(infixes)
+    suffix_re = compile_suffix_regex(suffixes)
+    tokenizer = Tokenizer(
+        en_vocab,
+        infix_finditer=infix_re.finditer,
+        suffix_search=suffix_re.search,
+    )
+    tokens = [t.text for t in tokenizer("±10%")]
+    assert tokens == ["±10", "%"]
+    explain_tokens = [t[1] for t in tokenizer.explain("±10%")]
+    assert tokens == explain_tokens

View File

@@ -683,6 +683,8 @@ cdef class Tokenizer:
                infixes = infix_finditer(substring)
                offset = 0
                for match in infixes:
+                    if offset == 0 and match.start() == 0:
+                        continue
                    if substring[offset : match.start()]:
                        tokens.append(("TOKEN", substring[offset : match.start()]))
                    if substring[match.start() : match.end()]:

View File

@@ -10,7 +10,7 @@ from ..lexeme import Lexeme
from ..vocab import Vocab
from .underscore import Underscore
from pathlib import Path
-import numpy
+import numpy as np

class DocMethod(Protocol):
    def __call__(self: Doc, *args: Any, **kwargs: Any) -> Any: ...  # type: ignore[misc]
@@ -26,7 +26,7 @@ class Doc:
    user_hooks: Dict[str, Callable[..., Any]]
    user_token_hooks: Dict[str, Callable[..., Any]]
    user_span_hooks: Dict[str, Callable[..., Any]]
-    tensor: numpy.ndarray
+    tensor: np.ndarray[Any, np.dtype[np.float_]]
    user_data: Dict[str, Any]
    has_unknown_spaces: bool
    _context: Any
@@ -144,7 +144,7 @@ class Doc:
    ) -> Doc: ...
    def to_array(
        self, py_attr_ids: Union[int, str, List[Union[int, str]]]
-    ) -> numpy.ndarray: ...
+    ) -> np.ndarray[Any, np.dtype[np.float_]]: ...
    @staticmethod
    def from_docs(
        docs: List[Doc],

View File

@@ -459,8 +459,8 @@ cdef class Span:
    @property
    def ents(self):
-        """The named entities in the span. Returns a tuple of named entity
-        `Span` objects, if the entity recognizer has been applied.
+        """The named entities that fall completely within the span. Returns
+        a tuple of `Span` objects.

        RETURNS (tuple): Entities in the span, one `Span` per entity.
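
A brief sketch of the clarified behavior, with entities set manually for illustration: only entities lying completely inside the span are returned.

```python
import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Barack Obama visited New York")
doc.ents = [Span(doc, 0, 2, label="PERSON"), Span(doc, 3, 5, label="GPE")]

span = doc[1:5]  # "Obama visited New York"
# The PERSON entity only partially overlaps the span, so it is excluded:
print([ent.text for ent in span.ents])  # ['New York']
```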

View File

@@ -1,17 +1,31 @@
-from typing import Dict, Any
+from typing import Dict, Any, List, Optional, Tuple, Union, TYPE_CHECKING
import functools
import copy

from ..errors import Errors

+if TYPE_CHECKING:
+    from .doc import Doc
+    from .span import Span
+    from .token import Token

class Underscore:
    mutable_types = (dict, list, set)
    doc_extensions: Dict[Any, Any] = {}
    span_extensions: Dict[Any, Any] = {}
    token_extensions: Dict[Any, Any] = {}
+    _extensions: Dict[str, Any]
+    _obj: Union["Doc", "Span", "Token"]
+    _start: Optional[int]
+    _end: Optional[int]

-    def __init__(self, extensions, obj, start=None, end=None):
+    def __init__(
+        self,
+        extensions: Dict[str, Any],
+        obj: Union["Doc", "Span", "Token"],
+        start: Optional[int] = None,
+        end: Optional[int] = None,
+    ):
        object.__setattr__(self, "_extensions", extensions)
        object.__setattr__(self, "_obj", obj)
        # Assumption is that for doc values, _start and _end will both be None
@@ -23,12 +37,12 @@ class Underscore:
        object.__setattr__(self, "_start", start)
        object.__setattr__(self, "_end", end)

-    def __dir__(self):
+    def __dir__(self) -> List[str]:
        # Hack to enable autocomplete on custom extensions
        extensions = list(self._extensions.keys())
        return ["set", "get", "has"] + extensions

-    def __getattr__(self, name):
+    def __getattr__(self, name: str) -> Any:
        if name not in self._extensions:
            raise AttributeError(Errors.E046.format(name=name))
        default, method, getter, setter = self._extensions[name]
@@ -56,7 +70,7 @@ class Underscore:
            return new_default
        return default

-    def __setattr__(self, name, value):
+    def __setattr__(self, name: str, value: Any):
        if name not in self._extensions:
            raise AttributeError(Errors.E047.format(name=name))
        default, method, getter, setter = self._extensions[name]
@@ -65,28 +79,30 @@ class Underscore:
        else:
            self._doc.user_data[self._get_key(name)] = value

-    def set(self, name, value):
+    def set(self, name: str, value: Any):
        return self.__setattr__(name, value)

-    def get(self, name):
+    def get(self, name: str) -> Any:
        return self.__getattr__(name)

-    def has(self, name):
+    def has(self, name: str) -> bool:
        return name in self._extensions

-    def _get_key(self, name):
+    def _get_key(self, name: str) -> Tuple[str, str, Optional[int], Optional[int]]:
        return ("._.", name, self._start, self._end)

    @classmethod
-    def get_state(cls):
+    def get_state(cls) -> Tuple[Dict[Any, Any], Dict[Any, Any], Dict[Any, Any]]:
        return cls.token_extensions, cls.span_extensions, cls.doc_extensions

    @classmethod
-    def load_state(cls, state):
+    def load_state(
+        cls, state: Tuple[Dict[Any, Any], Dict[Any, Any], Dict[Any, Any]]
+    ) -> None:
        cls.token_extensions, cls.span_extensions, cls.doc_extensions = state

-def get_ext_args(**kwargs):
+def get_ext_args(**kwargs: Any):
    """Validate and convert arguments. Reused in Doc, Token and Span."""
    default = kwargs.get("default")
    getter = kwargs.get("getter")
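
The annotations above cover the custom-extension machinery behind `Doc._`, `Span._` and `Token._`. A brief sketch of how it is exercised (the `is_greeting` extension is a made-up example):

```python
import spacy
from spacy.tokens import Doc

# Register a hypothetical Doc extension with a default value.
Doc.set_extension("is_greeting", default=False)

nlp = spacy.blank("en")
doc = nlp("hello world")

doc._.is_greeting = True            # routed through Underscore.__setattr__
print(doc._.get("is_greeting"))     # True, via Underscore.get
print(doc._.has("is_greeting"))     # True
print("is_greeting" in dir(doc._))  # includes custom extensions via __dir__
```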

website/Dockerfile (new file, 16 lines)
View File

@@ -0,0 +1,16 @@
FROM node:11.15.0
WORKDIR /spacy-io
RUN npm install -g gatsby-cli@2.7.4
COPY package.json .
COPY package-lock.json .
RUN npm install
# This is so the installed node_modules will be up one directory
# from where a user mounts files, so that they don't accidentally mount
# their own node_modules from a different build
# https://nodejs.org/api/modules.html#modules_loading_from_node_modules_folders
WORKDIR /spacy-io/website/

View File

@@ -554,6 +554,42 @@ extensions for your code editor. The
[`.prettierrc`](https://github.com/explosion/spaCy/tree/master/website/.prettierrc)
file in the root defines the settings used in this codebase.

+## Building & developing the site with Docker {#docker}
+
+Sometimes it's hard to get a local environment working due to rapid updates to
+node dependencies, so it may be easier to use Docker for building the docs.
+
+If you'd like to do this, **be sure you do *not* include your local
+`node_modules` folder**, since some dependencies need to be built specifically
+for the image's system. Rename it before building or running the image.
+
+```bash
+docker run -it \
+  -v $(pwd):/spacy-io/website \
+  -p 8000:8000 \
+  ghcr.io/explosion/spacy-io \
+  gatsby develop -H 0.0.0.0
+```
+
+This will allow you to access the built website at http://0.0.0.0:8000/ in your
+browser and to keep editing code in your editor while the site reflects those
+changes.
+
+**Note**: If you're working on a Mac with an M1 processor, you might see
+segfault errors from `qemu` if you use the default image. To fix this, use the
+`arm64`-tagged image in the `docker run` command (`ghcr.io/explosion/spacy-io:arm64`).
+
+### Building the Docker image {#docker-build}
+
+If you'd like to build the image locally, you can do so like this:
+
+```bash
+docker build -t spacy-io .
+```
+
+This will take some time, so you'll save a bit of time by using the prebuilt
+image instead.
+
## Markdown reference {#markdown}

All page content and page meta lives in the `.md` files in the `/docs`

View File

@@ -257,8 +257,8 @@ shape `(N, M)`, where `N` is the length of the document. The values will be

## Span.ents {#ents tag="property" new="2.0.13" model="ner"}

-The named entities in the span. Returns a tuple of named entity `Span` objects,
-if the entity recognizer has been applied.
+The named entities that fall completely within the span. Returns a tuple of
+`Span` objects.

> #### Example
>

View File

@@ -831,6 +831,8 @@ def tokenizer_pseudo_code(
            infixes = infix_finditer(substring)
            offset = 0
            for match in infixes:
+                if offset == 0 and match.start() == 0:
+                    continue
                tokens.append(substring[offset : match.start()])
                tokens.append(substring[match.start() : match.end()])
                offset = match.end()

View File

@@ -141,7 +141,8 @@
                "website": "https://www.nr.no/~plison"
            },
            "category": ["pipeline", "standalone", "research", "training"],
-            "tags": []
+            "tags": [],
+            "spacy_version": 3
        },
        {
            "id": "numerizer",
@@ -977,6 +978,48 @@
            "category": ["pipeline"],
            "tags": ["pipeline", "danish"]
        },
+        {
+            "id": "spacy-wrap",
+            "title": "spaCy-wrap",
+            "slogan": "For wrapping fine-tuned transformers in spaCy pipelines",
+            "description": "spaCy-wrap is a wrapper library for spaCy that lets you include fine-tuned transformers from Hugging Face in your spaCy pipeline, so existing models can be used within existing workflows.",
+            "github": "kennethenevoldsen/spacy-wrap",
+            "pip": "spacy_wrap",
+            "code_example": [
+                "import spacy",
+                "import spacy_wrap",
+                "",
+                "nlp = spacy.blank('en')",
+                "config = {",
+                "    'doc_extension_trf_data': 'clf_trf_data',  # document extension for the forward pass",
+                "    'doc_extension_prediction': 'sentiment',  # document extension for the prediction",
+                "    'labels': ['negative', 'neutral', 'positive'],",
+                "    'model': {",
+                "        'name': 'cardiffnlp/twitter-roberta-base-sentiment',  # the name or path of a Hugging Face model",
+                "    },",
+                "}",
+                "",
+                "transformer = nlp.add_pipe('classification_transformer', config=config)",
+                "transformer.model.initialize()",
+                "",
+                "doc = nlp('spaCy is a wonderful tool')",
+                "",
+                "print(doc._.clf_trf_data)",
+                "# TransformerData(wordpieces=...",
+                "print(doc._.sentiment)",
+                "# 'positive'",
+                "print(doc._.sentiment_prob)",
+                "# {'prob': array([0.004, 0.028, 0.969], dtype=float32), 'labels': ['negative', 'neutral', 'positive']}"
+            ],
+            "thumb": "https://raw.githubusercontent.com/KennethEnevoldsen/spacy-wrap/main/docs/_static/icon.png",
+            "author": "Kenneth Enevoldsen",
+            "author_links": {
+                "github": "KennethEnevoldsen",
+                "website": "https://www.kennethenevoldsen.com"
+            },
+            "category": ["pipeline", "models", "training"],
+            "tags": ["pipeline", "models", "transformers"]
+        },
        {
            "id": "textdescriptives",
            "title": "TextDescriptives",

View File

@@ -8,10 +8,11 @@ import Title from '../components/title'
import Grid from '../components/grid'
import Button from '../components/button'
import Icon from '../components/icon'
+import Tag from '../components/tag'
import CodeBlock, { InlineCode } from '../components/code'
import Aside from '../components/aside'
import Sidebar from '../components/sidebar'
-import Section from '../components/section'
+import Section, { Hr } from '../components/section'
import Main from '../components/main'
import Footer from '../components/footer'
import { H3, H5, Label, InlineList } from '../components/typography'
@@ -121,6 +122,18 @@ const UniverseContent = ({ content = [], categories, theme, pageContext, mdxComp
                </Grid>
            </Section>
        )}
+        <section className="search-exclude">
+            <H3>Found a mistake or something isn't working?</H3>
+            <p>
+                If you've come across a universe project that isn't working or is
+                incompatible with the reported spaCy version, let us know by{' '}
+                <Link to="https://github.com/explosion/spaCy/discussions/new">
+                    opening a discussion thread
+                </Link>
+                .
+            </p>
+        </section>
+        <Hr />
        <section className="search-exclude">
            <H3>Submit your project</H3>
            <p>
@@ -168,25 +181,41 @@ UniverseContent.propTypes = {
    mdxComponents: PropTypes.object,
}

+const SpaCyVersion = ({ version }) => {
+    const versions = !Array.isArray(version) ? [version] : version
+    return versions.map((v, i) => (
+        <>
+            <Tag tooltip={`This project is compatible with spaCy v${v}`}>spaCy v{v}</Tag>{' '}
+        </>
+    ))
+}
+
const Project = ({ data, components }) => (
    <>
        <Title title={data.title || data.id} teaser={data.slogan} image={data.thumb}>
-            {data.github && (
+            {(data.github || data.spacy_version) && (
                <p>
-                    <Link to={`https://github.com/${data.github}`} hidden>
-                        {[
-                            `release/${data.github}/all.svg?style=flat-square`,
-                            `license/${data.github}.svg?style=flat-square`,
-                            `stars/${data.github}.svg?style=social&label=Stars`,
-                        ].map((url, i) => (
-                            <img
-                                style={{ borderRadius: '1em', marginRight: '0.5rem' }}
-                                key={i}
-                                src={`https://img.shields.io/github/${url}`}
-                                alt=""
-                            />
-                        ))}
-                    </Link>
+                    {data.spacy_version && <SpaCyVersion version={data.spacy_version} />}
+                    {data.github && (
+                        <Link to={`https://github.com/${data.github}`} hidden>
+                            {[
+                                `release/${data.github}/all.svg?style=flat-square`,
+                                `license/${data.github}.svg?style=flat-square`,
+                                `stars/${data.github}.svg?style=social&label=Stars`,
+                            ].map((url, i) => (
+                                <img
+                                    style={{
+                                        borderRadius: '1em',
+                                        marginRight: '0.5rem',
+                                        verticalAlign: 'middle',
+                                    }}
+                                    key={i}
+                                    src={`https://img.shields.io/github/${url}`}
+                                    alt=""
+                                />
+                            ))}
+                        </Link>
+                    )}
                </p>
            )}
        </Title>
@@ -335,6 +364,7 @@ const query = graphql`
                url
                github
                description
+                spacy_version
                pip
                cran
                category