diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index b9831fe0c..ab7c20d48 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -699,9 +699,7 @@ def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
return count
-def _get_labels_from_model(
- nlp: Language, factory_name: str
-) -> Set[str]:
+def _get_labels_from_model(nlp: Language, factory_name: str) -> Set[str]:
pipe_names = [
pipe_name
for pipe_name in nlp.pipe_names
@@ -714,9 +712,7 @@ def _get_labels_from_model(
return labels
-def _get_labels_from_spancat(
- nlp: Language
-) -> Dict[str, Set[str]]:
+def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
pipe_names = [
pipe_name
for pipe_name in nlp.pipe_names
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index b78806fec..fb79a4f60 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -6,6 +6,11 @@ can help generate the best possible configuration, given a user's requirements.
[paths]
train = null
dev = null
+{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
+vectors = null
+{% else -%}
+vectors = "{{ word_vectors }}"
+{% endif -%}
[system]
{% if use_transformer -%}
@@ -421,8 +426,4 @@ compound = 1.001
{% endif %}
[initialize]
-{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
vectors = ${paths.vectors}
-{% else -%}
-vectors = "{{ word_vectors }}"
-{% endif -%}
diff --git a/spacy/lang/hi/lex_attrs.py b/spacy/lang/hi/lex_attrs.py
index a18c2e513..ee845e8b1 100644
--- a/spacy/lang/hi/lex_attrs.py
+++ b/spacy/lang/hi/lex_attrs.py
@@ -90,7 +90,7 @@ _eleven_to_beyond = [
"अड़सठ",
"उनहत्तर",
"सत्तर",
- "इकहत्तर"
+ "इकहत्तर",
"बहत्तर",
"तिहत्तर",
"चौहत्तर",
diff --git a/spacy/lang/xx/examples.py b/spacy/lang/xx/examples.py
index 8d63c3c20..34570d747 100644
--- a/spacy/lang/xx/examples.py
+++ b/spacy/lang/xx/examples.py
@@ -59,7 +59,7 @@ sentences = [
"Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?",
"Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”.",
"Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares.",
- "Carros autônomos empurram a responsabilidade do seguro para os fabricantes.."
+ "Carros autônomos empurram a responsabilidade do seguro para os fabricantes..",
"São Francisco considera banir os robôs de entrega que andam pelas calçadas.",
"Londres é a maior cidade do Reino Unido.",
# Translations from English:
diff --git a/spacy/language.py b/spacy/language.py
index 798254b80..fdce34ac4 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -354,12 +354,15 @@ class Language:
@property
def pipe_labels(self) -> Dict[str, List[str]]:
"""Get the labels set by the pipeline components, if available (if
- the component exposes a labels property).
+ the component exposes a labels property and the labels are not
+ hidden).
RETURNS (Dict[str, List[str]]): Labels keyed by component name.
"""
labels = {}
for name, pipe in self._components:
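+ # skip components that ask for their labels to be hidden (e.g. the
+ # senter, whose internal "I"/"S" labels are not meaningful to users)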
+ if hasattr(pipe, "hide_labels") and pipe.hide_labels is True:
+ continue
if hasattr(pipe, "labels"):
labels[name] = list(pipe.labels)
return SimpleFrozenDict(labels)
@@ -522,7 +525,7 @@ class Language:
requires: Iterable[str] = SimpleFrozenList(),
retokenizes: bool = False,
func: Optional["Pipe"] = None,
- ) -> Callable:
+ ) -> Callable[..., Any]:
"""Register a new pipeline component. Can be used for stateless function
components that don't require a separate factory. Can be used as a
decorator on a function or classmethod, or called as a function with the
diff --git a/spacy/matcher/dependencymatcher.pyi b/spacy/matcher/dependencymatcher.pyi
new file mode 100644
index 000000000..c19d3a71c
--- /dev/null
+++ b/spacy/matcher/dependencymatcher.pyi
@@ -0,0 +1,66 @@
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from .matcher import Matcher
+from ..vocab import Vocab
+from ..tokens.doc import Doc
+from ..tokens.span import Span
+
+class DependencyMatcher:
+ """Match dependency parse tree based on pattern rules."""
+
+ _patterns: Dict[str, List[Any]]
+ _raw_patterns: Dict[str, List[Any]]
+ _tokens_to_key: Dict[str, List[Any]]
+ _root: Dict[str, List[Any]]
+ _tree: Dict[str, List[Any]]
+ _callbacks: Dict[
+ Any, Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any]
+ ]
+ _ops: Dict[str, Any]
+ vocab: Vocab
+ _matcher: Matcher
+ def __init__(self, vocab: Vocab, *, validate: bool = ...) -> None: ...
+ def __reduce__(
+ self,
+ ) -> Tuple[
+ Callable[
+ [Vocab, Dict[str, Any], Dict[str, Callable[..., Any]]], DependencyMatcher
+ ],
+ Tuple[
+ Vocab,
+ Dict[str, List[Any]],
+ Dict[
+ str,
+ Callable[
+ [DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any
+ ],
+ ],
+ ],
+ None,
+ None,
+ ]: ...
+ def __len__(self) -> int: ...
+ def __contains__(self, key: Union[str, int]) -> bool: ...
+ def add(
+ self,
+ key: Union[str, int],
+ patterns: List[List[Dict[str, Any]]],
+ *,
+ on_match: Optional[
+ Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any]
+ ] = ...
+ ) -> None: ...
+ def has_key(self, key: Union[str, int]) -> bool: ...
+ def get(
+ self, key: Union[str, int], default: Optional[Any] = ...
+ ) -> Tuple[
+ Optional[
+ Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any]
+ ],
+ List[List[Dict[str, Any]]],
+ ]: ...
+ def remove(self, key: Union[str, int]) -> None: ...
+ def __call__(self, doclike: Union[Doc, Span]) -> List[Tuple[int, List[int]]]: ...
+
+def unpickle_matcher(
+ vocab: Vocab, patterns: Dict[str, Any], callbacks: Dict[str, Callable[..., Any]]
+) -> DependencyMatcher: ...
diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi
index ec4a88eaf..390629ff8 100644
--- a/spacy/matcher/matcher.pyi
+++ b/spacy/matcher/matcher.pyi
@@ -1,4 +1,6 @@
-from typing import Any, List, Dict, Tuple, Optional, Callable, Union, Iterator, Iterable
+from typing import Any, List, Dict, Tuple, Optional, Callable, Union
+from typing import Iterator, Iterable, overload
+from ..compat import Literal
from ..vocab import Vocab
from ..tokens import Doc, Span
@@ -31,12 +33,22 @@ class Matcher:
) -> Union[
Iterator[Tuple[Tuple[Doc, Any], Any]], Iterator[Tuple[Doc, Any]], Iterator[Doc]
]: ...
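+ # the overloads below encode the as_spans flag in the return type:
+ # as_spans=False (the default) yields (match_id, start, end) tuples,
+ # while as_spans=True yields Span objects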
+ @overload
def __call__(
self,
doclike: Union[Doc, Span],
*,
- as_spans: bool = ...,
+ as_spans: Literal[False] = ...,
allow_missing: bool = ...,
with_alignments: bool = ...
- ) -> Union[List[Tuple[int, int, int]], List[Span]]: ...
+ ) -> List[Tuple[int, int, int]]: ...
+ @overload
+ def __call__(
+ self,
+ doclike: Union[Doc, Span],
+ *,
+ as_spans: Literal[True],
+ allow_missing: bool = ...,
+ with_alignments: bool = ...
+ ) -> List[Span]: ...
def _normalize_key(self, key: Any) -> Any: ...
diff --git a/spacy/matcher/phrasematcher.pyi b/spacy/matcher/phrasematcher.pyi
index 741bf7bb6..82a194835 100644
--- a/spacy/matcher/phrasematcher.pyi
+++ b/spacy/matcher/phrasematcher.pyi
@@ -1,6 +1,6 @@
-from typing import List, Tuple, Union, Optional, Callable, Any, Dict
-
-from . import Matcher
+from typing import List, Tuple, Union, Optional, Callable, Any, Dict, overload
+from ..compat import Literal
+from .matcher import Matcher
from ..vocab import Vocab
from ..tokens import Doc, Span
@@ -21,9 +21,17 @@ class PhraseMatcher:
] = ...,
) -> None: ...
def remove(self, key: str) -> None: ...
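+ # as in Matcher.__call__, as_spans=True switches the return type from
+ # (match_id, start, end) tuples to Span objects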
+ @overload
def __call__(
self,
doclike: Union[Doc, Span],
*,
- as_spans: bool = ...,
- ) -> Union[List[Tuple[int, int, int]], List[Span]]: ...
+ as_spans: Literal[False] = ...,
+ ) -> List[Tuple[int, int, int]]: ...
+ @overload
+ def __call__(
+ self,
+ doclike: Union[Doc, Span],
+ *,
+ as_spans: Literal[True],
+ ) -> List[Span]: ...
diff --git a/spacy/pipeline/pipe.pyi b/spacy/pipeline/pipe.pyi
index c7c0568f9..9dd6a9d50 100644
--- a/spacy/pipeline/pipe.pyi
+++ b/spacy/pipeline/pipe.pyi
@@ -26,6 +26,8 @@ class Pipe:
@property
def labels(self) -> Tuple[str, ...]: ...
@property
+ def hide_labels(self) -> bool: ...
+ @property
def label_data(self) -> Any: ...
def _require_labels(self) -> None: ...
def set_error_handler(
diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index 9eddc1e3f..d24e4d574 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -102,6 +102,10 @@ cdef class Pipe:
def labels(self) -> Tuple[str, ...]:
return tuple()
+ @property
+ def hide_labels(self) -> bool:
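+ """Whether the component's labels should be hidden from
+ Language.pipe_labels (False by default)."""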
+ return False
+
@property
def label_data(self):
"""Optional JSON-serializable data that would be sufficient to recreate
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index 2e0f364f0..6d00e829d 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -99,6 +99,10 @@ class SentenceRecognizer(Tagger):
# are 0
return tuple(["I", "S"])
+ @property
+ def hide_labels(self):
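+ # the internal "I"/"S" labels are an implementation detail, so hide
+ # them from Language.pipe_labels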
+ return True
+
@property
def label_data(self):
return None
diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py
index 01c9c407f..f5522f2d3 100644
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -413,7 +413,7 @@ class SpanCategorizer(TrainablePipe):
self._require_labels()
if subbatch:
docs = [eg.x for eg in subbatch]
- spans = self.suggester(docs)
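+ # sample with a simple length-1 ngram suggester here, since the
+ # configured suggester may not be usable before the pipeline is
+ # initialized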
+ spans = build_ngram_suggester(sizes=[1])(docs)
Y = self.model.ops.alloc2f(spans.dataXd.shape[0], len(self.labels))
self.model.initialize(X=(docs, spans), Y=Y)
else:
diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py
index 7a256f79b..047f59bef 100644
--- a/spacy/tests/pipeline/test_senter.py
+++ b/spacy/tests/pipeline/test_senter.py
@@ -97,3 +97,7 @@ def test_overfitting_IO():
]
assert_equal(batch_deps_1, batch_deps_2)
assert_equal(batch_deps_1, no_batch_deps)
+
+ # test internal pipe labels vs. Language.pipe_labels with hidden labels
+ assert nlp.get_pipe("senter").labels == ("I", "S")
+ assert "senter" not in nlp.pipe_labels
diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py
index 39d2e97da..8060bc621 100644
--- a/spacy/tests/pipeline/test_spancat.py
+++ b/spacy/tests/pipeline/test_spancat.py
@@ -79,7 +79,8 @@ def test_explicit_labels():
nlp.initialize()
assert spancat.labels == ("PERSON", "LOC")
-#TODO figure out why this is flaky
+
+# TODO figure out why this is flaky
@pytest.mark.skip(reason="Test is unreliable for unknown reason")
def test_doc_gc():
# If the Doc object is garbage collected, the spans won't be functional afterwards
diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py
index c2aeffcb5..a7270cb1e 100644
--- a/spacy/tests/tokenizer/test_tokenizer.py
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@@ -9,6 +9,7 @@ from spacy.tokenizer import Tokenizer
from spacy.tokens import Doc
from spacy.training import Example
from spacy.util import compile_prefix_regex, compile_suffix_regex, ensure_path
+from spacy.util import compile_infix_regex
from spacy.vocab import Vocab
from spacy.symbols import ORTH
@@ -503,3 +504,20 @@ def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab):
assert tokens == ["a", "10", "."]
explain_tokens = [t[1] for t in tokenizer.explain("a10.")]
assert tokens == explain_tokens
+
+
+def test_tokenizer_infix_prefix(en_vocab):
+ # an infix match at the start of the substring is not split off as an
+ # infix, since there is no token text preceding it
+ infixes = ["±"]
+ suffixes = ["%"]
+ infix_re = compile_infix_regex(infixes)
+ suffix_re = compile_suffix_regex(suffixes)
+ tokenizer = Tokenizer(
+ en_vocab,
+ infix_finditer=infix_re.finditer,
+ suffix_search=suffix_re.search,
+ )
+ tokens = [t.text for t in tokenizer("±10%")]
+ assert tokens == ["±10", "%"]
+ explain_tokens = [t[1] for t in tokenizer.explain("±10%")]
+ assert tokens == explain_tokens
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 4a148b356..91f228032 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -683,6 +683,8 @@ cdef class Tokenizer:
infixes = infix_finditer(substring)
offset = 0
for match in infixes:
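+ # skip an infix match at the very start of the substring: there
+ # is no preceding token text to split off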
+ if offset == 0 and match.start() == 0:
+ continue
if substring[offset : match.start()]:
tokens.append(("TOKEN", substring[offset : match.start()]))
if substring[match.start() : match.end()]:
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index f540002c9..7e9340d58 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -10,7 +10,7 @@ from ..lexeme import Lexeme
from ..vocab import Vocab
from .underscore import Underscore
from pathlib import Path
-import numpy
+import numpy as np
class DocMethod(Protocol):
def __call__(self: Doc, *args: Any, **kwargs: Any) -> Any: ... # type: ignore[misc]
@@ -26,7 +26,7 @@ class Doc:
user_hooks: Dict[str, Callable[..., Any]]
user_token_hooks: Dict[str, Callable[..., Any]]
user_span_hooks: Dict[str, Callable[..., Any]]
- tensor: numpy.ndarray
+ tensor: np.ndarray[Any, np.dtype[np.float_]]
user_data: Dict[str, Any]
has_unknown_spaces: bool
_context: Any
@@ -144,7 +144,7 @@ class Doc:
) -> Doc: ...
def to_array(
self, py_attr_ids: Union[int, str, List[Union[int, str]]]
- ) -> numpy.ndarray: ...
+ ) -> np.ndarray[Any, np.dtype[np.float_]]: ...
@staticmethod
def from_docs(
docs: List[Doc],
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index fc9a8c9d4..4b0c724e5 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -459,8 +459,8 @@ cdef class Span:
@property
def ents(self):
- """The named entities in the span. Returns a tuple of named entity
- `Span` objects, if the entity recognizer has been applied.
+ """The named entities that fall completely within the span. Returns
+ a tuple of `Span` objects.
RETURNS (tuple): Entities in the span, one `Span` per entity.
diff --git a/spacy/tokens/underscore.py b/spacy/tokens/underscore.py
index 7fa7bf095..e9a4e1862 100644
--- a/spacy/tokens/underscore.py
+++ b/spacy/tokens/underscore.py
@@ -1,17 +1,31 @@
-from typing import Dict, Any
+from typing import Dict, Any, List, Optional, Tuple, Union, TYPE_CHECKING
import functools
import copy
-
from ..errors import Errors
+if TYPE_CHECKING:
+ from .doc import Doc
+ from .span import Span
+ from .token import Token
+
class Underscore:
mutable_types = (dict, list, set)
doc_extensions: Dict[Any, Any] = {}
span_extensions: Dict[Any, Any] = {}
token_extensions: Dict[Any, Any] = {}
+ _extensions: Dict[str, Any]
+ _obj: Union["Doc", "Span", "Token"]
+ _start: Optional[int]
+ _end: Optional[int]
- def __init__(self, extensions, obj, start=None, end=None):
+ def __init__(
+ self,
+ extensions: Dict[str, Any],
+ obj: Union["Doc", "Span", "Token"],
+ start: Optional[int] = None,
+ end: Optional[int] = None,
+ ):
object.__setattr__(self, "_extensions", extensions)
object.__setattr__(self, "_obj", obj)
# Assumption is that for doc values, _start and _end will both be None
@@ -23,12 +37,12 @@ class Underscore:
object.__setattr__(self, "_start", start)
object.__setattr__(self, "_end", end)
- def __dir__(self):
+ def __dir__(self) -> List[str]:
# Hack to enable autocomplete on custom extensions
extensions = list(self._extensions.keys())
return ["set", "get", "has"] + extensions
- def __getattr__(self, name):
+ def __getattr__(self, name: str) -> Any:
if name not in self._extensions:
raise AttributeError(Errors.E046.format(name=name))
default, method, getter, setter = self._extensions[name]
@@ -56,7 +70,7 @@ class Underscore:
return new_default
return default
- def __setattr__(self, name, value):
+ def __setattr__(self, name: str, value: Any):
if name not in self._extensions:
raise AttributeError(Errors.E047.format(name=name))
default, method, getter, setter = self._extensions[name]
@@ -65,28 +79,30 @@ class Underscore:
else:
self._doc.user_data[self._get_key(name)] = value
- def set(self, name, value):
+ def set(self, name: str, value: Any):
return self.__setattr__(name, value)
- def get(self, name):
+ def get(self, name: str) -> Any:
return self.__getattr__(name)
- def has(self, name):
+ def has(self, name: str) -> bool:
return name in self._extensions
- def _get_key(self, name):
+ def _get_key(self, name: str) -> Tuple[str, str, Optional[int], Optional[int]]:
return ("._.", name, self._start, self._end)
@classmethod
- def get_state(cls):
+ def get_state(cls) -> Tuple[Dict[Any, Any], Dict[Any, Any], Dict[Any, Any]]:
return cls.token_extensions, cls.span_extensions, cls.doc_extensions
@classmethod
- def load_state(cls, state):
+ def load_state(
+ cls, state: Tuple[Dict[Any, Any], Dict[Any, Any], Dict[Any, Any]]
+ ) -> None:
cls.token_extensions, cls.span_extensions, cls.doc_extensions = state
-def get_ext_args(**kwargs):
+def get_ext_args(**kwargs: Any):
"""Validate and convert arguments. Reused in Doc, Token and Span."""
default = kwargs.get("default")
getter = kwargs.get("getter")
diff --git a/website/Dockerfile b/website/Dockerfile
new file mode 100644
index 000000000..f71733e55
--- /dev/null
+++ b/website/Dockerfile
@@ -0,0 +1,16 @@
+FROM node:11.15.0
+
+WORKDIR /spacy-io
+
+RUN npm install -g gatsby-cli@2.7.4
+
+COPY package.json .
+COPY package-lock.json .
+
+RUN npm install
+
+# This is so the installed node_modules will be up one directory
+# from where a user mounts files, so that they don't accidentally mount
+# their own node_modules from a different build
+# https://nodejs.org/api/modules.html#modules_loading_from_node_modules_folders
+WORKDIR /spacy-io/website/
diff --git a/website/README.md b/website/README.md
index 076032d92..db050cf03 100644
--- a/website/README.md
+++ b/website/README.md
@@ -554,6 +554,42 @@ extensions for your code editor. The
[`.prettierrc`](https://github.com/explosion/spaCy/tree/master/website/.prettierrc)
file in the root defines the settings used in this codebase.
+## Building & developing the site with Docker {#docker}
+
+Sometimes it's hard to get a local environment working due to rapid updates to
+node dependencies, so it may be easier to use Docker for building the docs.
+
+If you'd like to do this, **be sure you do *not* include your local
+`node_modules` folder**, since it contains dependencies built for your host
+system rather than for the image. Rename it before mounting.
+
+```bash
+docker run -it \
+ -v $(pwd):/spacy-io/website \
+ -p 8000:8000 \
+ ghcr.io/explosion/spacy-io \
+ gatsby develop -H 0.0.0.0
+```
+
+This will allow you to access the built website at http://0.0.0.0:8000/
+in your browser, and still edit code in your editor while having the site
+reflect those changes.
+
+**Note**: If you're working on a Mac with an M1 processor, you might see
+segfault errors from `qemu` if you use the default image. To fix this, use
+the `arm64`-tagged image in the `docker run` command
+(`ghcr.io/explosion/spacy-io:arm64`).
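+
+For example, the same command as above with the `arm64` image:
+
+```bash
+docker run -it \
+ -v $(pwd):/spacy-io/website \
+ -p 8000:8000 \
+ ghcr.io/explosion/spacy-io:arm64 \
+ gatsby develop -H 0.0.0.0
+```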
+
+### Building the Docker image {#docker-build}
+
+If you'd like to build the image locally, you can do so like this:
+
+```bash
+docker build -t spacy-io .
+```
+
+This will take some time, so if you want to save a bit of time, use the prebuilt image instead.
+
## Markdown reference {#markdown}
All page content and page meta lives in the `.md` files in the `/docs`
diff --git a/website/docs/api/span.md b/website/docs/api/span.md
index 7ecebf93e..ff7905bc0 100644
--- a/website/docs/api/span.md
+++ b/website/docs/api/span.md
@@ -257,8 +257,8 @@ shape `(N, M)`, where `N` is the length of the document. The values will be
## Span.ents {#ents tag="property" new="2.0.13" model="ner"}
-The named entities in the span. Returns a tuple of named entity `Span` objects,
-if the entity recognizer has been applied.
+The named entities that fall completely within the span. Returns a tuple of
+`Span` objects.
> #### Example
>
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index f748fa8d6..f8baf5588 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -831,6 +831,8 @@ def tokenizer_pseudo_code(
infixes = infix_finditer(substring)
offset = 0
for match in infixes:
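+        # skip an infix match at the start of the substring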
+ if offset == 0 and match.start() == 0:
+ continue
tokens.append(substring[offset : match.start()])
tokens.append(substring[match.start() : match.end()])
offset = match.end()
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 0fde2d612..b1a61598e 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -141,7 +141,8 @@
"website": "https://www.nr.no/~plison"
},
"category": ["pipeline", "standalone", "research", "training"],
- "tags": []
+ "tags": [],
+ "spacy_version": 3
},
{
"id": "numerizer",
@@ -977,6 +978,48 @@
"category": ["pipeline"],
"tags": ["pipeline", "danish"]
},
+ {
+ "id": "spacy-wrap",
+ "title": "spaCy-wrap",
+ "slogan": "For Wrapping fine-tuned transformers in spaCy pipelines",
+ "description": "spaCy-wrap is a wrapper library for spaCy for including fine-tuned transformers from Huggingface in your spaCy pipeline allowing inclusion of existing models within existing workflows.",
+ "github": "kennethenevoldsen/spacy-wrap",
+ "pip": "spacy_wrap",
+ "code_example": [
+ "import spacy",
+ "import spacy_wrap",
+ "",
+ "nlp = spacy.blank('en')",
+ "config = {",
+ " 'doc_extension_trf_data': 'clf_trf_data', # document extention for the forward pass",
+ " 'doc_extension_prediction': 'sentiment', # document extention for the prediction",
+ " 'labels': ['negative', 'neutral', 'positive'],",
+ " 'model': {",
+ " 'name': 'cardiffnlp/twitter-roberta-base-sentiment', # the model name or path of huggingface model",
+ "},",
+ "}",
+ "",
+ "transformer = nlp.add_pipe('classification_transformer', config=config)",
+ "transformer.model.initialize()",
+ "",
+ "doc = nlp('spaCy is a wonderful tool')",
+ "",
+ "print(doc._.clf_trf_data)",
+ "# TransformerData(wordpieces=...",
+ "print(doc._.sentiment)",
+ "# 'positive'",
+ "print(doc._.sentiment_prob)",
+ "# {'prob': array([0.004, 0.028, 0.969], dtype=float32), 'labels': ['negative', 'neutral', 'positive']}"
+ ],
+ "thumb": "https://raw.githubusercontent.com/KennethEnevoldsen/spacy-wrap/main/docs/_static/icon.png",
+ "author": "Kenneth Enevoldsen",
+ "author_links": {
+ "github": "KennethEnevoldsen",
+ "website": "https://www.kennethenevoldsen.com"
+ },
+ "category": ["pipeline", "models", "training"],
+ "tags": ["pipeline", "models", "transformers"]
+ },
{
"id": "textdescriptives",
"title": "TextDescriptives",
diff --git a/website/src/templates/universe.js b/website/src/templates/universe.js
index cfc8fdd0e..10f2520d9 100644
--- a/website/src/templates/universe.js
+++ b/website/src/templates/universe.js
@@ -8,10 +8,11 @@ import Title from '../components/title'
import Grid from '../components/grid'
import Button from '../components/button'
import Icon from '../components/icon'
+import Tag from '../components/tag'
import CodeBlock, { InlineCode } from '../components/code'
import Aside from '../components/aside'
import Sidebar from '../components/sidebar'
-import Section from '../components/section'
+import Section, { Hr } from '../components/section'
import Main from '../components/main'
import Footer from '../components/footer'
import { H3, H5, Label, InlineList } from '../components/typography'
@@ -121,6 +122,18 @@ const UniverseContent = ({ content = [], categories, theme, pageContext, mdxComp
)}
+
+ <Hr />
+ <p>
+ Found a mistake or something isn't working? If you've come across a
+ universe project that isn't working or is incompatible with the
+ reported spaCy version, let us know by{' '}
+ <Link to="https://github.com/explosion/spaCy/discussions">
+ opening a discussion thread
+ </Link>
+ .
+ </p>
@@ -168,25 +181,41 @@ UniverseContent.propTypes = {
mdxComponents: PropTypes.object,
}
+const SpaCyVersion = ({ version }) => {
+ const versions = !Array.isArray(version) ? [version] : version
+ return versions.map((v, i) => (
+ <>
+ <Tag>spaCy v{v}</Tag>{' '}
+ </>
+ ))
+}
+
- <p>
- {[
- `release/${data.github}/all.svg?style=flat-square`,
- `license/${data.github}.svg?style=flat-square`,
- `stars/${data.github}.svg?style=social&label=Stars`,
- ].map((url, i) => (
- <img src={`https://img.shields.io/github/${url}`} key={i} alt="" />
- ))}
- </p>
+ {data.spacy_version &&