Merge pull request #10215 from explosion/master

update develop
Sofie Van Landeghem authored on 2022-02-06 13:45:41 +01:00, committed by GitHub
commit 14513f82da
25 changed files with 325 additions and 61 deletions


@ -699,9 +699,7 @@ def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
return count
def _get_labels_from_model(
nlp: Language, factory_name: str
) -> Set[str]:
def _get_labels_from_model(nlp: Language, factory_name: str) -> Set[str]:
pipe_names = [
pipe_name
for pipe_name in nlp.pipe_names
@ -714,9 +712,7 @@ def _get_labels_from_model(
return labels
def _get_labels_from_spancat(
nlp: Language
) -> Dict[str, Set[str]]:
def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
pipe_names = [
pipe_name
for pipe_name in nlp.pipe_names


@ -6,6 +6,11 @@ can help generate the best possible configuration, given a user's requirements.
[paths]
train = null
dev = null
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
vectors = null
{% else -%}
vectors = "{{ word_vectors }}"
{% endif -%}
[system]
{% if use_transformer -%}
@ -421,8 +426,4 @@ compound = 1.001
{% endif %}
[initialize]
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
vectors = ${paths.vectors}
{% else -%}
vectors = "{{ word_vectors }}"
{% endif -%}
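
The template now defines the word-vectors path once under `[paths]` and has `[initialize]` reference it via interpolation. A minimal sketch of how that reference resolves, using thinc's config loader (the vectors value below is only a placeholder):

```python
from thinc.api import Config

# minimal config mirroring the template's [paths] -> [initialize] indirection;
# "en_core_web_md" is just a placeholder value
cfg_text = """
[paths]
vectors = "en_core_web_md"

[initialize]
vectors = ${paths.vectors}
"""

config = Config().from_str(cfg_text, interpolate=True)
print(config["initialize"]["vectors"])  # "en_core_web_md"
```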


@ -90,7 +90,7 @@ _eleven_to_beyond = [
"अड़सठ",
"उनहत्तर",
"सत्तर",
"इकहत्तर"
"इकहत्तर",
"बहत्तर",
"तिहत्तर",
"चौहत्तर",


@ -59,7 +59,7 @@ sentences = [
"Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?",
"Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”.",
"Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares.",
"Carros autônomos empurram a responsabilidade do seguro para os fabricantes.."
"Carros autônomos empurram a responsabilidade do seguro para os fabricantes..",
"São Francisco considera banir os robôs de entrega que andam pelas calçadas.",
"Londres é a maior cidade do Reino Unido.",
# Translations from English:


@ -354,12 +354,15 @@ class Language:
@property
def pipe_labels(self) -> Dict[str, List[str]]:
"""Get the labels set by the pipeline components, if available (if
the component exposes a labels property).
the component exposes a labels property and the labels are not
hidden).
RETURNS (Dict[str, List[str]]): Labels keyed by component name.
"""
labels = {}
for name, pipe in self._components:
if hasattr(pipe, "hide_labels") and pipe.hide_labels is True:
continue
if hasattr(pipe, "labels"):
labels[name] = list(pipe.labels)
return SimpleFrozenDict(labels)
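
With this change, `Language.pipe_labels` skips any component that sets `hide_labels`. A minimal sketch of the resulting behaviour, assuming a blank pipeline with a `senter` component added:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("senter")

# senter reports its fixed labels on the component itself...
print(nlp.get_pipe("senter").labels)  # ("I", "S")
# ...but hides them from the pipeline-level summary
print("senter" in nlp.pipe_labels)    # False
```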
@ -522,7 +525,7 @@ class Language:
requires: Iterable[str] = SimpleFrozenList(),
retokenizes: bool = False,
func: Optional["Pipe"] = None,
) -> Callable:
) -> Callable[..., Any]:
"""Register a new pipeline component. Can be used for stateless function
components that don't require a separate factory. Can be used as a
decorator on a function or classmethod, or called as a function with the

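
The decorator's return annotation is tightened to `Callable[..., Any]`. For context, a hedged sketch of the stateless function components it registers (the component name below is hypothetical):

```python
import spacy
from spacy.language import Language
from spacy.tokens import Doc

@Language.component("debug_token_count")  # hypothetical component name
def debug_token_count(doc: Doc) -> Doc:
    # a stateless function component: receives the Doc and must return it
    print("tokens:", len(doc))
    return doc

nlp = spacy.blank("en")
nlp.add_pipe("debug_token_count")
nlp("Registered via the @Language.component decorator.")
```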

@ -0,0 +1,66 @@
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from .matcher import Matcher
from ..vocab import Vocab
from ..tokens.doc import Doc
from ..tokens.span import Span
class DependencyMatcher:
"""Match dependency parse tree based on pattern rules."""
_patterns: Dict[str, List[Any]]
_raw_patterns: Dict[str, List[Any]]
_tokens_to_key: Dict[str, List[Any]]
_root: Dict[str, List[Any]]
_tree: Dict[str, List[Any]]
_callbacks: Dict[
Any, Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any]
]
_ops: Dict[str, Any]
vocab: Vocab
_matcher: Matcher
def __init__(self, vocab: Vocab, *, validate: bool = ...) -> None: ...
def __reduce__(
self,
) -> Tuple[
Callable[
[Vocab, Dict[str, Any], Dict[str, Callable[..., Any]]], DependencyMatcher
],
Tuple[
Vocab,
Dict[str, List[Any]],
Dict[
str,
Callable[
[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any
],
],
],
None,
None,
]: ...
def __len__(self) -> int: ...
def __contains__(self, key: Union[str, int]) -> bool: ...
def add(
self,
key: Union[str, int],
patterns: List[List[Dict[str, Any]]],
*,
on_match: Optional[
Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any]
] = ...
) -> None: ...
def has_key(self, key: Union[str, int]) -> bool: ...
def get(
self, key: Union[str, int], default: Optional[Any] = ...
) -> Tuple[
Optional[
Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any]
],
List[List[Dict[str, Any]]],
]: ...
def remove(self, key: Union[str, int]) -> None: ...
def __call__(self, doclike: Union[Doc, Span]) -> List[Tuple[int, List[int]]]: ...
def unpickle_matcher(
vocab: Vocab, patterns: Dict[str, Any], callbacks: Dict[str, Callable[..., Any]]
) -> DependencyMatcher: ...
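
This new stub types the `DependencyMatcher` API. As a reminder of the runtime usage it describes, a small sketch (assuming a parser-enabled model such as `en_core_web_sm` is installed):

```python
import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.load("en_core_web_sm")  # assumes this model is installed
matcher = DependencyMatcher(nlp.vocab)

# anchor on a verb and require a nominal subject attached to it
pattern = [
    {"RIGHT_ID": "verb", "RIGHT_ATTRS": {"POS": "VERB"}},
    {
        "LEFT_ID": "verb",
        "REL_OP": ">",
        "RIGHT_ID": "subject",
        "RIGHT_ATTRS": {"DEP": "nsubj"},
    },
]
matcher.add("VERB_SUBJECT", [pattern])

doc = nlp("Smith founded a healthcare company.")
matches = matcher(doc)  # list of (match_id, [token_id, ...]) tuples
```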


@ -1,4 +1,6 @@
from typing import Any, List, Dict, Tuple, Optional, Callable, Union, Iterator, Iterable
from typing import Any, List, Dict, Tuple, Optional, Callable, Union
from typing import Iterator, Iterable, overload
from ..compat import Literal
from ..vocab import Vocab
from ..tokens import Doc, Span
@ -31,12 +33,22 @@ class Matcher:
) -> Union[
Iterator[Tuple[Tuple[Doc, Any], Any]], Iterator[Tuple[Doc, Any]], Iterator[Doc]
]: ...
@overload
def __call__(
self,
doclike: Union[Doc, Span],
*,
as_spans: bool = ...,
as_spans: Literal[False] = ...,
allow_missing: bool = ...,
with_alignments: bool = ...
) -> Union[List[Tuple[int, int, int]], List[Span]]: ...
) -> List[Tuple[int, int, int]]: ...
@overload
def __call__(
self,
doclike: Union[Doc, Span],
*,
as_spans: Literal[True],
allow_missing: bool = ...,
with_alignments: bool = ...
) -> List[Span]: ...
def _normalize_key(self, key: Any) -> Any: ...
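
The `Literal` overloads let type checkers pick the return type from the value passed to `as_spans`. A hedged sketch of the two call shapes at runtime:

```python
from typing import List, Tuple

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("HELLO_WORLD", [[{"LOWER": "hello"}, {"LOWER": "world"}]])
doc = nlp("hello world, hello world")

# default as_spans=False: (match_id, start, end) tuples
tuples: List[Tuple[int, int, int]] = matcher(doc)

# as_spans=True: Span objects, matching the Literal[True] overload
spans: List[Span] = matcher(doc, as_spans=True)
```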


@ -1,6 +1,6 @@
from typing import List, Tuple, Union, Optional, Callable, Any, Dict
from . import Matcher
from typing import List, Tuple, Union, Optional, Callable, Any, Dict, overload
from ..compat import Literal
from .matcher import Matcher
from ..vocab import Vocab
from ..tokens import Doc, Span
@ -21,9 +21,17 @@ class PhraseMatcher:
] = ...,
) -> None: ...
def remove(self, key: str) -> None: ...
@overload
def __call__(
self,
doclike: Union[Doc, Span],
*,
as_spans: bool = ...,
) -> Union[List[Tuple[int, int, int]], List[Span]]: ...
as_spans: Literal[False] = ...,
) -> List[Tuple[int, int, int]]: ...
@overload
def __call__(
self,
doclike: Union[Doc, Span],
*,
as_spans: Literal[True],
) -> List[Span]: ...
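
The same overload pattern is applied to `PhraseMatcher`; a short sketch of the corresponding calls:

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
matcher.add("FRAMEWORKS", [nlp.make_doc("machine learning")])
doc = nlp("Machine learning keeps coming up.")

print(matcher(doc))                 # [(match_id, start, end)]
print(matcher(doc, as_spans=True))  # [Span for "Machine learning"]
```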


@ -26,6 +26,8 @@ class Pipe:
@property
def labels(self) -> Tuple[str, ...]: ...
@property
def hide_labels(self) -> bool: ...
@property
def label_data(self) -> Any: ...
def _require_labels(self) -> None: ...
def set_error_handler(


@ -102,6 +102,10 @@ cdef class Pipe:
def labels(self) -> Tuple[str, ...]:
return tuple()
@property
def hide_labels(self) -> bool:
return False
@property
def label_data(self):
"""Optional JSON-serializable data that would be sufficient to recreate


@ -99,6 +99,10 @@ class SentenceRecognizer(Tagger):
# are 0
return tuple(["I", "S"])
@property
def hide_labels(self):
return True
@property
def label_data(self):
return None


@ -413,7 +413,7 @@ class SpanCategorizer(TrainablePipe):
self._require_labels()
if subbatch:
docs = [eg.x for eg in subbatch]
spans = self.suggester(docs)
spans = build_ngram_suggester(sizes=[1])(docs)
Y = self.model.ops.alloc2f(spans.dataXd.shape[0], len(self.labels))
self.model.initialize(X=(docs, spans), Y=Y)
else:


@ -97,3 +97,7 @@ def test_overfitting_IO():
]
assert_equal(batch_deps_1, batch_deps_2)
assert_equal(batch_deps_1, no_batch_deps)
# test internal pipe labels vs. Language.pipe_labels with hidden labels
assert nlp.get_pipe("senter").labels == ("I", "S")
assert "senter" not in nlp.pipe_labels


@ -79,7 +79,8 @@ def test_explicit_labels():
nlp.initialize()
assert spancat.labels == ("PERSON", "LOC")
#TODO figure out why this is flaky
# TODO figure out why this is flaky
@pytest.mark.skip(reason="Test is unreliable for unknown reason")
def test_doc_gc():
# If the Doc object is garbage collected, the spans won't be functional afterwards


@ -9,6 +9,7 @@ from spacy.tokenizer import Tokenizer
from spacy.tokens import Doc
from spacy.training import Example
from spacy.util import compile_prefix_regex, compile_suffix_regex, ensure_path
from spacy.util import compile_infix_regex
from spacy.vocab import Vocab
from spacy.symbols import ORTH
@ -503,3 +504,20 @@ def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab):
assert tokens == ["a", "10", "."]
explain_tokens = [t[1] for t in tokenizer.explain("a10.")]
assert tokens == explain_tokens
def test_tokenizer_infix_prefix(en_vocab):
# an infix match at the start of the string should not be split off as an infix
infixes = ["±"]
suffixes = ["%"]
infix_re = compile_infix_regex(infixes)
suffix_re = compile_suffix_regex(suffixes)
tokenizer = Tokenizer(
en_vocab,
infix_finditer=infix_re.finditer,
suffix_search=suffix_re.search,
)
tokens = [t.text for t in tokenizer("±10%")]
assert tokens == ["±10", "%"]
explain_tokens = [t[1] for t in tokenizer.explain("±10%")]
assert tokens == explain_tokens


@ -683,6 +683,8 @@ cdef class Tokenizer:
infixes = infix_finditer(substring)
offset = 0
for match in infixes:
if offset == 0 and match.start() == 0:
continue
if substring[offset : match.start()]:
tokens.append(("TOKEN", substring[offset : match.start()]))
if substring[match.start() : match.end()]:


@ -10,7 +10,7 @@ from ..lexeme import Lexeme
from ..vocab import Vocab
from .underscore import Underscore
from pathlib import Path
import numpy
import numpy as np
class DocMethod(Protocol):
def __call__(self: Doc, *args: Any, **kwargs: Any) -> Any: ... # type: ignore[misc]
@ -26,7 +26,7 @@ class Doc:
user_hooks: Dict[str, Callable[..., Any]]
user_token_hooks: Dict[str, Callable[..., Any]]
user_span_hooks: Dict[str, Callable[..., Any]]
tensor: numpy.ndarray
tensor: np.ndarray[Any, np.dtype[np.float_]]
user_data: Dict[str, Any]
has_unknown_spaces: bool
_context: Any
@ -144,7 +144,7 @@ class Doc:
) -> Doc: ...
def to_array(
self, py_attr_ids: Union[int, str, List[Union[int, str]]]
) -> numpy.ndarray: ...
) -> np.ndarray[Any, np.dtype[np.float_]]: ...
@staticmethod
def from_docs(
docs: List[Doc],


@ -459,8 +459,8 @@ cdef class Span:
@property
def ents(self):
"""The named entities in the span. Returns a tuple of named entity
`Span` objects, if the entity recognizer has been applied.
"""The named entities that fall completely within the span. Returns
a tuple of `Span` objects.
RETURNS (tuple): Entities in the span, one `Span` per entity.
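
The reworded docstring stresses that only entities falling completely within the span are returned. A minimal sketch of the distinction, assuming a model whose NER predicts "Apple" and "San Francisco" as entities:

```python
import spacy

nlp = spacy.load("en_core_web_sm")  # assumes a model with an NER component
doc = nlp("Apple is opening a store in San Francisco next year.")
span = doc[0:7]  # "Apple is opening a store in San"

# "San Francisco" crosses the span boundary, so it is excluded;
# given the assumed predictions, only "Apple" is returned
print([ent.text for ent in span.ents])  # ["Apple"]
```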


@ -1,17 +1,31 @@
from typing import Dict, Any
from typing import Dict, Any, List, Optional, Tuple, Union, TYPE_CHECKING
import functools
import copy
from ..errors import Errors
if TYPE_CHECKING:
from .doc import Doc
from .span import Span
from .token import Token
class Underscore:
mutable_types = (dict, list, set)
doc_extensions: Dict[Any, Any] = {}
span_extensions: Dict[Any, Any] = {}
token_extensions: Dict[Any, Any] = {}
_extensions: Dict[str, Any]
_obj: Union["Doc", "Span", "Token"]
_start: Optional[int]
_end: Optional[int]
def __init__(self, extensions, obj, start=None, end=None):
def __init__(
self,
extensions: Dict[str, Any],
obj: Union["Doc", "Span", "Token"],
start: Optional[int] = None,
end: Optional[int] = None,
):
object.__setattr__(self, "_extensions", extensions)
object.__setattr__(self, "_obj", obj)
# Assumption is that for doc values, _start and _end will both be None
@ -23,12 +37,12 @@ class Underscore:
object.__setattr__(self, "_start", start)
object.__setattr__(self, "_end", end)
def __dir__(self):
def __dir__(self) -> List[str]:
# Hack to enable autocomplete on custom extensions
extensions = list(self._extensions.keys())
return ["set", "get", "has"] + extensions
def __getattr__(self, name):
def __getattr__(self, name: str) -> Any:
if name not in self._extensions:
raise AttributeError(Errors.E046.format(name=name))
default, method, getter, setter = self._extensions[name]
@ -56,7 +70,7 @@ class Underscore:
return new_default
return default
def __setattr__(self, name, value):
def __setattr__(self, name: str, value: Any):
if name not in self._extensions:
raise AttributeError(Errors.E047.format(name=name))
default, method, getter, setter = self._extensions[name]
@ -65,28 +79,30 @@ class Underscore:
else:
self._doc.user_data[self._get_key(name)] = value
def set(self, name, value):
def set(self, name: str, value: Any):
return self.__setattr__(name, value)
def get(self, name):
def get(self, name: str) -> Any:
return self.__getattr__(name)
def has(self, name):
def has(self, name: str) -> bool:
return name in self._extensions
def _get_key(self, name):
def _get_key(self, name: str) -> Tuple[str, str, Optional[int], Optional[int]]:
return ("._.", name, self._start, self._end)
@classmethod
def get_state(cls):
def get_state(cls) -> Tuple[Dict[Any, Any], Dict[Any, Any], Dict[Any, Any]]:
return cls.token_extensions, cls.span_extensions, cls.doc_extensions
@classmethod
def load_state(cls, state):
def load_state(
cls, state: Tuple[Dict[Any, Any], Dict[Any, Any], Dict[Any, Any]]
) -> None:
cls.token_extensions, cls.span_extensions, cls.doc_extensions = state
def get_ext_args(**kwargs):
def get_ext_args(**kwargs: Any):
"""Validate and convert arguments. Reused in Doc, Token and Span."""
default = kwargs.get("default")
getter = kwargs.get("getter")
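
`Underscore` backs the `._` extension attributes that the newly added type hints describe. A small sketch of the user-facing API it supports (the attribute name is hypothetical):

```python
import spacy
from spacy.tokens import Doc

# register a custom attribute; values are stored per Doc and exposed via doc._
Doc.set_extension("review_score", default=None)  # hypothetical attribute name

nlp = spacy.blank("en")
doc = nlp("Underscore backs the ._ attribute space.")
doc._.review_score = 0.9
print(doc._.review_score)         # 0.9
print(doc._.has("review_score"))  # True
```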

website/Dockerfile (new file)

@ -0,0 +1,16 @@
FROM node:11.15.0
WORKDIR /spacy-io
RUN npm install -g gatsby-cli@2.7.4
COPY package.json .
COPY package-lock.json .
RUN npm install
# This is so the installed node_modules will be up one directory
# from where a user mounts files, so that they don't accidentally mount
# their own node_modules from a different build
# https://nodejs.org/api/modules.html#modules_loading_from_node_modules_folders
WORKDIR /spacy-io/website/


@ -554,6 +554,42 @@ extensions for your code editor. The
[`.prettierrc`](https://github.com/explosion/spaCy/tree/master/website/.prettierrc)
file in the root defines the settings used in this codebase.
## Building & developing the site with Docker {#docker}
Sometimes it's hard to get a local environment working due to rapid updates to node dependencies,
so it may be easier to use Docker to build the docs.
If you'd like to do this,
**be sure you do *not* include your local `node_modules` folder**,
since some of the dependencies need to be built for the image's system.
Rename it before building or running the image.
```bash
docker run -it \
-v $(pwd):/spacy-io/website \
-p 8000:8000 \
ghcr.io/explosion/spacy-io \
gatsby develop -H 0.0.0.0
```
This will allow you to access the built website at http://0.0.0.0:8000/
in your browser, and still edit code in your editor while having the site
reflect those changes.
**Note**: If you're working on a Mac with an M1 processor,
you might see segfault errors from `qemu` if you use the default image.
To fix this, use the `arm64`-tagged image in the `docker run` command
(`ghcr.io/explosion/spacy-io:arm64`).
### Building the Docker image {#docker-build}
If you'd like to build the image locally, you can do so like this:
```bash
docker build -t spacy-io .
```
This will take some time, so using the prebuilt image will save you a bit of time.
## Markdown reference {#markdown}
All page content and page meta lives in the `.md` files in the `/docs`


@ -257,8 +257,8 @@ shape `(N, M)`, where `N` is the length of the document. The values will be
## Span.ents {#ents tag="property" new="2.0.13" model="ner"}
The named entities in the span. Returns a tuple of named entity `Span` objects,
if the entity recognizer has been applied.
The named entities that fall completely within the span. Returns a tuple of
`Span` objects.
> #### Example
>


@ -831,6 +831,8 @@ def tokenizer_pseudo_code(
infixes = infix_finditer(substring)
offset = 0
for match in infixes:
if offset == 0 and match.start() == 0:
continue
tokens.append(substring[offset : match.start()])
tokens.append(substring[match.start() : match.end()])
offset = match.end()


@ -141,7 +141,8 @@
"website": "https://www.nr.no/~plison"
},
"category": ["pipeline", "standalone", "research", "training"],
"tags": []
"tags": [],
"spacy_version": 3
},
{
"id": "numerizer",
@ -977,6 +978,48 @@
"category": ["pipeline"],
"tags": ["pipeline", "danish"]
},
{
"id": "spacy-wrap",
"title": "spaCy-wrap",
"slogan": "For Wrapping fine-tuned transformers in spaCy pipelines",
"description": "spaCy-wrap is a wrapper library for spaCy for including fine-tuned transformers from Huggingface in your spaCy pipeline allowing inclusion of existing models within existing workflows.",
"github": "kennethenevoldsen/spacy-wrap",
"pip": "spacy_wrap",
"code_example": [
"import spacy",
"import spacy_wrap",
"",
"nlp = spacy.blank('en')",
"config = {",
" 'doc_extension_trf_data': 'clf_trf_data', # document extention for the forward pass",
" 'doc_extension_prediction': 'sentiment', # document extention for the prediction",
" 'labels': ['negative', 'neutral', 'positive'],",
" 'model': {",
" 'name': 'cardiffnlp/twitter-roberta-base-sentiment', # the model name or path of huggingface model",
"},",
"}",
"",
"transformer = nlp.add_pipe('classification_transformer', config=config)",
"transformer.model.initialize()",
"",
"doc = nlp('spaCy is a wonderful tool')",
"",
"print(doc._.clf_trf_data)",
"# TransformerData(wordpieces=...",
"print(doc._.sentiment)",
"# 'positive'",
"print(doc._.sentiment_prob)",
"# {'prob': array([0.004, 0.028, 0.969], dtype=float32), 'labels': ['negative', 'neutral', 'positive']}"
],
"thumb": "https://raw.githubusercontent.com/KennethEnevoldsen/spacy-wrap/main/docs/_static/icon.png",
"author": "Kenneth Enevoldsen",
"author_links": {
"github": "KennethEnevoldsen",
"website": "https://www.kennethenevoldsen.com"
},
"category": ["pipeline", "models", "training"],
"tags": ["pipeline", "models", "transformers"]
},
{
"id": "textdescriptives",
"title": "TextDescriptives",


@ -8,10 +8,11 @@ import Title from '../components/title'
import Grid from '../components/grid'
import Button from '../components/button'
import Icon from '../components/icon'
import Tag from '../components/tag'
import CodeBlock, { InlineCode } from '../components/code'
import Aside from '../components/aside'
import Sidebar from '../components/sidebar'
import Section from '../components/section'
import Section, { Hr } from '../components/section'
import Main from '../components/main'
import Footer from '../components/footer'
import { H3, H5, Label, InlineList } from '../components/typography'
@ -121,6 +122,18 @@ const UniverseContent = ({ content = [], categories, theme, pageContext, mdxComp
</Grid>
</Section>
)}
<section className="search-exclude">
<H3>Found a mistake or something isn't working?</H3>
<p>
If you've come across a universe project that isn't working or is
incompatible with the reported spaCy version, let us know by{' '}
<Link to="https://github.com/explosion/spaCy/discussions/new">
opening a discussion thread
</Link>
.
</p>
</section>
<Hr />
<section className="search-exclude">
<H3>Submit your project</H3>
<p>
@ -168,11 +181,22 @@ UniverseContent.propTypes = {
mdxComponents: PropTypes.object,
}
const SpaCyVersion = ({ version }) => {
const versions = !Array.isArray(version) ? [version] : version
return versions.map((v, i) => (
<>
<Tag tooltip={`This project is compatible with spaCy v${v}`}>spaCy v{v}</Tag>{' '}
</>
))
}
const Project = ({ data, components }) => (
<>
<Title title={data.title || data.id} teaser={data.slogan} image={data.thumb}>
{data.github && (
{(data.github || data.spacy_version) && (
<p>
{data.spacy_version && <SpaCyVersion version={data.spacy_version} />}
{data.github && (
<Link to={`https://github.com/${data.github}`} hidden>
{[
`release/${data.github}/all.svg?style=flat-square`,
@ -180,13 +204,18 @@ const Project = ({ data, components }) => (
`stars/${data.github}.svg?style=social&label=Stars`,
].map((url, i) => (
<img
style={{ borderRadius: '1em', marginRight: '0.5rem' }}
style={{
borderRadius: '1em',
marginRight: '0.5rem',
verticalAlign: 'middle',
}}
key={i}
src={`https://img.shields.io/github/${url}`}
alt=""
/>
))}
</Link>
)}
</p>
)}
</Title>
@ -335,6 +364,7 @@ const query = graphql`
url
github
description
spacy_version
pip
cran
category