Mirror of https://github.com/explosion/spaCy.git, synced 2025-08-06 21:30:22 +03:00
Merge remote-tracking branch 'upstream/master' into chore/update-v4-from-master-3
Commit 79c11de0c4

.github/workflows/autoblack.yml
@@ -12,10 +12,10 @@ jobs:
     if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
        with:
          ref: ${{ github.head_ref }}
-      - uses: actions/setup-python@v2
+      - uses: actions/setup-python@v3
      - run: pip install black
      - name: Auto-format code if needed
        run: black spacy
@@ -23,10 +23,11 @@ jobs:
        # code and makes GitHub think the action failed
      - name: Check for modified files
        id: git-check
-        run: echo ::set-output name=modified::$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi)
+        run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT

      - name: Create Pull Request
        if: steps.git-check.outputs.modified == 'true'
-        uses: peter-evans/create-pull-request@v3
+        uses: peter-evans/create-pull-request@v4
        with:
          title: Auto-format code with black
          labels: meta
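The `::set-output` command replaced in the second hunk is deprecated in GitHub Actions; steps now append `name=value` lines to the file named by the `GITHUB_OUTPUT` environment variable, which is what the new `run:` line does. A minimal sketch of the same check written as a Python step rather than inline shell (only the `modified` output name comes from the workflow above; the rest is illustrative):

    import os
    import subprocess

    # `git diff-index --quiet HEAD --` exits non-zero when the working tree has changes.
    modified = subprocess.run(["git", "diff-index", "--quiet", "HEAD", "--"]).returncode != 0

    # GITHUB_OUTPUT names a file; each appended "name=value" line becomes a step output,
    # readable later as steps.<step-id>.outputs.modified.
    with open(os.environ["GITHUB_OUTPUT"], "a", encoding="utf-8") as fh:
        fh.write(f"modified={'true' if modified else 'false'}\n")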
@@ -8,7 +8,7 @@ be used in real products.

 spaCy comes with
 [pretrained pipelines](https://spacy.io/models) and
-currently supports tokenization and training for **60+ languages**. It features
+currently supports tokenization and training for **70+ languages**. It features
 state-of-the-art speed and **neural network models** for tagging,
 parsing, **named entity recognition**, **text classification** and more,
 multi-task learning with pretrained **transformers** like BERT, as well as a
@@ -16,7 +16,7 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
 model packaging, deployment and workflow management. spaCy is commercial
 open-source software, released under the MIT license.

-💫 **Version 3.4.0 out now!**
+💫 **Version 3.4 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)

 [](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
@@ -79,7 +79,7 @@ more people can benefit from it.

 ## Features

-- Support for **60+ languages**
+- Support for **70+ languages**
 - **Trained pipelines** for different languages and tasks
 - Multi-task learning with pretrained **transformers** like BERT
 - Support for pretrained **word vectors** and embeddings
@@ -76,15 +76,15 @@ jobs:
         # Python39Mac:
         #   imageName: "macos-latest"
         #   python.version: "3.9"
-        Python310Linux:
-          imageName: "ubuntu-latest"
-          python.version: "3.10"
+        # Python310Linux:
+        #   imageName: "ubuntu-latest"
+        #   python.version: "3.10"
         Python310Windows:
           imageName: "windows-latest"
           python.version: "3.10"
-        Python310Mac:
-          imageName: "macos-latest"
-          python.version: "3.10"
+        # Python310Mac:
+        #   imageName: "macos-latest"
+        #   python.version: "3.10"
+        Python311Linux:
+          imageName: 'ubuntu-latest'
+          python.version: '3.11.0-rc.2'
@@ -71,11 +71,10 @@ def span_maker_forward(model, docs: List[Doc], is_train) -> Tuple[Ragged, Callab
                 cands.append((start_token, end_token))

         candidates.append(ops.asarray2i(cands))
-    candlens = ops.asarray1i([len(cands) for cands in candidates])
-    candidates = ops.xp.concatenate(candidates)
-    outputs = Ragged(candidates, candlens)
+    lengths = model.ops.asarray1i([len(cands) for cands in candidates])
+    out = Ragged(model.ops.flatten(candidates), lengths)
     # because this is just rearranging docs, the backprop does nothing
-    return outputs, lambda x: []
+    return out, lambda x: []


 @registry.misc("spacy.KBFromFile.v1")
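For context, Thinc's `Ragged` pairs one flattened data array with a per-item lengths array, which is what the rewritten lines build with `ops.flatten`. A small standalone sketch of that layout (the candidate offsets are made up; only the `Ragged` and `flatten` usage mirrors the hunk above):

    from thinc.api import get_current_ops
    from thinc.types import Ragged

    ops = get_current_ops()

    # One (start, end) offset array per doc; the second doc has no candidate spans.
    candidates = [
        ops.asarray2i([[0, 1], [2, 5]]),
        ops.xp.zeros((0, 2), dtype="int32"),
    ]

    lengths = ops.asarray1i([len(cands) for cands in candidates])  # -> [2, 0]
    out = Ragged(ops.flatten(candidates), lengths)  # flattened data, one length per doc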
@@ -27,8 +27,8 @@ single_label_default_config = """
 [model.tok2vec.embed]
 @architectures = "spacy.MultiHashEmbed.v2"
 width = 64
-rows = [2000, 2000, 1000, 1000, 1000, 1000]
-attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+rows = [2000, 2000, 500, 1000, 500]
+attrs = ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"]
 include_static_vectors = false

 [model.tok2vec.encode]
@@ -24,8 +24,8 @@ multi_label_default_config = """
 [model.tok2vec.embed]
 @architectures = "spacy.MultiHashEmbed.v2"
 width = 64
-rows = [2000, 2000, 1000, 1000, 1000, 1000]
-attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+rows = [2000, 2000, 500, 1000, 500]
+attrs = ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"]
 include_static_vectors = false

 [model.tok2vec.encode]
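In `spacy.MultiHashEmbed.v2`, `rows` and `attrs` are parallel lists: `rows[i]` sets the number of hash-embedding rows for `attrs[i]`, so both hunks shrink the two lists together and swap `ORTH`/`ID` for `NORM`. A small sketch of resolving such an embed block on its own, assuming the same config text as above (the surrounding Python is illustrative):

    from thinc.api import Config
    from spacy.util import registry

    cfg = Config().from_str("""
    [model]
    @architectures = "spacy.MultiHashEmbed.v2"
    width = 64
    rows = [2000, 2000, 500, 1000, 500]
    attrs = ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"]
    include_static_vectors = false
    """)

    # Resolving the block yields a Thinc model that embeds each token from the listed attrs.
    embed = registry.resolve(cfg)["model"]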
@@ -10,6 +10,7 @@ from spacy.compat import pickle
 from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase
 from spacy.lang.en import English
 from spacy.ml import load_kb
+from spacy.ml.models.entity_linker import build_span_maker
 from spacy.pipeline import EntityLinker, TrainablePipe
 from spacy.pipeline.legacy import EntityLinker_v1
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
@@ -716,7 +717,11 @@ TRAIN_DATA = [
     ("Russ Cochran was a member of University of Kentucky's golf team.",
      {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
       "entities": [(0, 12, "PERSON"), (43, 51, "LOC")],
-      "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]})
+      "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}),
+    # having a blank instance shouldn't break things
+    ("The weather is nice today.",
+     {"links": {}, "entities": [],
+      "sent_starts": [1, -1, 0, 0, 0, 0]})
 ]
 GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
 # fmt: on
@@ -1260,3 +1265,18 @@ def test_save_activations():
     assert scores.data.shape == (2, 1)
     assert scores.data.dtype == "float32"
     assert scores.lengths.shape == (1,)
+
+
+def test_span_maker_forward_with_empty():
+    """The forward pass of the span maker may have a doc with no entities."""
+    nlp = English()
+    doc1 = nlp("a b c")
+    ent = doc1[0:1]
+    ent.label_ = "X"
+    doc1.ents = [ent]
+    # no entities
+    doc2 = nlp("x y z")
+
+    # just to get a model
+    span_maker = build_span_maker()
+    span_maker([doc1, doc2], False)
@@ -231,7 +231,7 @@ def test_tok2vec_listener_callback():


 def test_tok2vec_listener_overfitting():
-    """ Test that a pipeline with a listener properly overfits, even if 'tok2vec' is in the annotating components """
+    """Test that a pipeline with a listener properly overfits, even if 'tok2vec' is in the annotating components"""
     orig_config = Config().from_str(cfg_string)
     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
     train_examples = []
@@ -264,7 +264,7 @@ def test_tok2vec_listener_overfitting():


 def test_tok2vec_frozen_not_annotating():
-    """ Test that a pipeline with a frozen tok2vec raises an error when the tok2vec is not annotating """
+    """Test that a pipeline with a frozen tok2vec raises an error when the tok2vec is not annotating"""
     orig_config = Config().from_str(cfg_string)
     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
     train_examples = []
@@ -274,12 +274,16 @@ def test_tok2vec_frozen_not_annotating():

     for i in range(2):
         losses = {}
-        with pytest.raises(ValueError, match=r"the tok2vec embedding layer is not updated"):
-            nlp.update(train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"])
+        with pytest.raises(
+            ValueError, match=r"the tok2vec embedding layer is not updated"
+        ):
+            nlp.update(
+                train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"]
+            )


 def test_tok2vec_frozen_overfitting():
-    """ Test that a pipeline with a frozen & annotating tok2vec can still overfit """
+    """Test that a pipeline with a frozen & annotating tok2vec can still overfit"""
     orig_config = Config().from_str(cfg_string)
     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
     train_examples = []
@@ -289,7 +293,13 @@ def test_tok2vec_frozen_overfitting():

     for i in range(100):
         losses = {}
-        nlp.update(train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"], annotates=["tok2vec"])
+        nlp.update(
+            train_examples,
+            sgd=optimizer,
+            losses=losses,
+            exclude=["tok2vec"],
+            annotates=["tok2vec"],
+        )
         assert losses["tagger"] < 0.0001

         # test the trained model
@@ -23,7 +23,7 @@ def get_textcat_bow_kwargs():


 def get_textcat_cnn_kwargs():
-    return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13}
+    return {"tok2vec": make_test_tok2vec(), "exclusive_classes": False, "nO": 13}


 def get_all_params(model):
@@ -65,7 +65,7 @@ def get_tok2vec_kwargs():
     }


-def test_tok2vec():
+def make_test_tok2vec():
     return build_Tok2Vec_model(**get_tok2vec_kwargs())

@@ -1791,7 +1791,7 @@ the entity `Span` – for example `._.orgs` or `._.prev_orgs` and
 > [`Doc.retokenize`](/api/doc#retokenize) context manager:
 >
 > ```python
-> with doc.retokenize() as retokenize:
+> with doc.retokenize() as retokenizer:
 >     for ent in doc.ents:
 >         retokenizer.merge(ent)
 > ```
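The one-character fix above makes the `as` name match the `retokenizer` used inside the block. A self-contained sketch of the same pattern (the `en_core_web_sm` pipeline is an assumption and must be installed; any pipeline with an NER component works):

    import spacy

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("Apple is opening its first big office in San Francisco.")

    # Merge each entity span into a single token; the merges are applied when the block exits.
    with doc.retokenize() as retokenizer:
        for ent in doc.ents:
            retokenizer.merge(ent)

    print([token.text for token in doc])  # entity spans such as "San Francisco" are now single tokens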
@@ -4,12 +4,22 @@
         "code": "af",
         "name": "Afrikaans"
     },
+    {
+        "code": "am",
+        "name": "Amharic",
+        "has_examples": true
+    },
     {
         "code": "ar",
         "name": "Arabic",
         "example": "هذه جملة",
         "has_examples": true
     },
+    {
+        "code": "az",
+        "name": "Azerbaijani",
+        "has_examples": true
+    },
     {
         "code": "bg",
         "name": "Bulgarian",
@@ -65,7 +75,7 @@
     {
         "code": "dsb",
         "name": "Lower Sorbian",
-        "has_examples": true
+        "has_examples": true
     },
     {
         "code": "el",
@@ -142,6 +152,11 @@
         "code": "ga",
         "name": "Irish"
     },
+    {
+        "code": "grc",
+        "name": "Ancient Greek",
+        "has_examples": true
+    },
     {
         "code": "gu",
         "name": "Gujarati",
@@ -172,7 +187,7 @@
     {
         "code": "hsb",
         "name": "Upper Sorbian",
-        "has_examples": true
+        "has_examples": true
     },
     {
         "code": "hu",
@@ -260,6 +275,10 @@
         "example": "Адамга эң кыйыны — күн сайын адам болуу",
         "has_examples": true
     },
+    {
+        "code": "la",
+        "name": "Latin"
+    },
     {
         "code": "lb",
         "name": "Luxembourgish",
@@ -448,6 +467,11 @@
         "example": "นี่คือประโยค",
         "has_examples": true
     },
+    {
+        "code": "ti",
+        "name": "Tigrinya",
+        "has_examples": true
+    },
     {
         "code": "tl",
         "name": "Tagalog"
@@ -149,6 +149,9 @@
     & > span
         display: block

+    a
+        text-decoration: underline
+
     .small
         font-size: var(--font-size-code)
         line-height: 1.65
@@ -159,6 +159,9 @@ const QuickstartInstall = ({ id, title }) => {
                 setters={setters}
                 showDropdown={showDropdown}
             >
+                <QS os="mac" hardware="gpu" platform="arm">
+                    # Note M1 GPU support is experimental, see <a href="https://github.com/explosion/thinc/issues/792">Thinc issue #792</a>
+                </QS>
                 <QS package="pip" config="venv">
                     python -m venv .env
                 </QS>
@@ -198,7 +201,13 @@ const QuickstartInstall = ({ id, title }) => {
                     {nightly ? ' --pre' : ''}
                 </QS>
                 <QS package="conda">conda install -c conda-forge spacy</QS>
-                <QS package="conda" hardware="gpu">
+                <QS package="conda" hardware="gpu" os="windows">
+                    conda install -c conda-forge cupy
+                </QS>
+                <QS package="conda" hardware="gpu" os="linux">
                     conda install -c conda-forge cupy
                 </QS>
+                <QS package="conda" hardware="gpu" os="mac" platform="x86">
+                    conda install -c conda-forge cupy
+                </QS>
                 <QS package="conda" config="train">