Mirror of https://github.com/explosion/spaCy.git (synced 2025-07-16 11:12:25 +03:00)

Merge branch 'master' into feature/candidate-generation-by-docs

Commit: 77680421b4

.github/azure-steps.yml (vendored, 2 changed lines)

@@ -107,7 +107,7 @@ steps:
     displayName: "Run CPU tests"

   - script: |
-      python -m pip install --pre thinc-apple-ops
+      python -m pip install 'spacy[apple]'
       python -m pytest --pyargs spacy
     displayName: "Run CPU tests with thinc-apple-ops"
     condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))

.github/workflows/lock.yml (vendored, 8 changed lines)

@@ -15,11 +15,11 @@ jobs:
   action:
     runs-on: ubuntu-latest
     steps:
-      - uses: dessant/lock-threads@v3
+      - uses: dessant/lock-threads@v4
        with:
          process-only: 'issues'
          issue-inactive-days: '30'
-          issue-comment: >
-            This thread has been automatically locked since there
-            has not been any recent activity after it was closed.
+          issue-comment: >
+            This thread has been automatically locked since there
+            has not been any recent activity after it was closed.
            Please open a new issue for related bugs.

@@ -14,7 +14,7 @@ parsing, **named entity recognition**, **text classification** and more,
 multi-task learning with pretrained **transformers** like BERT, as well as a
 production-ready [**training system**](https://spacy.io/usage/training) and easy
 model packaging, deployment and workflow management. spaCy is commercial
-open-source software, released under the MIT license.
+open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).

 💫 **Version 3.4 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)

@@ -46,6 +46,7 @@ open-source software, released under the MIT license.
 | 🛠 **[Changelog]** | Changes and version history. |
 | 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
 | <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
+| <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** |

 [spacy 101]: https://spacy.io/usage/spacy-101
 [new in v3.0]: https://spacy.io/usage/v3

@@ -59,6 +60,7 @@ open-source software, released under the MIT license.
 [changelog]: https://spacy.io/usage#changelog
 [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md


 ## 💬 Where to ask questions

 The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).

@@ -41,7 +41,7 @@ jobs:
     matrix:
       # We're only running one platform per Python version to speed up builds
       Python36Linux:
-        imageName: "ubuntu-latest"
+        imageName: "ubuntu-20.04"
         python.version: "3.6"
       # Python36Windows:
       #   imageName: "windows-latest"

@@ -50,7 +50,7 @@ jobs:
       #   imageName: "macos-latest"
       #   python.version: "3.6"
       # Python37Linux:
-      #   imageName: "ubuntu-latest"
+      #   imageName: "ubuntu-20.04"
       #   python.version: "3.7"
       Python37Windows:
         imageName: "windows-latest"

@@ -6,7 +6,7 @@ preshed>=3.0.2,<3.1.0
 thinc>=8.1.0,<8.2.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
-wasabi>=0.9.1,<1.1.0
+wasabi>=0.9.1,<1.2.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
 typer>=0.3.0,<0.8.0

@@ -47,7 +47,7 @@ install_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     thinc>=8.1.0,<8.2.0
-    wasabi>=0.9.1,<1.1.0
+    wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
     # Third-party dependencies

@@ -158,15 +158,15 @@ def load_project_config(
         sys.exit(1)
     validate_project_version(config)
     validate_project_commands(config)
+    if interpolate:
+        err = f"{PROJECT_FILE} validation error"
+        with show_validation_error(title=err, hint_fill=False):
+            config = substitute_project_variables(config, overrides)
     # Make sure directories defined in config exist
     for subdir in config.get("directories", []):
         dir_path = path / subdir
         if not dir_path.exists():
             dir_path.mkdir(parents=True)
-    if interpolate:
-        err = f"{PROJECT_FILE} validation error"
-        with show_validation_error(title=err, hint_fill=False):
-            config = substitute_project_variables(config, overrides)
     return config
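
The practical effect of this reordering is that variable interpolation now runs before the directories listed under `directories` are created, so variable-based directory names resolve correctly. A minimal sketch of that behaviour, mirroring `test_issue11235` further down (it assumes `load_project_config` can be imported from `spacy.cli._util`; not part of the diff):

```python
from pathlib import Path
from tempfile import TemporaryDirectory

import srsly
from spacy.cli._util import load_project_config

project = {
    "vars": {"lang": "en"},
    "directories": ["cfg", "${vars.lang}_model"],
    "commands": [{"name": "x", "script": ["hello ${vars.lang}"]}],
}
with TemporaryDirectory() as tmp:
    d = Path(tmp)
    srsly.write_yaml(d / "project.yml", project)
    cfg = load_project_config(d)
    # Variables are substituted before the directories are created ...
    assert (d / "en_model").exists()
    # ... and the interpolated config is returned.
    assert cfg["commands"][0]["script"][0] == "hello en"
```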

@@ -101,8 +101,8 @@ def project_run(
             if not (project_dir / dep).exists():
                 err = f"Missing dependency specified by command '{subcommand}': {dep}"
                 err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
-                err_kwargs = {"exits": 1} if not dry else {}
-                msg.fail(err, err_help, **err_kwargs)
+                err_exits = 1 if not dry else None
+                msg.fail(err, err_help, exits=err_exits)
     check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
     with working_dir(project_dir) as current_dir:
         msg.divider(subcommand)

@@ -1,7 +1,7 @@
 {# This is a template for training configs used for the quickstart widget in
 the docs and the init config command. It encodes various best practices and
 can help generate the best possible configuration, given a user's requirements. #}
-{%- set use_transformer = hardware != "cpu" -%}
+{%- set use_transformer = hardware != "cpu" and transformer_data -%}
 {%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
 {%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%}
 [paths]

@@ -345,6 +345,11 @@ class Errors(metaclass=ErrorsWithCodes):
            "clear the existing vectors and resize the table.")
    E074 = ("Error interpreting compiled match pattern: patterns are expected "
            "to end with the attribute {attr}. Got: {bad_attr}.")
    E079 = ("Error computing states in beam: number of predicted beams "
            "({pbeams}) does not equal number of gold beams ({gbeams}).")
    E080 = ("Duplicate state found in beam: {key}.")
    E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
            "does not equal number of losses ({losses}).")
    E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
            "projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
            "match.")

@@ -328,9 +328,9 @@ class EditTreeLemmatizer(TrainablePipe):

             tree = dict(tree)
             if "orig" in tree:
-                tree["orig"] = self.vocab.strings[tree["orig"]]
+                tree["orig"] = self.vocab.strings.add(tree["orig"])
             if "orig" in tree:
-                tree["subst"] = self.vocab.strings[tree["subst"]]
+                tree["subst"] = self.vocab.strings.add(tree["subst"])

             trees.append(tree)
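
For context on why `add` matters here (an illustrative sketch, not part of the diff): subscripting a `StringStore` with a string only computes its hash, whereas `add` also stores the string, so the hash can be resolved back to text after the trees are deserialized.

```python
from spacy.strings import StringStore

strings = StringStore()
h1 = strings["ing"]      # computes the 64-bit hash but does not store "ing"
h2 = strings.add("ing")  # same hash, and the reverse lookup now works
assert h1 == h2
assert strings[h2] == "ing"
```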

@@ -272,7 +272,10 @@ class SpanCategorizer(TrainablePipe):
         DOCS: https://spacy.io/api/spancategorizer#predict
         """
         indices = self.suggester(docs, ops=self.model.ops)
-        scores = self.model.predict((docs, indices))  # type: ignore
+        if indices.lengths.sum() == 0:
+            scores = self.model.ops.alloc2f(0, 0)
+        else:
+            scores = self.model.predict((docs, indices))  # type: ignore
         return indices, scores

     def set_candidates(
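
For reference, the `Ragged` value a suggester returns carries one length per doc, so a batch with no suggested spans at all has an empty data array and all-zero lengths; the new guard skips the model call in exactly that case. A small sketch using thinc's public API (an illustration, not part of the diff):

```python
from thinc.api import get_current_ops
from thinc.types import Ragged

ops = get_current_ops()
# Two docs, zero suggested spans for each: empty span data, per-doc lengths of 0.
no_suggestions = Ragged(ops.xp.zeros((0, 0), dtype="i"), ops.asarray1i([0, 0]))
assert no_suggestions.lengths.sum() == 0  # the case the guard above handles
```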

@@ -123,14 +123,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):

     # head before start
     arr = doc.to_array(["HEAD"])
-    arr[0] = -1
+    arr[0] = numpy.int32(-1).astype(numpy.uint64)
     doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)

     # head after end
     arr = doc.to_array(["HEAD"])
-    arr[0] = 5
+    arr[0] = numpy.int32(5).astype(numpy.uint64)
     doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)
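
A note on the `numpy.int32(...).astype(numpy.uint64)` idiom that recurs throughout this merge (illustration only, not part of the diff): `Doc.to_array` hands back `uint64` attribute arrays, and recent numpy releases reject assigning a negative Python int into an unsigned array, so the out-of-bounds head offsets are built as signed 32-bit values and reinterpreted as unsigned instead.

```python
import numpy

arr = numpy.zeros((3,), dtype=numpy.uint64)
# arr[0] = -1 would be rejected by newer numpy for an unsigned dtype, so the
# value is created as a signed 32-bit int and reinterpreted bit-wise instead:
arr[0] = numpy.int32(-1).astype(numpy.uint64)
```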

@@ -60,10 +60,45 @@ def test_initialize_from_labels():
     nlp2 = Language()
     lemmatizer2 = nlp2.add_pipe("trainable_lemmatizer")
     lemmatizer2.initialize(
-        get_examples=lambda: train_examples,
+        # We want to check that the strings in replacement nodes are
+        # added to the string store. Avoid that they get added through
+        # the examples.
+        get_examples=lambda: train_examples[:1],
         labels=lemmatizer.label_data,
     )
     assert lemmatizer2.tree2label == {1: 0, 3: 1, 4: 2, 6: 3}
+    assert lemmatizer2.label_data == {
+        "trees": [
+            {"orig": "S", "subst": "s"},
+            {
+                "prefix_len": 1,
+                "suffix_len": 0,
+                "prefix_tree": 0,
+                "suffix_tree": 4294967295,
+            },
+            {"orig": "s", "subst": ""},
+            {
+                "prefix_len": 0,
+                "suffix_len": 1,
+                "prefix_tree": 4294967295,
+                "suffix_tree": 2,
+            },
+            {
+                "prefix_len": 0,
+                "suffix_len": 0,
+                "prefix_tree": 4294967295,
+                "suffix_tree": 4294967295,
+            },
+            {"orig": "E", "subst": "e"},
+            {
+                "prefix_len": 1,
+                "suffix_len": 0,
+                "prefix_tree": 5,
+                "suffix_tree": 4294967295,
+            },
+        ],
+        "labels": (1, 3, 4, 6),
+    }


 def test_no_data():
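
The repeated `4294967295` values in the expected label data appear to be the unsigned 32-bit sentinel for a missing subtree, i.e. the maximum `uint32` value (the bit pattern of -1); for reference (not part of the diff):

```python
import numpy

# 4294967295 is the largest value a uint32 can hold, used here as an "empty" marker.
assert numpy.iinfo(numpy.uint32).max == 4294967295
```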

@@ -372,24 +372,39 @@ def test_overfitting_IO_overlapping():


 def test_zero_suggestions():
-    # Test with a suggester that returns 0 suggestions
+    # Test with a suggester that can return 0 suggestions

-    @registry.misc("test_zero_suggester")
-    def make_zero_suggester():
-        def zero_suggester(docs, *, ops=None):
+    @registry.misc("test_mixed_zero_suggester")
+    def make_mixed_zero_suggester():
+        def mixed_zero_suggester(docs, *, ops=None):
             if ops is None:
                 ops = get_current_ops()
-            return Ragged(
-                ops.xp.zeros((0, 0), dtype="i"), ops.xp.zeros((len(docs),), dtype="i")
-            )
+            spans = []
+            lengths = []
+            for doc in docs:
+                if len(doc) > 0 and len(doc) % 2 == 0:
+                    spans.append((0, 1))
+                    lengths.append(1)
+                else:
+                    lengths.append(0)
+            spans = ops.asarray2i(spans)
+            lengths_array = ops.asarray1i(lengths)
+            if len(spans) > 0:
+                output = Ragged(ops.xp.vstack(spans), lengths_array)
+            else:
+                output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)
+            return output

-        return zero_suggester
+        return mixed_zero_suggester

     fix_random_seed(0)
     nlp = English()
     spancat = nlp.add_pipe(
         "spancat",
-        config={"suggester": {"@misc": "test_zero_suggester"}, "spans_key": SPAN_KEY},
+        config={
+            "suggester": {"@misc": "test_mixed_zero_suggester"},
+            "spans_key": SPAN_KEY,
+        },
     )
     train_examples = make_examples(nlp)
     optimizer = nlp.initialize(get_examples=lambda: train_examples)

@@ -397,6 +412,16 @@ def test_zero_suggestions():
     assert set(spancat.labels) == {"LOC", "PERSON"}

     nlp.update(train_examples, sgd=optimizer)
+    # empty doc
+    nlp("")
+    # single doc with zero suggestions
+    nlp("one")
+    # single doc with one suggestion
+    nlp("two two")
+    # batch with mixed zero/one suggestions
+    list(nlp.pipe(["one", "two two", "three three three", "", "four four four four"]))
+    # batch with no suggestions
+    list(nlp.pipe(["", "one", "three three three"]))


 def test_set_candidates():

@@ -123,6 +123,25 @@ def test_issue7055():
     assert "model" in filled_cfg["components"]["ner"]


+@pytest.mark.issue(11235)
+def test_issue11235():
+    """
+    Test that the cli handles interpolation in the directory names correctly when loading project config.
+    """
+    lang_var = "en"
+    variables = {"lang": lang_var}
+    commands = [{"name": "x", "script": ["hello ${vars.lang}"]}]
+    directories = ["cfg", "${vars.lang}_model"]
+    project = {"commands": commands, "vars": variables, "directories": directories}
+    with make_tempdir() as d:
+        srsly.write_yaml(d / "project.yml", project)
+        cfg = load_project_config(d)
+        # Check that the directories are interpolated and created correctly
+        assert os.path.exists(d / "cfg")
+        assert os.path.exists(d / f"{lang_var}_model")
+        assert cfg["commands"][0]["script"][0] == f"hello {lang_var}"


 def test_cli_info():
     nlp = Dutch()
     nlp.add_pipe("textcat")

@@ -359,6 +359,7 @@ cdef class Doc:
        for annot in annotations:
            if annot:
                if annot is heads or annot is sent_starts or annot is ent_iobs:
                    annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
                for i in range(len(words)):
                    if attrs.ndim == 1:
                        attrs[i] = annot[i]

@@ -1558,6 +1559,7 @@ cdef class Doc:

        for j, (attr, annot) in enumerate(token_annotations.items()):
            if attr is HEAD:
                annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
                for i in range(len(words)):
                    array[i, j] = annot[i]
            elif attr is MORPH:

@@ -299,7 +299,7 @@ cdef class Span:
            for ancestor in ancestors:
                ancestor_i = ancestor.i - self.c.start
                if ancestor_i in range(length):
-                    array[i, head_col] = ancestor_i - i
+                    array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)

            # if there is no appropriate ancestor, define a new artificial root
            value = array[i, head_col]

@@ -307,7 +307,7 @@ cdef class Span:
                new_root = old_to_new_root.get(ancestor_i, None)
                if new_root is not None:
                    # take the same artificial root as a previous token from the same sentence
-                    array[i, head_col] = new_root - i
+                    array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64)
                else:
                    # set this token as the new artificial root
                    array[i, head_col] = 0

@@ -443,26 +443,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
         if key not in IDS:
             raise ValueError(Errors.E974.format(obj="token", key=key))
         elif key in ["ORTH", "SPACY"]:
-            pass
+            continue
         elif key == "HEAD":
             attrs.append(key)
-            values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
+            row = [h-i if h is not None else 0 for i, h in enumerate(value)]
         elif key == "DEP":
             attrs.append(key)
-            values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
+            row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
         elif key == "SENT_START":
             attrs.append(key)
-            values.append([to_ternary_int(v) for v in value])
+            row = [to_ternary_int(v) for v in value]
         elif key == "MORPH":
             attrs.append(key)
-            values.append([vocab.morphology.add(v) for v in value])
+            row = [vocab.morphology.add(v) for v in value]
         else:
             attrs.append(key)
             if not all(isinstance(v, str) for v in value):
                 types = set([type(v) for v in value])
                 raise TypeError(Errors.E969.format(field=key, types=types)) from None
-            values.append([vocab.strings.add(v) for v in value])
-    array = numpy.asarray(values, dtype="uint64")
+            row = [vocab.strings.add(v) for v in value]
+        values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row])
+    array = numpy.array(values, dtype=numpy.uint64)
     return attrs, array.T

@@ -1643,7 +1643,9 @@ def _pipe(
     docs: Iterable["Doc"],
     proc: "PipeCallable",
     name: str,
-    default_error_handler: Callable[[str, "PipeCallable", List["Doc"], Exception], NoReturn],
+    default_error_handler: Callable[
+        [str, "PipeCallable", List["Doc"], Exception], NoReturn
+    ],
     kwargs: Mapping[str, Any],
 ) -> Iterator["Doc"]:
     if hasattr(proc, "pipe"):

@@ -1004,6 +1004,54 @@ This method was previously available as `spacy.gold.spans_from_biluo_tags`.
 | `tags` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ |
 | **RETURNS** | A sequence of `Span` objects with added entity labels. ~~List[Span]~~ |

+### training.biluo_to_iob {#biluo_to_iob tag="function"}
+
+Convert a sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags to
+[IOB](/usage/linguistic-features#accessing-ner) tags. This is useful if you want
+use the BILUO tags with a model that only supports IOB tags.
+
+> #### Example
+>
+> ```python
+> from spacy.training import biluo_to_iob
+>
+> tags = ["O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
+> iob_tags = biluo_to_iob(tags)
+> assert iob_tags == ["O", "O", "B-LOC", "I-LOC", "I-LOC", "O"]
+> ```
+
+| Name | Description |
+| ----------- | --------------------------------------------------------------------------------------- |
+| `tags` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags. ~~Iterable[str]~~ |
+| **RETURNS** | A list of [IOB](/usage/linguistic-features#accessing-ner) tags. ~~List[str]~~ |
+
+### training.iob_to_biluo {#iob_to_biluo tag="function"}
+
+Convert a sequence of [IOB](/usage/linguistic-features#accessing-ner) tags to
+[BILUO](/usage/linguistic-features#accessing-ner) tags. This is useful if you
+want use the IOB tags with a model that only supports BILUO tags.
+
+<Infobox title="Changed in v3.0" variant="warning" id="iob_to_biluo">
+
+This method was previously available as `spacy.gold.iob_to_biluo`.
+
+</Infobox>
+
+> #### Example
+>
+> ```python
+> from spacy.training import iob_to_biluo
+>
+> tags = ["O", "O", "B-LOC", "I-LOC", "O"]
+> biluo_tags = iob_to_biluo(tags)
+> assert biluo_tags == ["O", "O", "B-LOC", "L-LOC", "O"]
+> ```
+
+| Name | Description |
+| ----------- | ------------------------------------------------------------------------------------- |
+| `tags` | A sequence of [IOB](/usage/linguistic-features#accessing-ner) tags. ~~Iterable[str]~~ |
+| **RETURNS** | A list of [BILUO](/usage/linguistic-features#accessing-ner) tags. ~~List[str]~~ |

 ## Utility functions {#util source="spacy/util.py"}

 spaCy comes with a small collection of utility functions located in

@@ -308,14 +308,14 @@ Load state from a binary string.
 > assert type(PERSON) == int
 > ```

-| Name | Description |
-| ---------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `strings` | A table managing the string-to-int mapping. ~~StringStore~~ |
-| `vectors` | A table associating word IDs to word vectors. ~~Vectors~~ |
-| `vectors_length` | Number of dimensions for each word vector. ~~int~~ |
-| `lookups` | The available lookup tables in this vocab. ~~Lookups~~ |
-| `writing_system` | A dict with information about the language's writing system. ~~Dict[str, Any]~~ |
-| `get_noun_chunks` <Tag variant="new">3.0</Tag> | A function that yields base noun phrases used for [`Doc.noun_chunks`](/ap/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]~~ |
+| Name | Description |
+| ---------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `strings` | A table managing the string-to-int mapping. ~~StringStore~~ |
+| `vectors` | A table associating word IDs to word vectors. ~~Vectors~~ |
+| `vectors_length` | Number of dimensions for each word vector. ~~int~~ |
+| `lookups` | The available lookup tables in this vocab. ~~Lookups~~ |
+| `writing_system` | A dict with information about the language's writing system. ~~Dict[str, Any]~~ |
+| `get_noun_chunks` <Tag variant="new">3.0</Tag> | A function that yields base noun phrases used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]~~ |

 ## Serialization fields {#serialization-fields}

@@ -66,8 +66,8 @@ The English CNN pipelines have new word vectors:
 | Package | Model Version | TAG | Parser LAS | NER F |
 | ----------------------------------------------- | ------------- | ---: | ---------: | ----: |
 | [`en_core_web_md`](/models/en#en_core_web_md) | v3.3.0 | 97.3 | 90.1 | 84.6 |
-| [`en_core_web_md`](/models/en#en_core_web_lg) | v3.4.0 | 97.2 | 90.3 | 85.5 |
-| [`en_core_web_lg`](/models/en#en_core_web_md) | v3.3.0 | 97.4 | 90.1 | 85.3 |
+| [`en_core_web_md`](/models/en#en_core_web_md) | v3.4.0 | 97.2 | 90.3 | 85.5 |
+| [`en_core_web_lg`](/models/en#en_core_web_lg) | v3.3.0 | 97.4 | 90.1 | 85.3 |
 | [`en_core_web_lg`](/models/en#en_core_web_lg) | v3.4.0 | 97.3 | 90.2 | 85.6 |

 ## Notes about upgrading from v3.3 {#upgrading}

@@ -45,7 +45,7 @@
       { "text": "v2.x Documentation", "url": "https://v2.spacy.io" },
       {
         "text": "Custom Solutions",
-        "url": "https://explosion.ai/spacy-tailored-pipelines"
+        "url": "https://explosion.ai/custom-solutions"
       }
     ]
   }

@@ -51,7 +51,7 @@
       { "text": "Online Course", "url": "https://course.spacy.io" },
       {
         "text": "Custom Solutions",
-        "url": "https://explosion.ai/spacy-tailored-pipelines"
+        "url": "https://explosion.ai/custom-solutions"
      }
    ]
  },

@@ -1023,25 +1023,6 @@
         },
         "category": ["pipeline"]
     },
-    {
-        "id": "spacy-sentence-segmenter",
-        "title": "Sentence Segmenter",
-        "slogan": "Custom sentence segmentation for spaCy",
-        "code_example": [
-            "from seg.newline.segmenter import NewLineSegmenter",
-            "import spacy",
-            "",
-            "nlseg = NewLineSegmenter()",
-            "nlp = spacy.load('en')",
-            "nlp.add_pipe(nlseg.set_sent_starts, name='sentence_segmenter', before='parser')",
-            "doc = nlp(my_doc_text)"
-        ],
-        "author": "tc64",
-        "author_links": {
-            "github": "tc64"
-        },
-        "category": ["pipeline"]
-    },
     {
         "id": "spacy_cld",
         "title": "spaCy-CLD",

@@ -1468,13 +1449,26 @@
         "image": "https://jasonkessler.github.io/2012conventions0.0.2.2.png",
         "code_example": [
             "import spacy",
-            "import scattertext as st",
-            "",
-            "nlp = spacy.load('en')",
-            "corpus = st.CorpusFromPandas(convention_df,",
-            " category_col='party',",
-            " text_col='text',",
-            " nlp=nlp).build()"
+            "from scattertext import SampleCorpora, produce_scattertext_explorer",
+            "from scattertext import produce_scattertext_html",
+            "from scattertext.CorpusFromPandas import CorpusFromPandas",
+            "",
+            "nlp = spacy.load('en_core_web_sm')",
+            "convention_df = SampleCorpora.ConventionData2012.get_data()",
+            "corpus = CorpusFromPandas(convention_df,",
+            " category_col='party',",
+            " text_col='text',",
+            " nlp=nlp).build()",
+            "",
+            "html = produce_scattertext_html(corpus,",
+            " category='democrat',",
+            " category_name='Democratic',",
+            " not_category_name='Republican',",
+            " minimum_term_frequency=5,",
+            " width_in_pixels=1000)",
+            "open('./simple.html', 'wb').write(html.encode('utf-8'))",
+            "print('Open ./simple.html in Chrome or Firefox.')"
         ],
         "author": "Jason Kessler",
         "author_links": {

@@ -105,13 +105,13 @@ const Landing = ({ data }) => {

         <LandingBannerGrid>
           <LandingBanner
-            to="https://explosion.ai/spacy-tailored-pipelines"
+            to="https://explosion.ai/custom-solutions"
             button="Learn more"
             background="#E4F4F9"
             color="#1e1935"
             small
           >
-            <Link to="https://explosion.ai/spacy-tailored-pipelines" hidden>
+            <Link to="https://explosion.ai/custom-solutions" hidden>
               <img src={tailoredPipelinesImage} alt="spaCy Tailored Pipelines" />
             </Link>
             <strong>