Merge branch 'master' into feature/candidate-generation-by-docs

This commit is contained in:
Raphael Mitsch 2022-12-12 13:46:07 +01:00
commit 77680421b4
27 changed files with 217 additions and 81 deletions

View File

@ -107,7 +107,7 @@ steps:
displayName: "Run CPU tests" displayName: "Run CPU tests"
- script: | - script: |
python -m pip install --pre thinc-apple-ops python -m pip install 'spacy[apple]'
python -m pytest --pyargs spacy python -m pytest --pyargs spacy
displayName: "Run CPU tests with thinc-apple-ops" displayName: "Run CPU tests with thinc-apple-ops"
condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11')) condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))

View File

@ -15,11 +15,11 @@ jobs:
action: action:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: dessant/lock-threads@v3 - uses: dessant/lock-threads@v4
with: with:
process-only: 'issues' process-only: 'issues'
issue-inactive-days: '30' issue-inactive-days: '30'
issue-comment: > issue-comment: >
This thread has been automatically locked since there This thread has been automatically locked since there
has not been any recent activity after it was closed. has not been any recent activity after it was closed.
Please open a new issue for related bugs. Please open a new issue for related bugs.

View File

@ -14,7 +14,7 @@ parsing, **named entity recognition**, **text classification** and more,
multi-task learning with pretrained **transformers** like BERT, as well as a multi-task learning with pretrained **transformers** like BERT, as well as a
production-ready [**training system**](https://spacy.io/usage/training) and easy production-ready [**training system**](https://spacy.io/usage/training) and easy
model packaging, deployment and workflow management. spaCy is commercial model packaging, deployment and workflow management. spaCy is commercial
open-source software, released under the MIT license. open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
💫 **Version 3.4 out now!** 💫 **Version 3.4 out now!**
[Check out the release notes here.](https://github.com/explosion/spaCy/releases) [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
@ -46,6 +46,7 @@ open-source software, released under the MIT license.
| 🛠 **[Changelog]** | Changes and version history. | | 🛠 **[Changelog]** | Changes and version history. |
| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. | | 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
| <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/spacy-tailored-pipelines)** | | <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/spacy-tailored-pipelines)** |
| <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Analysis"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/spacy-tailored-analysis)** |
[spacy 101]: https://spacy.io/usage/spacy-101 [spacy 101]: https://spacy.io/usage/spacy-101
[new in v3.0]: https://spacy.io/usage/v3 [new in v3.0]: https://spacy.io/usage/v3
@ -59,6 +60,7 @@ open-source software, released under the MIT license.
[changelog]: https://spacy.io/usage#changelog [changelog]: https://spacy.io/usage#changelog
[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
## 💬 Where to ask questions ## 💬 Where to ask questions
The spaCy project is maintained by the [spaCy team](https://explosion.ai/about). The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).

View File

@ -41,7 +41,7 @@ jobs:
matrix: matrix:
# We're only running one platform per Python version to speed up builds # We're only running one platform per Python version to speed up builds
Python36Linux: Python36Linux:
imageName: "ubuntu-latest" imageName: "ubuntu-20.04"
python.version: "3.6" python.version: "3.6"
# Python36Windows: # Python36Windows:
# imageName: "windows-latest" # imageName: "windows-latest"
@ -50,7 +50,7 @@ jobs:
# imageName: "macos-latest" # imageName: "macos-latest"
# python.version: "3.6" # python.version: "3.6"
# Python37Linux: # Python37Linux:
# imageName: "ubuntu-latest" # imageName: "ubuntu-20.04"
# python.version: "3.7" # python.version: "3.7"
Python37Windows: Python37Windows:
imageName: "windows-latest" imageName: "windows-latest"

View File

@ -6,7 +6,7 @@ preshed>=3.0.2,<3.1.0
thinc>=8.1.0,<8.2.0 thinc>=8.1.0,<8.2.0
ml_datasets>=0.2.0,<0.3.0 ml_datasets>=0.2.0,<0.3.0
murmurhash>=0.28.0,<1.1.0 murmurhash>=0.28.0,<1.1.0
wasabi>=0.9.1,<1.1.0 wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0 srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0 catalogue>=2.0.6,<2.1.0
typer>=0.3.0,<0.8.0 typer>=0.3.0,<0.8.0

View File

@ -47,7 +47,7 @@ install_requires =
cymem>=2.0.2,<2.1.0 cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0 preshed>=3.0.2,<3.1.0
thinc>=8.1.0,<8.2.0 thinc>=8.1.0,<8.2.0
wasabi>=0.9.1,<1.1.0 wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0 srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0 catalogue>=2.0.6,<2.1.0
# Third-party dependencies # Third-party dependencies

View File

@ -158,15 +158,15 @@ def load_project_config(
sys.exit(1) sys.exit(1)
validate_project_version(config) validate_project_version(config)
validate_project_commands(config) validate_project_commands(config)
if interpolate:
err = f"{PROJECT_FILE} validation error"
with show_validation_error(title=err, hint_fill=False):
config = substitute_project_variables(config, overrides)
# Make sure directories defined in config exist # Make sure directories defined in config exist
for subdir in config.get("directories", []): for subdir in config.get("directories", []):
dir_path = path / subdir dir_path = path / subdir
if not dir_path.exists(): if not dir_path.exists():
dir_path.mkdir(parents=True) dir_path.mkdir(parents=True)
if interpolate:
err = f"{PROJECT_FILE} validation error"
with show_validation_error(title=err, hint_fill=False):
config = substitute_project_variables(config, overrides)
return config return config

View File

@ -101,8 +101,8 @@ def project_run(
if not (project_dir / dep).exists(): if not (project_dir / dep).exists():
err = f"Missing dependency specified by command '{subcommand}': {dep}" err = f"Missing dependency specified by command '{subcommand}': {dep}"
err_help = "Maybe you forgot to run the 'project assets' command or a previous step?" err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
err_kwargs = {"exits": 1} if not dry else {} err_exits = 1 if not dry else None
msg.fail(err, err_help, **err_kwargs) msg.fail(err, err_help, exits=err_exits)
check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION) check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
with working_dir(project_dir) as current_dir: with working_dir(project_dir) as current_dir:
msg.divider(subcommand) msg.divider(subcommand)

View File

@ -1,7 +1,7 @@
{# This is a template for training configs used for the quickstart widget in {# This is a template for training configs used for the quickstart widget in
the docs and the init config command. It encodes various best practices and the docs and the init config command. It encodes various best practices and
can help generate the best possible configuration, given a user's requirements. #} can help generate the best possible configuration, given a user's requirements. #}
{%- set use_transformer = hardware != "cpu" -%} {%- set use_transformer = hardware != "cpu" and transformer_data -%}
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%} {%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%} {%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%}
[paths] [paths]

View File

@ -345,6 +345,11 @@ class Errors(metaclass=ErrorsWithCodes):
"clear the existing vectors and resize the table.") "clear the existing vectors and resize the table.")
E074 = ("Error interpreting compiled match pattern: patterns are expected " E074 = ("Error interpreting compiled match pattern: patterns are expected "
"to end with the attribute {attr}. Got: {bad_attr}.") "to end with the attribute {attr}. Got: {bad_attr}.")
E079 = ("Error computing states in beam: number of predicted beams "
"({pbeams}) does not equal number of gold beams ({gbeams}).")
E080 = ("Duplicate state found in beam: {key}.")
E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
"does not equal number of losses ({losses}).")
E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), " E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
"projective heads ({n_proj_heads}) and labels ({n_labels}) do not " "projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
"match.") "match.")

View File

@ -328,9 +328,9 @@ class EditTreeLemmatizer(TrainablePipe):
tree = dict(tree) tree = dict(tree)
if "orig" in tree: if "orig" in tree:
tree["orig"] = self.vocab.strings[tree["orig"]] tree["orig"] = self.vocab.strings.add(tree["orig"])
if "orig" in tree: if "orig" in tree:
tree["subst"] = self.vocab.strings[tree["subst"]] tree["subst"] = self.vocab.strings.add(tree["subst"])
trees.append(tree) trees.append(tree)

View File

@ -272,7 +272,10 @@ class SpanCategorizer(TrainablePipe):
DOCS: https://spacy.io/api/spancategorizer#predict DOCS: https://spacy.io/api/spancategorizer#predict
""" """
indices = self.suggester(docs, ops=self.model.ops) indices = self.suggester(docs, ops=self.model.ops)
scores = self.model.predict((docs, indices)) # type: ignore if indices.lengths.sum() == 0:
scores = self.model.ops.alloc2f(0, 0)
else:
scores = self.model.predict((docs, indices)) # type: ignore
return indices, scores return indices, scores
def set_candidates( def set_candidates(

View File

@ -123,14 +123,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):
# head before start # head before start
arr = doc.to_array(["HEAD"]) arr = doc.to_array(["HEAD"])
arr[0] = -1 arr[0] = numpy.int32(-1).astype(numpy.uint64)
doc_from_array = Doc(en_vocab, words=words) doc_from_array = Doc(en_vocab, words=words)
with pytest.raises(ValueError): with pytest.raises(ValueError):
doc_from_array.from_array(["HEAD"], arr) doc_from_array.from_array(["HEAD"], arr)
# head after end # head after end
arr = doc.to_array(["HEAD"]) arr = doc.to_array(["HEAD"])
arr[0] = 5 arr[0] = numpy.int32(5).astype(numpy.uint64)
doc_from_array = Doc(en_vocab, words=words) doc_from_array = Doc(en_vocab, words=words)
with pytest.raises(ValueError): with pytest.raises(ValueError):
doc_from_array.from_array(["HEAD"], arr) doc_from_array.from_array(["HEAD"], arr)

View File

@ -60,10 +60,45 @@ def test_initialize_from_labels():
nlp2 = Language() nlp2 = Language()
lemmatizer2 = nlp2.add_pipe("trainable_lemmatizer") lemmatizer2 = nlp2.add_pipe("trainable_lemmatizer")
lemmatizer2.initialize( lemmatizer2.initialize(
get_examples=lambda: train_examples, # We want to check that the strings in replacement nodes are
# added to the string store. Avoid that they get added through
# the examples.
get_examples=lambda: train_examples[:1],
labels=lemmatizer.label_data, labels=lemmatizer.label_data,
) )
assert lemmatizer2.tree2label == {1: 0, 3: 1, 4: 2, 6: 3} assert lemmatizer2.tree2label == {1: 0, 3: 1, 4: 2, 6: 3}
assert lemmatizer2.label_data == {
"trees": [
{"orig": "S", "subst": "s"},
{
"prefix_len": 1,
"suffix_len": 0,
"prefix_tree": 0,
"suffix_tree": 4294967295,
},
{"orig": "s", "subst": ""},
{
"prefix_len": 0,
"suffix_len": 1,
"prefix_tree": 4294967295,
"suffix_tree": 2,
},
{
"prefix_len": 0,
"suffix_len": 0,
"prefix_tree": 4294967295,
"suffix_tree": 4294967295,
},
{"orig": "E", "subst": "e"},
{
"prefix_len": 1,
"suffix_len": 0,
"prefix_tree": 5,
"suffix_tree": 4294967295,
},
],
"labels": (1, 3, 4, 6),
}
def test_no_data(): def test_no_data():

View File

@ -372,24 +372,39 @@ def test_overfitting_IO_overlapping():
def test_zero_suggestions(): def test_zero_suggestions():
# Test with a suggester that returns 0 suggestions # Test with a suggester that can return 0 suggestions
@registry.misc("test_zero_suggester") @registry.misc("test_mixed_zero_suggester")
def make_zero_suggester(): def make_mixed_zero_suggester():
def zero_suggester(docs, *, ops=None): def mixed_zero_suggester(docs, *, ops=None):
if ops is None: if ops is None:
ops = get_current_ops() ops = get_current_ops()
return Ragged( spans = []
ops.xp.zeros((0, 0), dtype="i"), ops.xp.zeros((len(docs),), dtype="i") lengths = []
) for doc in docs:
if len(doc) > 0 and len(doc) % 2 == 0:
spans.append((0, 1))
lengths.append(1)
else:
lengths.append(0)
spans = ops.asarray2i(spans)
lengths_array = ops.asarray1i(lengths)
if len(spans) > 0:
output = Ragged(ops.xp.vstack(spans), lengths_array)
else:
output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)
return output
return zero_suggester return mixed_zero_suggester
fix_random_seed(0) fix_random_seed(0)
nlp = English() nlp = English()
spancat = nlp.add_pipe( spancat = nlp.add_pipe(
"spancat", "spancat",
config={"suggester": {"@misc": "test_zero_suggester"}, "spans_key": SPAN_KEY}, config={
"suggester": {"@misc": "test_mixed_zero_suggester"},
"spans_key": SPAN_KEY,
},
) )
train_examples = make_examples(nlp) train_examples = make_examples(nlp)
optimizer = nlp.initialize(get_examples=lambda: train_examples) optimizer = nlp.initialize(get_examples=lambda: train_examples)
@ -397,6 +412,16 @@ def test_zero_suggestions():
assert set(spancat.labels) == {"LOC", "PERSON"} assert set(spancat.labels) == {"LOC", "PERSON"}
nlp.update(train_examples, sgd=optimizer) nlp.update(train_examples, sgd=optimizer)
# empty doc
nlp("")
# single doc with zero suggestions
nlp("one")
# single doc with one suggestion
nlp("two two")
# batch with mixed zero/one suggestions
list(nlp.pipe(["one", "two two", "three three three", "", "four four four four"]))
# batch with no suggestions
list(nlp.pipe(["", "one", "three three three"]))
def test_set_candidates(): def test_set_candidates():

View File

@ -123,6 +123,25 @@ def test_issue7055():
assert "model" in filled_cfg["components"]["ner"] assert "model" in filled_cfg["components"]["ner"]
@pytest.mark.issue(11235)
def test_issue11235():
    """
    Regression test for issue 11235: loading a project config must handle
    variable interpolation inside the ``directories`` entries, so that the
    directories are created under their interpolated names.
    """
    lang_var = "en"
    project = {
        "commands": [{"name": "x", "script": ["hello ${vars.lang}"]}],
        "vars": {"lang": lang_var},
        "directories": ["cfg", "${vars.lang}_model"],
    }
    with make_tempdir() as tmp_dir:
        srsly.write_yaml(tmp_dir / "project.yml", project)
        loaded = load_project_config(tmp_dir)
        # Both the literal and the interpolated directory name must exist.
        for expected_dir in ("cfg", f"{lang_var}_model"):
            assert os.path.exists(tmp_dir / expected_dir)
        # The command script must have its variable substituted as well.
        assert loaded["commands"][0]["script"][0] == f"hello {lang_var}"
def test_cli_info(): def test_cli_info():
nlp = Dutch() nlp = Dutch()
nlp.add_pipe("textcat") nlp.add_pipe("textcat")

View File

@ -359,6 +359,7 @@ cdef class Doc:
for annot in annotations: for annot in annotations:
if annot: if annot:
if annot is heads or annot is sent_starts or annot is ent_iobs: if annot is heads or annot is sent_starts or annot is ent_iobs:
annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
for i in range(len(words)): for i in range(len(words)):
if attrs.ndim == 1: if attrs.ndim == 1:
attrs[i] = annot[i] attrs[i] = annot[i]
@ -1558,6 +1559,7 @@ cdef class Doc:
for j, (attr, annot) in enumerate(token_annotations.items()): for j, (attr, annot) in enumerate(token_annotations.items()):
if attr is HEAD: if attr is HEAD:
annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
for i in range(len(words)): for i in range(len(words)):
array[i, j] = annot[i] array[i, j] = annot[i]
elif attr is MORPH: elif attr is MORPH:

View File

@ -299,7 +299,7 @@ cdef class Span:
for ancestor in ancestors: for ancestor in ancestors:
ancestor_i = ancestor.i - self.c.start ancestor_i = ancestor.i - self.c.start
if ancestor_i in range(length): if ancestor_i in range(length):
array[i, head_col] = ancestor_i - i array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)
# if there is no appropriate ancestor, define a new artificial root # if there is no appropriate ancestor, define a new artificial root
value = array[i, head_col] value = array[i, head_col]
@ -307,7 +307,7 @@ cdef class Span:
new_root = old_to_new_root.get(ancestor_i, None) new_root = old_to_new_root.get(ancestor_i, None)
if new_root is not None: if new_root is not None:
# take the same artificial root as a previous token from the same sentence # take the same artificial root as a previous token from the same sentence
array[i, head_col] = new_root - i array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64)
else: else:
# set this token as the new artificial root # set this token as the new artificial root
array[i, head_col] = 0 array[i, head_col] = 0

View File

@ -443,26 +443,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
if key not in IDS: if key not in IDS:
raise ValueError(Errors.E974.format(obj="token", key=key)) raise ValueError(Errors.E974.format(obj="token", key=key))
elif key in ["ORTH", "SPACY"]: elif key in ["ORTH", "SPACY"]:
pass continue
elif key == "HEAD": elif key == "HEAD":
attrs.append(key) attrs.append(key)
values.append([h-i if h is not None else 0 for i, h in enumerate(value)]) row = [h-i if h is not None else 0 for i, h in enumerate(value)]
elif key == "DEP": elif key == "DEP":
attrs.append(key) attrs.append(key)
values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]) row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
elif key == "SENT_START": elif key == "SENT_START":
attrs.append(key) attrs.append(key)
values.append([to_ternary_int(v) for v in value]) row = [to_ternary_int(v) for v in value]
elif key == "MORPH": elif key == "MORPH":
attrs.append(key) attrs.append(key)
values.append([vocab.morphology.add(v) for v in value]) row = [vocab.morphology.add(v) for v in value]
else: else:
attrs.append(key) attrs.append(key)
if not all(isinstance(v, str) for v in value): if not all(isinstance(v, str) for v in value):
types = set([type(v) for v in value]) types = set([type(v) for v in value])
raise TypeError(Errors.E969.format(field=key, types=types)) from None raise TypeError(Errors.E969.format(field=key, types=types)) from None
values.append([vocab.strings.add(v) for v in value]) row = [vocab.strings.add(v) for v in value]
array = numpy.asarray(values, dtype="uint64") values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row])
array = numpy.array(values, dtype=numpy.uint64)
return attrs, array.T return attrs, array.T

View File

@ -1643,7 +1643,9 @@ def _pipe(
docs: Iterable["Doc"], docs: Iterable["Doc"],
proc: "PipeCallable", proc: "PipeCallable",
name: str, name: str,
default_error_handler: Callable[[str, "PipeCallable", List["Doc"], Exception], NoReturn], default_error_handler: Callable[
[str, "PipeCallable", List["Doc"], Exception], NoReturn
],
kwargs: Mapping[str, Any], kwargs: Mapping[str, Any],
) -> Iterator["Doc"]: ) -> Iterator["Doc"]:
if hasattr(proc, "pipe"): if hasattr(proc, "pipe"):

View File

@ -1004,6 +1004,54 @@ This method was previously available as `spacy.gold.spans_from_biluo_tags`.
| `tags` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ | | `tags` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ |
| **RETURNS** | A sequence of `Span` objects with added entity labels. ~~List[Span]~~ | | **RETURNS** | A sequence of `Span` objects with added entity labels. ~~List[Span]~~ |
### training.biluo_to_iob {#biluo_to_iob tag="function"}
Convert a sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags to
[IOB](/usage/linguistic-features#accessing-ner) tags. This is useful if you want
to use the BILUO tags with a model that only supports IOB tags.
> #### Example
>
> ```python
> from spacy.training import biluo_to_iob
>
> tags = ["O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
> iob_tags = biluo_to_iob(tags)
> assert iob_tags == ["O", "O", "B-LOC", "I-LOC", "I-LOC", "O"]
> ```
| Name | Description |
| ----------- | --------------------------------------------------------------------------------------- |
| `tags` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags. ~~Iterable[str]~~ |
| **RETURNS** | A list of [IOB](/usage/linguistic-features#accessing-ner) tags. ~~List[str]~~ |
### training.iob_to_biluo {#iob_to_biluo tag="function"}
Convert a sequence of [IOB](/usage/linguistic-features#accessing-ner) tags to
[BILUO](/usage/linguistic-features#accessing-ner) tags. This is useful if you
want to use the IOB tags with a model that only supports BILUO tags.
<Infobox title="Changed in v3.0" variant="warning" id="iob_to_biluo">
This method was previously available as `spacy.gold.iob_to_biluo`.
</Infobox>
> #### Example
>
> ```python
> from spacy.training import iob_to_biluo
>
> tags = ["O", "O", "B-LOC", "I-LOC", "O"]
> biluo_tags = iob_to_biluo(tags)
> assert biluo_tags == ["O", "O", "B-LOC", "L-LOC", "O"]
> ```
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------- |
| `tags` | A sequence of [IOB](/usage/linguistic-features#accessing-ner) tags. ~~Iterable[str]~~ |
| **RETURNS** | A list of [BILUO](/usage/linguistic-features#accessing-ner) tags. ~~List[str]~~ |
## Utility functions {#util source="spacy/util.py"} ## Utility functions {#util source="spacy/util.py"}
spaCy comes with a small collection of utility functions located in spaCy comes with a small collection of utility functions located in

View File

@ -308,14 +308,14 @@ Load state from a binary string.
> assert type(PERSON) == int > assert type(PERSON) == int
> ``` > ```
| Name | Description | | Name | Description |
| ---------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ---------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `strings` | A table managing the string-to-int mapping. ~~StringStore~~ | | `strings` | A table managing the string-to-int mapping. ~~StringStore~~ |
| `vectors` | A table associating word IDs to word vectors. ~~Vectors~~ | | `vectors` | A table associating word IDs to word vectors. ~~Vectors~~ |
| `vectors_length` | Number of dimensions for each word vector. ~~int~~ | | `vectors_length` | Number of dimensions for each word vector. ~~int~~ |
| `lookups` | The available lookup tables in this vocab. ~~Lookups~~ | | `lookups` | The available lookup tables in this vocab. ~~Lookups~~ |
| `writing_system` | A dict with information about the language's writing system. ~~Dict[str, Any]~~ | | `writing_system` | A dict with information about the language's writing system. ~~Dict[str, Any]~~ |
| `get_noun_chunks` <Tag variant="new">3.0</Tag> | A function that yields base noun phrases used for [`Doc.noun_chunks`](/ap/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]~~ | | `get_noun_chunks` <Tag variant="new">3.0</Tag> | A function that yields base noun phrases used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]~~ |
## Serialization fields {#serialization-fields} ## Serialization fields {#serialization-fields}

View File

@ -66,8 +66,8 @@ The English CNN pipelines have new word vectors:
| Package | Model Version | TAG | Parser LAS | NER F | | Package | Model Version | TAG | Parser LAS | NER F |
| ----------------------------------------------- | ------------- | ---: | ---------: | ----: | | ----------------------------------------------- | ------------- | ---: | ---------: | ----: |
| [`en_core_web_md`](/models/en#en_core_web_md) | v3.3.0 | 97.3 | 90.1 | 84.6 | | [`en_core_web_md`](/models/en#en_core_web_md) | v3.3.0 | 97.3 | 90.1 | 84.6 |
| [`en_core_web_md`](/models/en#en_core_web_lg) | v3.4.0 | 97.2 | 90.3 | 85.5 | | [`en_core_web_md`](/models/en#en_core_web_md) | v3.4.0 | 97.2 | 90.3 | 85.5 |
| [`en_core_web_lg`](/models/en#en_core_web_md) | v3.3.0 | 97.4 | 90.1 | 85.3 | | [`en_core_web_lg`](/models/en#en_core_web_lg) | v3.3.0 | 97.4 | 90.1 | 85.3 |
| [`en_core_web_lg`](/models/en#en_core_web_lg) | v3.4.0 | 97.3 | 90.2 | 85.6 | | [`en_core_web_lg`](/models/en#en_core_web_lg) | v3.4.0 | 97.3 | 90.2 | 85.6 |
## Notes about upgrading from v3.3 {#upgrading} ## Notes about upgrading from v3.3 {#upgrading}

View File

@ -45,7 +45,7 @@
{ "text": "v2.x Documentation", "url": "https://v2.spacy.io" }, { "text": "v2.x Documentation", "url": "https://v2.spacy.io" },
{ {
"text": "Custom Solutions", "text": "Custom Solutions",
"url": "https://explosion.ai/spacy-tailored-pipelines" "url": "https://explosion.ai/custom-solutions"
} }
] ]
} }

View File

@ -51,7 +51,7 @@
{ "text": "Online Course", "url": "https://course.spacy.io" }, { "text": "Online Course", "url": "https://course.spacy.io" },
{ {
"text": "Custom Solutions", "text": "Custom Solutions",
"url": "https://explosion.ai/spacy-tailored-pipelines" "url": "https://explosion.ai/custom-solutions"
} }
] ]
}, },

View File

@ -1023,25 +1023,6 @@
}, },
"category": ["pipeline"] "category": ["pipeline"]
}, },
{
"id": "spacy-sentence-segmenter",
"title": "Sentence Segmenter",
"slogan": "Custom sentence segmentation for spaCy",
"code_example": [
"from seg.newline.segmenter import NewLineSegmenter",
"import spacy",
"",
"nlseg = NewLineSegmenter()",
"nlp = spacy.load('en')",
"nlp.add_pipe(nlseg.set_sent_starts, name='sentence_segmenter', before='parser')",
"doc = nlp(my_doc_text)"
],
"author": "tc64",
"author_links": {
"github": "tc64"
},
"category": ["pipeline"]
},
{ {
"id": "spacy_cld", "id": "spacy_cld",
"title": "spaCy-CLD", "title": "spaCy-CLD",
@ -1468,13 +1449,26 @@
"image": "https://jasonkessler.github.io/2012conventions0.0.2.2.png", "image": "https://jasonkessler.github.io/2012conventions0.0.2.2.png",
"code_example": [ "code_example": [
"import spacy", "import spacy",
"import scattertext as st",
"", "",
"nlp = spacy.load('en')", "from scattertext import SampleCorpora, produce_scattertext_explorer",
"corpus = st.CorpusFromPandas(convention_df,", "from scattertext import produce_scattertext_html",
" category_col='party',", "from scattertext.CorpusFromPandas import CorpusFromPandas",
" text_col='text',", "",
" nlp=nlp).build()" "nlp = spacy.load('en_core_web_sm')",
"convention_df = SampleCorpora.ConventionData2012.get_data()",
"corpus = CorpusFromPandas(convention_df,",
" category_col='party',",
" text_col='text',",
" nlp=nlp).build()",
"",
"html = produce_scattertext_html(corpus,",
" category='democrat',",
" category_name='Democratic',",
" not_category_name='Republican',",
" minimum_term_frequency=5,",
" width_in_pixels=1000)",
"open('./simple.html', 'wb').write(html.encode('utf-8'))",
"print('Open ./simple.html in Chrome or Firefox.')"
], ],
"author": "Jason Kessler", "author": "Jason Kessler",
"author_links": { "author_links": {

View File

@ -105,13 +105,13 @@ const Landing = ({ data }) => {
<LandingBannerGrid> <LandingBannerGrid>
<LandingBanner <LandingBanner
to="https://explosion.ai/spacy-tailored-pipelines" to="https://explosion.ai/custom-solutions"
button="Learn more" button="Learn more"
background="#E4F4F9" background="#E4F4F9"
color="#1e1935" color="#1e1935"
small small
> >
<Link to="https://explosion.ai/spacy-tailored-pipelines" hidden> <Link to="https://explosion.ai/custom-solutions" hidden>
<img src={tailoredPipelinesImage} alt="spaCy Tailored Pipelines" /> <img src={tailoredPipelinesImage} alt="spaCy Tailored Pipelines" />
</Link> </Link>
<strong> <strong>