diff --git a/.github/workflows/gputests.yml b/.github/workflows/gputests.yml
new file mode 100644
index 000000000..14c1552bf
--- /dev/null
+++ b/.github/workflows/gputests.yml
@@ -0,0 +1,21 @@
+name: Weekly GPU tests
+
+on:
+  schedule:
+    - cron: '0 1 * * MON'
+
+jobs:
+  weekly-gputests:
+    strategy:
+      fail-fast: false
+      matrix:
+        branch: [master, develop, v4]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Trigger buildkite build
+        uses: buildkite/trigger-pipeline-action@v1.2.0
+        env:
+          PIPELINE: explosion-ai/spacy-slow-gpu-tests
+          BRANCH: ${{ matrix.branch }}
+          MESSAGE: ":github: Weekly GPU + slow tests - triggered from a GitHub Action"
+          BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_SECRET }}
diff --git a/.github/workflows/slowtests.yml b/.github/workflows/slowtests.yml
new file mode 100644
index 000000000..3b0f177a7
--- /dev/null
+++ b/.github/workflows/slowtests.yml
@@ -0,0 +1,35 @@
+name: Daily slow tests
+
+on:
+  schedule:
+    - cron: '0 0 * * *'
+
+jobs:
+  daily-slowtests:
+    strategy:
+      fail-fast: false
+      matrix:
+        branch: [master, develop, v4]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v1
+      - name: Get commits from past 24 hours
+        id: check_commits
+        run: |
+          today=$(date '+%Y-%m-%d %H:%M:%S')
+          yesterday=$(date -d "yesterday" '+%Y-%m-%d %H:%M:%S')
+          if git log --after="$yesterday" --before="$today" | grep commit ; then
+            echo "::set-output name=run_tests::true"
+          else
+            echo "::set-output name=run_tests::false"
+          fi
+
+      - name: Trigger buildkite build
+        if: steps.check_commits.outputs.run_tests == 'true'
+        uses: buildkite/trigger-pipeline-action@v1.2.0
+        env:
+          PIPELINE: explosion-ai/spacy-slow-tests
+          BRANCH: ${{ matrix.branch }}
+          MESSAGE: ":github: Daily slow tests - triggered from a GitHub Action"
+          BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_SECRET }}
diff --git a/README.md b/README.md
index 57d76fb45..05c912ffa 100644
--- a/README.md
+++ b/README.md
@@ -32,19 +32,20 @@ open-source software, released under the MIT license.
 
 ## 📖 Documentation
 
-| Documentation              |                                                                  |
-| -------------------------- | ---------------------------------------------------------------- |
-| ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! |
-| 📚 **[Usage Guides]** | How to use spaCy and its features. |
-| 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |
-| 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. |
-| 🎛 **[API Reference]** | The detailed reference for spaCy's API. |
-| 📦 **[Models]** | Download trained pipelines for spaCy. |
-| 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. |
-| 👩‍🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
-| 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
-| 🛠 **[Changelog]** | Changes and version history. |
-| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
+| Documentation              |                                                                                                                |
+| -------------------------- | -------------------------------------------------------------------------------------------------------------- |
+| ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! |
+| 📚 **[Usage Guides]** | How to use spaCy and its features. |
+| 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |
+| 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. |
+| 🎛 **[API Reference]** | The detailed reference for spaCy's API. |
+| 📦 **[Models]** | Download trained pipelines for spaCy. |
+| 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. |
+| 👩‍🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
+| 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
+| 🛠 **[Changelog]** | Changes and version history. |
+| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
+| spaCy Tailored Pipelines | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
 
 [spacy 101]: https://spacy.io/usage/spacy-101
 [new in v3.0]: https://spacy.io/usage/v3
@@ -60,9 +61,7 @@ open-source software, released under the MIT license.
 
 ## 💬 Where to ask questions
 
-The spaCy project is maintained by **[@honnibal](https://github.com/honnibal)**,
-**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)**,
-**[@adrianeboyd](https://github.com/adrianeboyd)** and **[@polm](https://github.com/polm)**.
+The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).
 Please understand that we won't be able to provide individual support via email.
 We also believe that help is much more valuable if it's shared publicly, so that
 more people can benefit from it.
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 71a793911..4624b2eb2 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -11,12 +11,14 @@ trigger:
   exclude:
     - "website/*"
     - "*.md"
+    - ".github/workflows/*"
 pr:
-  paths:
+  paths:
     exclude:
       - "*.md"
       - "website/docs/*"
       - "website/src/*"
+      - ".github/workflows/*"
 
 jobs:
   # Perform basic checks for most important errors (syntax etc.) Uses the config
diff --git a/requirements.txt b/requirements.txt
index 8d7372cfe..ca4099be5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -35,3 +35,4 @@ mypy==0.910
 types-dataclasses>=0.1.3; python_version < "3.7"
 types-mock>=0.1.1
 types-requests
+black>=22.0,<23.0
diff --git a/spacy/about.py b/spacy/about.py
index c253d5052..d01b278c9 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.2.1"
+__version__ = "3.2.2"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index ab7c20d48..a63795148 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -193,6 +193,70 @@ def debug_data(
     else:
         msg.info("No word vectors present in the package")
 
+    if "spancat" in factory_names:
+        model_labels_spancat = _get_labels_from_spancat(nlp)
+        has_low_data_warning = False
+        has_no_neg_warning = False
+
+        msg.divider("Span Categorization")
+        msg.table(model_labels_spancat, header=["Spans Key", "Labels"], divider=True)
+
+        msg.text("Label counts in train data: ", show=verbose)
+        for spans_key, data_labels in gold_train_data["spancat"].items():
+            msg.text(
+                f"Key: {spans_key}, {_format_labels(data_labels.items(), counts=True)}",
+                show=verbose,
+            )
+        # Data checks: only take the spans keys in the actual spancat components
+        data_labels_in_component = {
+            spans_key: gold_train_data["spancat"][spans_key]
+            for spans_key in model_labels_spancat.keys()
+        }
+        for spans_key, data_labels in data_labels_in_component.items():
+            for label, count in data_labels.items():
+                # Check for missing labels
+                spans_key_in_model = spans_key in model_labels_spancat.keys()
+                if (spans_key_in_model) and (
+                    label not in model_labels_spancat[spans_key]
+                ):
+                    msg.warn(
+                        f"Label '{label}' is not present in the model labels of key '{spans_key}'. "
+                        "Performance may degrade after training."
+                    )
+                # Check for low number of examples per label
+                if count <= NEW_LABEL_THRESHOLD:
+                    msg.warn(
+                        f"Low number of examples for label '{label}' in key '{spans_key}' ({count})"
+                    )
+                    has_low_data_warning = True
+                # Check for negative examples
+                with msg.loading("Analyzing label distribution..."):
+                    neg_docs = _get_examples_without_label(
+                        train_dataset, label, "spancat", spans_key
+                    )
+                if neg_docs == 0:
+                    msg.warn(f"No examples for texts WITHOUT new label '{label}'")
+                    has_no_neg_warning = True
+
+        if has_low_data_warning:
+            msg.text(
+                f"To train a new span type, your data should include at "
+                f"least {NEW_LABEL_THRESHOLD} instances of the new label",
+                show=verbose,
+            )
+        else:
+            msg.good("Good amount of examples for all labels")
+
+        if has_no_neg_warning:
+            msg.text(
+                "Training data should always include examples of spans "
+                "in context, as well as examples without a given span "
+                "type.",
+                show=verbose,
+            )
+        else:
+            msg.good("Examples without occurrences available for all labels")
+
     if "ner" in factory_names:
         # Get all unique NER labels present in the data
         labels = set(
@@ -238,7 +302,7 @@ def debug_data(
             has_low_data_warning = True
 
             with msg.loading("Analyzing label distribution..."):
-                neg_docs = _get_examples_without_label(train_dataset, label)
+                neg_docs = _get_examples_without_label(train_dataset, label, "ner")
             if neg_docs == 0:
                 msg.warn(f"No examples for texts WITHOUT new label '{label}'")
                 has_no_neg_warning = True
@@ -573,6 +637,7 @@ def _compile_gold(
         "deps": Counter(),
         "words": Counter(),
         "roots": Counter(),
+        "spancat": dict(),
         "ws_ents": 0,
         "boundary_cross_ents": 0,
         "n_words": 0,
@@ -603,6 +668,7 @@ def _compile_gold(
             if nlp.vocab.strings[word] not in nlp.vocab.vectors:
                 data["words_missing_vectors"].update([word])
         if "ner" in factory_names:
+            sent_starts = eg.get_aligned_sent_starts()
             for i, label in enumerate(eg.get_aligned_ner()):
                 if label is None:
                     continue
@@ -612,10 +678,19 @@ def _compile_gold(
                 if label.startswith(("B-", "U-")):
                     combined_label = label.split("-")[1]
                     data["ner"][combined_label] += 1
-                if gold[i].is_sent_start and label.startswith(("I-", "L-")):
+                if sent_starts[i] == True and label.startswith(("I-", "L-")):
                     data["boundary_cross_ents"] += 1
                 elif label == "-":
                     data["ner"]["-"] += 1
+        if "spancat" in factory_names:
+            for span_key in list(eg.reference.spans.keys()):
+                if span_key not in data["spancat"]:
+                    data["spancat"][span_key] = Counter()
+                for i, span in enumerate(eg.reference.spans[span_key]):
+                    if span.label_ is None:
+                        continue
+                    else:
+                        data["spancat"][span_key][span.label_] += 1
         if "textcat" in factory_names or "textcat_multilabel" in factory_names:
             data["cats"].update(gold.cats)
             if any(val not in (0, 1) for val in gold.cats.values()):
@@ -686,14 +761,28 @@ def _format_labels(
     return ", ".join([f"'{l}'" for l in cast(Iterable[str], labels)])
 
 
-def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
+def _get_examples_without_label(
+    data: Sequence[Example],
+    label: str,
+    component: Literal["ner", "spancat"] = "ner",
+    spans_key: Optional[str] = "sc",
+) -> int:
     count = 0
     for eg in data:
-        labels = [
-            label.split("-")[1]
-            for label in eg.get_aligned_ner()
-            if label not in ("O", "-", None)
-        ]
+        if component == "ner":
+            labels = [
+                label.split("-")[1]
+                for label in eg.get_aligned_ner()
+                if label not in ("O", "-", None)
+            ]
+
+        if component == "spancat":
+            labels = (
+                [span.label_ for span in eg.reference.spans[spans_key]]
+                if spans_key in eg.reference.spans
+                else []
+            )
+
         if label not in labels:
             count += 1
     return count
diff --git a/spacy/cli/package.py b/spacy/cli/package.py
index f9d2a9af2..b8c8397b6 100644
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@@ -7,6 +7,7 @@ from collections import defaultdict
 from catalogue import RegistryError
 import srsly
 import sys
+import re
 
 from ._util import app, Arg, Opt, string_to_list, WHEEL_SUFFIX, SDIST_SUFFIX
 from ..schemas import validate, ModelMetaSchema
@@ -109,6 +110,24 @@ def package(
             ", ".join(meta["requirements"]),
         )
     if name is not None:
+        if not name.isidentifier():
+            msg.fail(
+                f"Model name ('{name}') is not a valid module name. "
+                "This is required so it can be imported as a module.",
+                "We recommend names that use ASCII A-Z, a-z, _ (underscore), "
+                "and 0-9. "
+                "For specific details see: https://docs.python.org/3/reference/lexical_analysis.html#identifiers",
+                exits=1,
+            )
+        if not _is_permitted_package_name(name):
+            msg.fail(
+                f"Model name ('{name}') is not a permitted package name. "
+                "This is required to correctly load the model with spacy.load.",
+                "We recommend names that use ASCII A-Z, a-z, _ (underscore), "
+                "and 0-9. "
+                "For specific details see: https://www.python.org/dev/peps/pep-0426/#name",
+                exits=1,
+            )
         meta["name"] = name
     if version is not None:
         meta["version"] = version
@@ -162,7 +181,7 @@ def package(
         imports="\n".join(f"from . import {m}" for m in imports)
     )
     create_file(package_path / "__init__.py", init_py)
-    msg.good(f"Successfully created package '{model_name_v}'", main_path)
+    msg.good(f"Successfully created package directory '{model_name_v}'", main_path)
     if create_sdist:
         with util.working_dir(main_path):
             util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
@@ -171,8 +190,14 @@ def package(
     if create_wheel:
         with util.working_dir(main_path):
             util.run_command([sys.executable, "setup.py", "bdist_wheel"], capture=False)
-        wheel = main_path / "dist" / f"{model_name_v}{WHEEL_SUFFIX}"
+        wheel_name_squashed = re.sub("_+", "_", model_name_v)
+        wheel = main_path / "dist" / f"{wheel_name_squashed}{WHEEL_SUFFIX}"
         msg.good(f"Successfully created binary wheel", wheel)
+    if "__" in model_name:
+        msg.warn(
+            f"Model name ('{model_name}') contains a run of underscores. "
+            "Runs of underscores are not significant in installed package names.",
+        )
 
 
 def has_wheel() -> bool:
@@ -422,6 +447,14 @@ def _format_label_scheme(data: Dict[str, Any]) -> str:
     return md.text
 
 
+def _is_permitted_package_name(package_name: str) -> bool:
+    # regex from: https://www.python.org/dev/peps/pep-0426/#name
+    permitted_match = re.search(
+        r"^([A-Z0-9]|[A-Z0-9][A-Z0-9._-]*[A-Z0-9])$", package_name, re.IGNORECASE
+    )
+    return permitted_match is not None
+
+
 TEMPLATE_SETUP = """
 #!/usr/bin/env python
 import io
diff --git a/spacy/errors.py b/spacy/errors.py
index 390612123..b45c4f9db 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -483,7 +483,7 @@ class Errors(metaclass=ErrorsWithCodes):
             "components, since spans are only views of the Doc. Use Doc and "
             "Token attributes (or custom extension attributes) only and remove "
             "the following: {attrs}")
-    E181 = ("Received invalid attributes for unkown object {obj}: {attrs}. "
+    E181 = ("Received invalid attributes for unknown object {obj}: {attrs}. "
             "Only Doc and Token attributes are supported.")
     E182 = ("Received invalid attribute declaration: {attr}\nDid you forget "
             "to define the attribute? For example: `{attr}.???`")
diff --git a/spacy/glossary.py b/spacy/glossary.py
index e45704fc5..57254330f 100644
--- a/spacy/glossary.py
+++ b/spacy/glossary.py
@@ -310,7 +310,6 @@ GLOSSARY = {
     "re": "repeated element",
     "rs": "reported speech",
     "sb": "subject",
-    "sb": "subject",
     "sbp": "passivized subject (PP)",
     "sp": "subject or predicate",
     "svp": "separable verb prefix",
diff --git a/spacy/language.py b/spacy/language.py
index fdce34ac4..e8fd2720c 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -131,7 +131,7 @@ class Language:
         self,
         vocab: Union[Vocab, bool] = True,
         *,
-        max_length: int = 10 ** 6,
+        max_length: int = 10**6,
         meta: Dict[str, Any] = {},
         create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
         batch_size: int = 1000,
diff --git a/spacy/matcher/phrasematcher.pyi b/spacy/matcher/phrasematcher.pyi
index 82a194835..68e3386e4 100644
--- a/spacy/matcher/phrasematcher.pyi
+++ b/spacy/matcher/phrasematcher.pyi
@@ -14,7 +14,7 @@ class PhraseMatcher:
     def add(
         self,
         key: str,
-        docs: List[List[Dict[str, Any]]],
+        docs: List[Doc],
         *,
         on_match: Optional[
             Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any]
diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py
index 9e1face63..a7d67c6dd 100644
--- a/spacy/ml/models/multi_task.py
+++ b/spacy/ml/models/multi_task.py
@@ -85,7 +85,7 @@ def get_characters_loss(ops, docs, prediction, nr_char):
     target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
     target = target.reshape((-1, 256 * nr_char))
     diff = prediction - target
-    loss = (diff ** 2).sum()
+    loss = (diff**2).sum()
     d_target = diff / float(prediction.shape[0])
     return loss, d_target
 
diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py
index f5522f2d3..3759466d1 100644
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -378,7 +378,7 @@ class SpanCategorizer(TrainablePipe):
         # If the prediction is 0.9 and it's false, the gradient will be
         # 0.9 (0.9 - 0.0)
         d_scores = scores - target
-        loss = float((d_scores ** 2).sum())
+        loss = float((d_scores**2).sum())
         return loss, d_scores
 
     def initialize(
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index dd5fdc078..64a452a7a 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -288,7 +288,7 @@ class TextCategorizer(TrainablePipe):
             bp_scores(gradient)
         if sgd is not None:
             self.finish_update(sgd)
-        losses[self.name] += (gradient ** 2).sum()
+        losses[self.name] += (gradient**2).sum()
         return losses
 
     def _examples_to_truth(
@@ -322,7 +322,7 @@ class TextCategorizer(TrainablePipe):
         not_missing = self.model.ops.asarray(not_missing)  # type: ignore
         d_scores = (scores - truths)
         d_scores *= not_missing
-        mean_square_error = (d_scores ** 2).mean()
+        mean_square_error = (d_scores**2).mean()
         return float(mean_square_error), d_scores
 
     def add_label(self, label: str) -> int:
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 10700b787..858c7cbb6 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -684,6 +684,7 @@ def test_has_annotation(en_vocab):
     attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "HEAD", "ENT_IOB", "ENT_TYPE")
     for attr in attrs:
         assert not doc.has_annotation(attr)
+        assert not doc.has_annotation(attr, require_complete=True)
 
     doc[0].tag_ = "A"
     doc[0].pos_ = "X"
@@ -709,6 +710,27 @@ def test_has_annotation(en_vocab):
         assert doc.has_annotation(attr, require_complete=True)
 
 
+def test_has_annotation_sents(en_vocab):
+    doc = Doc(en_vocab, words=["Hello", "beautiful", "world"])
+    attrs = ("SENT_START", "IS_SENT_START", "IS_SENT_END")
+    for attr in attrs:
+        assert not doc.has_annotation(attr)
+        assert not doc.has_annotation(attr, require_complete=True)
+
+    # The first token (index 0) is always assumed to be a sentence start,
+    # and ignored by the check in doc.has_annotation
+
+    doc[1].is_sent_start = False
+    for attr in attrs:
+        assert doc.has_annotation(attr)
+        assert not doc.has_annotation(attr, require_complete=True)
+
+    doc[2].is_sent_start = False
+    for attr in attrs:
+        assert doc.has_annotation(attr)
+        assert doc.has_annotation(attr, require_complete=True)
+
+
 def test_is_flags_deprecated(en_tokenizer):
     doc = en_tokenizer("test")
     with pytest.deprecated_call():
diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py
index 75908df59..e20227455 100644
--- a/spacy/tests/package/test_requirements.py
+++ b/spacy/tests/package/test_requirements.py
@@ -12,6 +12,7 @@ def test_build_dependencies():
         "flake8",
         "hypothesis",
         "pre-commit",
+        "black",
         "mypy",
         "types-dataclasses",
         "types-mock",
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 253469909..fc35ff86e 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -12,16 +12,18 @@ from spacy.cli._util import is_subpath_of, load_project_config
 from spacy.cli._util import parse_config_overrides, string_to_list
 from spacy.cli._util import substitute_project_variables
 from spacy.cli._util import validate_project_commands
-from spacy.cli.debug_data import _get_labels_from_model
+from spacy.cli.debug_data import _compile_gold, _get_labels_from_model
 from spacy.cli.debug_data import _get_labels_from_spancat
 from spacy.cli.download import get_compatibility, get_version
 from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
 from spacy.cli.package import get_third_party_dependencies
+from spacy.cli.package import _is_permitted_package_name
 from spacy.cli.validate import get_model_pkgs
 from spacy.lang.en import English
 from spacy.lang.nl import Dutch
 from spacy.language import Language
 from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
+from spacy.tokens import Doc
 from spacy.training import Example, docs_to_json, offsets_to_biluo_tags
 from spacy.training.converters import conll_ner_to_docs, conllu_to_docs
 from spacy.training.converters import iob_to_docs
@@ -692,3 +694,39 @@ def test_get_labels_from_model(factory_name, pipe_name):
         assert _get_labels_from_spancat(nlp)[pipe.key] == set(labels)
     else:
         assert _get_labels_from_model(nlp, factory_name) == set(labels)
+
+
+def test_permitted_package_names():
+    # https://www.python.org/dev/peps/pep-0426/#name
+    assert _is_permitted_package_name("Meine_Bäume") == False
+    assert _is_permitted_package_name("_package") == False
+    assert _is_permitted_package_name("package_") == False
+    assert _is_permitted_package_name(".package") == False
+    assert _is_permitted_package_name("package.") == False
+    assert _is_permitted_package_name("-package") == False
+    assert _is_permitted_package_name("package-") == False
+
+
+def test_debug_data_compile_gold():
+    nlp = English()
+    pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"])
+    ref = Doc(
+        nlp.vocab,
+        words=["Token", ".", "New York City"],
+        sent_starts=[True, False, True],
+        ents=["O", "O", "B-ENT"],
+    )
+    eg = Example(pred, ref)
+    data = _compile_gold([eg], ["ner"], nlp, True)
+    assert data["boundary_cross_ents"] == 0
+
"York", "City"]) + ref = Doc( + nlp.vocab, + words=["Token", ".", "New York City"], + sent_starts=[True, False, True], + ents=["O", "B-ENT", "I-ENT"], + ) + eg = Example(pred, ref) + data = _compile_gold([eg], ["ner"], nlp, True) + assert data["boundary_cross_ents"] == 1 diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 5a0db115d..d33764ac9 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -420,6 +420,8 @@ cdef class Doc: cdef int range_start = 0 if attr == "IS_SENT_START" or attr == self.vocab.strings["IS_SENT_START"]: attr = SENT_START + elif attr == "IS_SENT_END" or attr == self.vocab.strings["IS_SENT_END"]: + attr = SENT_START attr = intify_attr(attr) # adjust attributes if attr == HEAD: diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index b515ab67b..d14930348 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -487,8 +487,6 @@ cdef class Token: RETURNS (bool / None): Whether the token starts a sentence. None if unknown. - - DOCS: https://spacy.io/api/token#is_sent_start """ def __get__(self): if self.c.sent_start == 0: diff --git a/spacy/util.py b/spacy/util.py index 14714143c..2a8b9f5cc 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -871,7 +871,6 @@ def get_package_path(name: str) -> Path: name (str): Package name. RETURNS (Path): Path to installed package. """ - name = name.lower() # use lowercase version to be safe # Here we're importing the module just to find it. This is worryingly # indirect, but it's otherwise very difficult to find the package. pkg = importlib.import_module(name) diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md index 986c6f458..35afc8fea 100644 --- a/website/docs/api/corpus.md +++ b/website/docs/api/corpus.md @@ -79,6 +79,7 @@ train/test skew. | `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ | | `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ | | `augmenter` | Optional data augmentation callback. ~~Callable[[Language, Example], Iterable[Example]]~~ | +| `shuffle` | Whether to shuffle the examples. Defaults to `False`. ~~bool~~ | ## Corpus.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 9836b8c21..c21328caf 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -304,7 +304,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor. ## Doc.has_annotation {#has_annotation tag="method"} -Check whether the doc contains annotation on a token attribute. +Check whether the doc contains annotation on a [`Token` attribute](/api/token#attributes). diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 44a2ea9e8..3c3d12d54 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -349,23 +349,6 @@ A sequence containing the token and all the token's syntactic descendants. | ---------- | ------------------------------------------------------------------------------------ | | **YIELDS** | A descendant token such that `self.is_ancestor(token)` or `token == self`. ~~Token~~ | -## Token.is_sent_start {#is_sent_start tag="property" new="2"} - -A boolean value indicating whether the token starts a sentence. `None` if -unknown. Defaults to `True` for the first token in the `Doc`. - -> #### Example -> -> ```python -> doc = nlp("Give it back! 
-> doc = nlp("Give it back! He pleaded.")
-> assert doc[4].is_sent_start
-> assert not doc[5].is_sent_start
-> ```
-
-| Name | Description |
-| ----------- | ------------------------------------------------------- |
-| **RETURNS** | Whether the token starts a sentence. ~~Optional[bool]~~ |
-
 ## Token.has_vector {#has_vector tag="property" model="vectors"}
 
 A boolean value indicating whether a word vector is associated with the token.
@@ -465,6 +448,8 @@ The L2 norm of the token's vector representation.
 | `is_punct` | Is the token punctuation? ~~bool~~ |
 | `is_left_punct` | Is the token a left punctuation mark, e.g. `"("` ? ~~bool~~ |
 | `is_right_punct` | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~ |
+| `is_sent_start` | Does the token start a sentence? ~~bool~~ or `None` if unknown. Defaults to `True` for the first token in the `Doc`. |
+| `is_sent_end` | Does the token end a sentence? ~~bool~~ or `None` if unknown. |
 | `is_space` | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~ |
 | `is_bracket` | Is the token a bracket? ~~bool~~ |
 | `is_quote` | Is the token a quotation mark? ~~bool~~ |
diff --git a/website/docs/images/spacy-tailored-pipelines_wide.png b/website/docs/images/spacy-tailored-pipelines_wide.png
new file mode 100644
index 000000000..d1a762ebe
Binary files /dev/null and b/website/docs/images/spacy-tailored-pipelines_wide.png differ
diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md
index e0e787a1d..57d226913 100644
--- a/website/docs/usage/projects.md
+++ b/website/docs/usage/projects.md
@@ -213,6 +213,12 @@ format, train a pipeline, evaluate it and export metrics, package it and spin
 up a quick web demo. It looks pretty similar to a config file used to define CI
 pipelines.
 
+> #### Tip: Multi-line YAML syntax for long values
+>
+> YAML has [multi-line syntax](https://yaml-multiline.info/) that can be
+> helpful for readability with longer values such as project descriptions or
+> commands that take several arguments.
+
 ```yaml
 %%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml
 ```
diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json
index 1054f7626..c49b49c73 100644
--- a/website/meta/sidebars.json
+++ b/website/meta/sidebars.json
@@ -40,7 +40,11 @@
         "label": "Resources",
         "items": [
           { "text": "Project Templates", "url": "https://github.com/explosion/projects" },
-          { "text": "v2.x Documentation", "url": "https://v2.spacy.io" }
+          { "text": "v2.x Documentation", "url": "https://v2.spacy.io" },
+          {
+            "text": "Custom Solutions",
+            "url": "https://explosion.ai/spacy-tailored-pipelines"
+          }
         ]
       }
     ]
diff --git a/website/meta/site.json b/website/meta/site.json
index 169680f86..9ecaef74c 100644
--- a/website/meta/site.json
+++ b/website/meta/site.json
@@ -48,7 +48,11 @@
             { "text": "Usage", "url": "/usage" },
             { "text": "Models", "url": "/models" },
             { "text": "API Reference", "url": "/api" },
-            { "text": "Online Course", "url": "https://course.spacy.io" }
+            { "text": "Online Course", "url": "https://course.spacy.io" },
+            {
+                "text": "Custom Solutions",
+                "url": "https://explosion.ai/spacy-tailored-pipelines"
+            }
         ]
     },
     {
diff --git a/website/meta/universe.json b/website/meta/universe.json
index b1a61598e..122281583 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -953,6 +953,37 @@
             "category": ["pipeline"],
             "tags": ["lemmatizer", "danish"]
         },
+        {
+            "id": "augmenty",
+            "title": "Augmenty",
+            "slogan": "The cherry on top of your NLP pipeline",
+            "description": "Augmenty is an augmentation library based on spaCy for augmenting texts. Augmenty differs from other augmentation libraries in that it corrects (as far as possible) the token, sentence and document labels under the augmentation.",
+            "github": "kennethenevoldsen/augmenty",
+            "pip": "augmenty",
+            "code_example": [
+                "import spacy",
+                "import augmenty",
+                "",
+                "nlp = spacy.load('en_core_web_md')",
+                "",
+                "docs = nlp.pipe(['Augmenty is a great tool for text augmentation'])",
+                "",
+                "ent_dict = {'ORG': [['spaCy'], ['spaCy', 'Universe']]}",
+                "entity_augmenter = augmenty.load('ents_replace.v1',",
+                "  ent_dict = ent_dict, level=1)",
+                "",
+                "for doc in augmenty.docs(docs, augmenter=entity_augmenter, nlp=nlp):",
+                "  print(doc)"
+            ],
+            "thumb": "https://github.com/KennethEnevoldsen/augmenty/blob/master/img/icon.png?raw=true",
+            "author": "Kenneth Enevoldsen",
+            "author_links": {
+                "github": "kennethenevoldsen",
+                "website": "https://www.kennethenevoldsen.com"
+            },
+            "category": ["training", "research"],
+            "tags": ["training", "research", "augmentation"]
+        },
         {
             "id": "dacy",
             "title": "DaCy",
@@ -3738,6 +3769,65 @@
             },
             "category": ["pipeline"],
             "tags": ["pipeline", "nlp", "sentiment"]
+        },
+        {
+            "id": "textnets",
+            "slogan": "Text analysis with networks",
+            "description": "textnets represents collections of texts as networks of documents and words. This provides novel possibilities for the visualization and analysis of texts.",
+            "github": "jboynyc/textnets",
+            "image": "https://user-images.githubusercontent.com/2187261/152641425-6c0fb41c-b8e0-44fb-a52a-7c1ba24eba1e.png",
+            "code_example": [
+                "import textnets as tn",
+                "",
+                "corpus = tn.Corpus(tn.examples.moon_landing)",
+                "t = tn.Textnet(corpus.tokenized(), min_docs=1)",
+                "t.plot(label_nodes=True,",
+                "       show_clusters=True,",
+                "       scale_nodes_by=\"birank\",",
+                "       scale_edges_by=\"weight\")"
+            ],
+            "author": "John Boy",
+            "author_links": {
+                "github": "jboynyc",
+                "twitter": "jboy"
+            },
+            "category": ["visualizers", "standalone"]
+        },
+        {
+            "id": "tmtoolkit",
+            "slogan": "Text mining and topic modeling toolkit",
+            "description": "tmtoolkit is a set of tools for text mining and topic modeling with Python developed especially for the use in the social sciences, in journalism or related disciplines. It aims for easy installation, extensive documentation and a clear programming interface while offering good performance on large datasets by the means of vectorized operations (via NumPy) and parallel computation (using Python’s multiprocessing module and the loky package).",
+            "github": "WZBSocialScienceCenter/tmtoolkit",
+            "code_example": [
+                "# Note: This requires these setup steps:",
+                "# pip install tmtoolkit[recommended]",
+                "# python -m tmtoolkit setup en",
+                "from tmtoolkit.corpus import Corpus, tokens_table, lemmatize, to_lowercase, dtm",
+                "from tmtoolkit.bow.bow_stats import tfidf, sorted_terms_table",
+                "# load built-in sample dataset and use 4 worker processes",
+                "corp = Corpus.from_builtin_corpus('en-News100', max_workers=4)",
+                "# investigate corpus as dataframe",
+                "toktbl = tokens_table(corp)",
+                "print(toktbl)",
+                "# apply some text normalization",
+                "lemmatize(corp)",
+                "to_lowercase(corp)",
+                "# build sparse document-token matrix (DTM)",
+                "# document labels identify rows, vocabulary tokens identify columns",
+                "mat, doc_labels, vocab = dtm(corp, return_doc_labels=True, return_vocab=True)",
+                "# apply tf-idf transformation to DTM",
+                "# operation is applied on sparse matrix and uses few memory",
+                "tfidf_mat = tfidf(mat)",
+                "# show top 5 tokens per document ranked by tf-idf",
+                "top_tokens = sorted_terms_table(tfidf_mat, vocab, doc_labels, top_n=5)",
+                "print(top_tokens)"
+            ],
+            "author": "Markus Konrad / WZB Social Science Center",
+            "author_links": {
+                "github": "internaut",
+                "twitter": "_knrd"
+            },
+            "category": ["scientific", "standalone"]
+        }
     ],
diff --git a/website/src/components/list.js b/website/src/components/list.js
index e0a3d9b64..d31617487 100644
--- a/website/src/components/list.js
+++ b/website/src/components/list.js
@@ -6,11 +6,14 @@ import { replaceEmoji } from './icon'
 
 export const Ol = props =>
 export const Ul = props =>