From bef9f63e1391d89d9e246855e9f769b3172a2c18 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 21 Sep 2023 11:28:58 +0200 Subject: [PATCH 01/25] Add gpt-3.5-turbo-instruct to list of supported OpenAI models. --- website/docs/api/large-language-models.mdx | 68 +++++++++++----------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/website/docs/api/large-language-models.mdx b/website/docs/api/large-language-models.mdx index d32368e22..43a95074a 100644 --- a/website/docs/api/large-language-models.mdx +++ b/website/docs/api/large-language-models.mdx @@ -19,8 +19,8 @@ prototyping** and **prompting**, and turning unstructured responses into An LLM component is implemented through the `LLMWrapper` class. It is accessible through a generic `llm` [component factory](https://spacy.io/usage/processing-pipelines#custom-components-factories) -as well as through task-specific component factories: `llm_ner`, `llm_spancat`, `llm_rel`, -`llm_textcat`, `llm_sentiment` and `llm_summarization`. +as well as through task-specific component factories: `llm_ner`, `llm_spancat`, +`llm_rel`, `llm_textcat`, `llm_sentiment` and `llm_summarization`. ### LLMWrapper.\_\_init\_\_ {id="init",tag="method"} @@ -952,38 +952,38 @@ provider's API. Currently, these models are provided as part of the core library: -| Model | Provider | Supported names | Default name | Default config | -| ----------------------------- | --------- | ---------------------------------------------------------------------------------------- | ---------------------- | ------------------------------------ | -| `spacy.GPT-4.v1` | OpenAI | `["gpt-4", "gpt-4-0314", "gpt-4-32k", "gpt-4-32k-0314"]` | `"gpt-4"` | `{}` | -| `spacy.GPT-4.v2` | OpenAI | `["gpt-4", "gpt-4-0314", "gpt-4-32k", "gpt-4-32k-0314"]` | `"gpt-4"` | `{temperature=0.0}` | -| `spacy.GPT-3-5.v1` | OpenAI | `["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-3.5-turbo-0613", "gpt-3.5-turbo-0613-16k"]` | `"gpt-3.5-turbo"` | `{}` | -| `spacy.GPT-3-5.v2` | OpenAI | `["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-3.5-turbo-0613", "gpt-3.5-turbo-0613-16k"]` | `"gpt-3.5-turbo"` | `{temperature=0.0}` | -| `spacy.Davinci.v1` | OpenAI | `["davinci"]` | `"davinci"` | `{}` | -| `spacy.Davinci.v2` | OpenAI | `["davinci"]` | `"davinci"` | `{temperature=0.0, max_tokens=500}` | -| `spacy.Text-Davinci.v1` | OpenAI | `["text-davinci-003", "text-davinci-002"]` | `"text-davinci-003"` | `{}` | -| `spacy.Text-Davinci.v2` | OpenAI | `["text-davinci-003", "text-davinci-002"]` | `"text-davinci-003"` | `{temperature=0.0, max_tokens=1000}` | -| `spacy.Code-Davinci.v1` | OpenAI | `["code-davinci-002"]` | `"code-davinci-002"` | `{}` | -| `spacy.Code-Davinci.v2` | OpenAI | `["code-davinci-002"]` | `"code-davinci-002"` | `{temperature=0.0, max_tokens=500}` | -| `spacy.Curie.v1` | OpenAI | `["curie"]` | `"curie"` | `{}` | -| `spacy.Curie.v2` | OpenAI | `["curie"]` | `"curie"` | `{temperature=0.0, max_tokens=500}` | -| `spacy.Text-Curie.v1` | OpenAI | `["text-curie-001"]` | `"text-curie-001"` | `{}` | -| `spacy.Text-Curie.v2` | OpenAI | `["text-curie-001"]` | `"text-curie-001"` | `{temperature=0.0, max_tokens=500}` | -| `spacy.Babbage.v1` | OpenAI | `["babbage"]` | `"babbage"` | `{}` | -| `spacy.Babbage.v2` | OpenAI | `["babbage"]` | `"babbage"` | `{temperature=0.0, max_tokens=500}` | -| `spacy.Text-Babbage.v1` | OpenAI | `["text-babbage-001"]` | `"text-babbage-001"` | `{}` | -| `spacy.Text-Babbage.v2` | OpenAI | `["text-babbage-001"]` | `"text-babbage-001"` | `{temperature=0.0, max_tokens=500}` | -| `spacy.Ada.v1` | OpenAI | `["ada"]` | `"ada"` | `{}` | -| `spacy.Ada.v2` | OpenAI | `["ada"]` | `"ada"` | `{temperature=0.0, max_tokens=500}` | -| `spacy.Text-Ada.v1` | OpenAI | `["text-ada-001"]` | `"text-ada-001"` | `{}` | -| `spacy.Text-Ada.v2` | OpenAI | `["text-ada-001"]` | `"text-ada-001"` | `{temperature=0.0, max_tokens=500}` | -| `spacy.Command.v1` | Cohere | `["command", "command-light", "command-light-nightly", "command-nightly"]` | `"command"` | `{}` | -| `spacy.Claude-2.v1` | Anthropic | `["claude-2", "claude-2-100k"]` | `"claude-2"` | `{}` | -| `spacy.Claude-1.v1` | Anthropic | `["claude-1", "claude-1-100k"]` | `"claude-1"` | `{}` | -| `spacy.Claude-1-0.v1` | Anthropic | `["claude-1.0"]` | `"claude-1.0"` | `{}` | -| `spacy.Claude-1-2.v1` | Anthropic | `["claude-1.2"]` | `"claude-1.2"` | `{}` | -| `spacy.Claude-1-3.v1` | Anthropic | `["claude-1.3", "claude-1.3-100k"]` | `"claude-1.3"` | `{}` | -| `spacy.Claude-instant-1.v1` | Anthropic | `["claude-instant-1", "claude-instant-1-100k"]` | `"claude-instant-1"` | `{}` | -| `spacy.Claude-instant-1-1.v1` | Anthropic | `["claude-instant-1.1", "claude-instant-1.1-100k"]` | `"claude-instant-1.1"` | `{}` | +| Model | Provider | Supported names | Default name | Default config | +| ----------------------------- | --------- | ------------------------------------------------------------------------------------------------------------------ | ---------------------- | ------------------------------------ | +| `spacy.GPT-4.v1` | OpenAI | `["gpt-4", "gpt-4-0314", "gpt-4-32k", "gpt-4-32k-0314"]` | `"gpt-4"` | `{}` | +| `spacy.GPT-4.v2` | OpenAI | `["gpt-4", "gpt-4-0314", "gpt-4-32k", "gpt-4-32k-0314"]` | `"gpt-4"` | `{temperature=0.0}` | +| `spacy.GPT-3-5.v1` | OpenAI | `["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-3.5-turbo-0613", "gpt-3.5-turbo-0613-16k", "gpt-3.5-turbo-instruct"]` | `"gpt-3.5-turbo"` | `{}` | +| `spacy.GPT-3-5.v2` | OpenAI | `["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-3.5-turbo-0613", "gpt-3.5-turbo-0613-16k", "gpt-3.5-turbo-instruct"]` | `"gpt-3.5-turbo"` | `{temperature=0.0}` | +| `spacy.Davinci.v1` | OpenAI | `["davinci"]` | `"davinci"` | `{}` | +| `spacy.Davinci.v2` | OpenAI | `["davinci"]` | `"davinci"` | `{temperature=0.0, max_tokens=500}` | +| `spacy.Text-Davinci.v1` | OpenAI | `["text-davinci-003", "text-davinci-002"]` | `"text-davinci-003"` | `{}` | +| `spacy.Text-Davinci.v2` | OpenAI | `["text-davinci-003", "text-davinci-002"]` | `"text-davinci-003"` | `{temperature=0.0, max_tokens=1000}` | +| `spacy.Code-Davinci.v1` | OpenAI | `["code-davinci-002"]` | `"code-davinci-002"` | `{}` | +| `spacy.Code-Davinci.v2` | OpenAI | `["code-davinci-002"]` | `"code-davinci-002"` | `{temperature=0.0, max_tokens=500}` | +| `spacy.Curie.v1` | OpenAI | `["curie"]` | `"curie"` | `{}` | +| `spacy.Curie.v2` | OpenAI | `["curie"]` | `"curie"` | `{temperature=0.0, max_tokens=500}` | +| `spacy.Text-Curie.v1` | OpenAI | `["text-curie-001"]` | `"text-curie-001"` | `{}` | +| `spacy.Text-Curie.v2` | OpenAI | `["text-curie-001"]` | `"text-curie-001"` | `{temperature=0.0, max_tokens=500}` | +| `spacy.Babbage.v1` | OpenAI | `["babbage"]` | `"babbage"` | `{}` | +| `spacy.Babbage.v2` | OpenAI | `["babbage"]` | `"babbage"` | `{temperature=0.0, max_tokens=500}` | +| `spacy.Text-Babbage.v1` | OpenAI | `["text-babbage-001"]` | `"text-babbage-001"` | `{}` | +| `spacy.Text-Babbage.v2` | OpenAI | `["text-babbage-001"]` | `"text-babbage-001"` | `{temperature=0.0, max_tokens=500}` | +| `spacy.Ada.v1` | OpenAI | `["ada"]` | `"ada"` | `{}` | +| `spacy.Ada.v2` | OpenAI | `["ada"]` | `"ada"` | `{temperature=0.0, max_tokens=500}` | +| `spacy.Text-Ada.v1` | OpenAI | `["text-ada-001"]` | `"text-ada-001"` | `{}` | +| `spacy.Text-Ada.v2` | OpenAI | `["text-ada-001"]` | `"text-ada-001"` | `{temperature=0.0, max_tokens=500}` | +| `spacy.Command.v1` | Cohere | `["command", "command-light", "command-light-nightly", "command-nightly"]` | `"command"` | `{}` | +| `spacy.Claude-2.v1` | Anthropic | `["claude-2", "claude-2-100k"]` | `"claude-2"` | `{}` | +| `spacy.Claude-1.v1` | Anthropic | `["claude-1", "claude-1-100k"]` | `"claude-1"` | `{}` | +| `spacy.Claude-1-0.v1` | Anthropic | `["claude-1.0"]` | `"claude-1.0"` | `{}` | +| `spacy.Claude-1-2.v1` | Anthropic | `["claude-1.2"]` | `"claude-1.2"` | `{}` | +| `spacy.Claude-1-3.v1` | Anthropic | `["claude-1.3", "claude-1.3-100k"]` | `"claude-1.3"` | `{}` | +| `spacy.Claude-instant-1.v1` | Anthropic | `["claude-instant-1", "claude-instant-1-100k"]` | `"claude-instant-1"` | `{}` | +| `spacy.Claude-instant-1-1.v1` | Anthropic | `["claude-instant-1.1", "claude-instant-1.1-100k"]` | `"claude-instant-1.1"` | `{}` | To use these models, make sure that you've [set the relevant API](#api-keys) keys as environment variables. From 1b043dde3fc674869f11b8b138db878552b4c91a Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 9 Aug 2023 13:43:50 +0200 Subject: [PATCH 02/25] Revert "disable tests until 3.7 models are available" This reverts commit 991bcc111e1a35cc96dba32ac08c212b0b360384. --- .github/workflows/tests.yml | 54 ++++++++++++++++++------------------- spacy/tests/test_cli.py | 2 -- 2 files changed, 27 insertions(+), 29 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 1058b4673..976b1f4f2 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -115,22 +115,22 @@ jobs: - name: Test import run: python -W error -c "import spacy" -# - name: "Test download CLI" -# run: | -# python -m spacy download ca_core_news_sm -# python -m spacy download ca_core_news_md -# python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" -# if: matrix.python_version == '3.9' -# -# - name: "Test download_url in info CLI" -# run: | -# python -W error -m spacy info ca_core_news_sm | grep -q download_url -# if: matrix.python_version == '3.9' -# -# - name: "Test no warnings on load (#11713)" -# run: | -# python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" -# if: matrix.python_version == '3.9' + - name: "Test download CLI" + run: | + python -m spacy download ca_core_news_sm + python -m spacy download ca_core_news_md + python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" + if: matrix.python_version == '3.9' + + - name: "Test download_url in info CLI" + run: | + python -W error -m spacy info ca_core_news_sm | grep -q download_url + if: matrix.python_version == '3.9' + + - name: "Test no warnings on load (#11713)" + run: | + python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" + if: matrix.python_version == '3.9' - name: "Test convert CLI" run: | @@ -154,17 +154,17 @@ jobs: python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1 if: matrix.python_version == '3.9' -# - name: "Test assemble CLI" -# run: | -# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" -# PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir -# if: matrix.python_version == '3.9' -# -# - name: "Test assemble CLI vectors warning" -# run: | -# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" -# python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 -# if: matrix.python_version == '3.9' + - name: "Test assemble CLI" + run: | + python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" + PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir + if: matrix.python_version == '3.9' + + - name: "Test assemble CLI vectors warning" + run: | + python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" + python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 + if: matrix.python_version == '3.9' - name: "Install test requirements" run: | diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 8c1d1a64c..0d2fe0a9e 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -538,7 +538,6 @@ def test_string_to_list_intify(value): assert string_to_list(value, intify=True) == [1, 2, 3] -@pytest.mark.skip(reason="Temporarily skip before 3.7 models are published") def test_download_compatibility(): spec = SpecifierSet("==" + about.__version__) spec.prereleases = False @@ -549,7 +548,6 @@ def test_download_compatibility(): assert get_minor_version(about.__version__) == get_minor_version(version) -@pytest.mark.skip(reason="Temporarily skip before 3.7 models are published") def test_validate_compatibility_table(): spec = SpecifierSet("==" + about.__version__) spec.prereleases = False From 160e61772e3e4fbd4e9e28446c6d687596921f93 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Sun, 1 Oct 2023 21:40:07 +0200 Subject: [PATCH 03/25] Docs for v3.7.0 (#13029) * Docs for v3.7.0 * Minor fixes * Extend Weasel notes * Minor edits * Update version in README --- README.md | 2 +- website/docs/usage/v3-7.mdx | 140 +++++++++++++++++++++++++++++++++ website/meta/sidebars.json | 3 +- website/src/templates/index.js | 4 +- 4 files changed, 145 insertions(+), 4 deletions(-) create mode 100644 website/docs/usage/v3-7.mdx diff --git a/README.md b/README.md index 3920c1dc2..b2ffa4639 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ model packaging, deployment and workflow management. spaCy is commercial open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE). -💫 **Version 3.6 out now!** +💫 **Version 3.7 out now!** [Check out the release notes here.](https://github.com/explosion/spaCy/releases) [![tests](https://github.com/explosion/spaCy/actions/workflows/tests.yml/badge.svg)](https://github.com/explosion/spaCy/actions/workflows/tests.yml) diff --git a/website/docs/usage/v3-7.mdx b/website/docs/usage/v3-7.mdx new file mode 100644 index 000000000..76fc9530f --- /dev/null +++ b/website/docs/usage/v3-7.mdx @@ -0,0 +1,140 @@ +--- +title: What's New in v3.7 +teaser: New features and how to upgrade +menu: + - ['New Features', 'features'] + - ['Upgrading Notes', 'upgrading'] +--- + +## New features {id="features",hidden="true"} + +spaCy v3.7 adds support for Python 3.12, introduces the new standalone library +[Weasel](https://github.com/explosion/weasel) for project workflows, and updates +the transformer-based trained pipelines to use our new +[Curated Transformers](https://github.com/explosion/curated-transformers) +library. + +This release drops support for Python 3.6. + +### Weasel {id="weasel"} + +The [spaCy projects](/usage/projects) functionality has been moved into a new +standalone library [Weasel](https://github.com/explosion/weasel). This brings +minor changes to spaCy-specific settings in spaCy projects (see +[upgrading](#upgrading) below), but also makes it possible to use the same +workflow functionality outside of spaCy. + +All `spacy project` commands should run as before, just now they're using Weasel +under the hood. + + + +Remote storage for spaCy projects is not yet supported for Python 3.12. Use +Python 3.11 or earlier for remote storage. + + + +### Registered vectors {id="custom-vectors"} + +You can specify a custom registered vectors class under `[nlp.vectors]` in order +to use static vectors in formats other than the ones supported by +[`Vectors`](/api/vectors). To implement your custom vectors, extend the abstract +class [`BaseVectors`](/api/basevectors). See an example using +[BPEmb subword embeddings](/usage/embeddings-transformers#custom-vectors). + +### Additional features and improvements {id="additional-features-and-improvements"} + +- Add support for Python 3.12. +- Extend to Thinc v8.2. +- Extend `transformers` extra to `spacy-transformers` v1.3. +- Add `--spans-key` option for CLI evaluation with `spacy benchmark accuracy`. +- Load the CLI module lazily for `spacy.info`. +- Add type stubs for for `spacy.training.example`. +- Warn for unsupported pattern keys in dependency matcher. +- `Language.replace_listeners`: Pass the replaced listener and the `tok2vec` + pipe to the callback in order to support `spacy-curated-transformers`. +- Always use `tqdm` with `disable=None` in order to disable output in + non-interactive environments. +- Language updates: + - Add left and right pointing angle brackets as punctuation to ancient Greek. + - Update example sentences for Turkish. +- Package setup updates: + - Update NumPy build constraints for NumPy 1.25+. For Python 3.9+, it is no + longer necessary to set build constraints while building binary wheels. + - Refactor Cython profiling in order to disable profiling for Python 3.12 in + the package setup, since Cython does not currently support profiling for + Python 3.12. + +## Trained pipelines {id="pipelines"} + +### Pipeline updates {id="pipeline-updates"} + +The transformer-based `trf` pipelines have been updated to use our new +[Curated Transformers](https://github.com/explosion/curated-transformers) +library using the Thinc model wrappers and pipeline component from +[spaCy Curated Transformers](https://github.com/explosion/spacy-curated-transformers). + +## Notes about upgrading from v3.6 {id="upgrading"} + +This release drops support for Python 3.6, drops mypy checks for Python 3.7 and +removes the `ray` extra. In addition there are several minor changes for spaCy +projects described in the following section. + +### Backwards incompatibilities for spaCy Projects {id="upgrading-projects"} + +`spacy project` has a few backwards incompatibilities due to the transition to +the standalone library [Weasel](https://github.com/explosion/weasel), which is +not as tightly coupled to spaCy. Weasel produces warnings when it detects older +spaCy-specific settings in your environment or project config. + +- Support for the `spacy_version` configuration key has been dropped. +- Support for the `check_requirements` configuration key has been dropped due to + the deprecation of `pkg_resources`. +- The `SPACY_CONFIG_OVERRIDES` environment variable is no longer checked. You + can set configuration overrides using `WEASEL_CONFIG_OVERRIDES`. +- Support for `SPACY_PROJECT_USE_GIT_VERSION` environment variable has been + dropped. +- Error codes are now Weasel-specific and do not follow spaCy error codes. + +### Pipeline package version compatibility {id="version-compat"} + +> #### Using legacy implementations +> +> In spaCy v3, you'll still be able to load and reference legacy implementations +> via [`spacy-legacy`](https://github.com/explosion/spacy-legacy), even if the +> components or architectures change and newer versions are available in the +> core library. + +When you're loading a pipeline package trained with an earlier version of spaCy +v3, you will see a warning telling you that the pipeline may be incompatible. +This doesn't necessarily have to be true, but we recommend running your +pipelines against your test suite or evaluation data to make sure there are no +unexpected results. + +If you're using one of the [trained pipelines](/models) we provide, you should +run [`spacy download`](/api/cli#download) to update to the latest version. To +see an overview of all installed packages and their compatibility, you can run +[`spacy validate`](/api/cli#validate). + +If you've trained your own custom pipeline and you've confirmed that it's still +working as expected, you can update the spaCy version requirements in the +[`meta.json`](/api/data-formats#meta): + +```diff +- "spacy_version": ">=3.6.0,<3.7.0", ++ "spacy_version": ">=3.6.0,<3.8.0", +``` + +### Updating v3.6 configs + +To update a config from spaCy v3.6 with the new v3.7 settings, run +[`init fill-config`](/api/cli#init-fill-config): + +```cli +$ python -m spacy init fill-config config-v3.6.cfg config-v3.7.cfg +``` + +In many cases ([`spacy train`](/api/cli#train), +[`spacy.load`](/api/top-level#spacy.load)), the new defaults will be filled in +automatically, but you'll need to fill in the new settings to run +[`debug config`](/api/cli#debug) and [`debug data`](/api/cli#debug-data). diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 617473cb0..24213ed12 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -15,7 +15,8 @@ { "text": "New in v3.3", "url": "/usage/v3-3" }, { "text": "New in v3.4", "url": "/usage/v3-4" }, { "text": "New in v3.5", "url": "/usage/v3-5" }, - { "text": "New in v3.6", "url": "/usage/v3-6" } + { "text": "New in v3.6", "url": "/usage/v3-6" }, + { "text": "New in v3.7", "url": "/usage/v3-7" } ] }, { diff --git a/website/src/templates/index.js b/website/src/templates/index.js index c8295593c..1c969bd39 100644 --- a/website/src/templates/index.js +++ b/website/src/templates/index.js @@ -58,8 +58,8 @@ const AlertSpace = ({ nightly, legacy }) => { } const navAlert = ( - - 💥 Out now: spaCy v3.6 + + 💥 Out now: spaCy v3.7 ) From 92ce32aa3f04b2d7fac2db0b5bfe3411c8709d9e Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 2 Oct 2023 12:53:46 +0200 Subject: [PATCH 04/25] Update binder version to v3.7 (#13034) --- website/meta/site.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/meta/site.json b/website/meta/site.json index 08fcde62e..a07d131d3 100644 --- a/website/meta/site.json +++ b/website/meta/site.json @@ -27,7 +27,7 @@ "indexName": "spacy" }, "binderUrl": "explosion/spacy-io-binder", - "binderVersion": "3.6", + "binderVersion": "3.7", "sections": [ { "id": "usage", "title": "Usage Documentation", "theme": "blue" }, { "id": "models", "title": "Models Documentation", "theme": "blue" }, From 6d0185f7fba4d8a4f76a9c35d2e78542ee0c226a Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 4 Oct 2023 12:33:33 +0200 Subject: [PATCH 05/25] Revert "Load the cli module lazily for spacy.info (#12962)" This reverts commit beda27a91eadd70563dbaffd844d8c9d5e245928. --- spacy/__init__.py | 7 +------ spacy/tests/test_cli.py | 4 ---- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 8aa2eccd7..1a18ad0d5 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -13,6 +13,7 @@ from thinc.api import Config, prefer_gpu, require_cpu, require_gpu # noqa: F401 from . import pipeline # noqa: F401 from . import util from .about import __version__ # noqa: F401 +from .cli.info import info # noqa: F401 from .errors import Errors from .glossary import explain # noqa: F401 from .language import Language @@ -76,9 +77,3 @@ def blank( # We should accept both dot notation and nested dict here for consistency config = util.dot_to_dict(config) return LangClass.from_config(config, vocab=vocab, meta=meta) - - -def info(*args, **kwargs): - from .cli.info import info as cli_info - - return cli_info(*args, **kwargs) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 0d2fe0a9e..86451317b 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -12,7 +12,6 @@ from thinc.api import Config import spacy from spacy import about -from spacy import info as spacy_info from spacy.cli import info from spacy.cli._util import parse_config_overrides, string_to_list, walk_directory from spacy.cli.apply import apply @@ -193,9 +192,6 @@ def test_cli_info(): raw_data = info(tmp_dir, exclude=[""]) assert raw_data["lang"] == "nl" assert raw_data["components"] == ["textcat"] - raw_data = spacy_info(tmp_dir, exclude=[""]) - assert raw_data["lang"] == "nl" - assert raw_data["components"] == ["textcat"] def test_cli_converters_conllu_to_docs(): From 9d036607f1ad60ebf1719526c0ec1f531eb688e9 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 4 Oct 2023 18:13:12 +0200 Subject: [PATCH 06/25] Set version to v3.7.1 (#13042) --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 1a3367673..0e718400b 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,5 +1,5 @@ # fmt: off __title__ = "spacy" -__version__ = "3.7.0" +__version__ = "3.7.1" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 734826db79fc27b4632c364d04b4ddd450f1772d Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 5 Oct 2023 08:45:25 +0200 Subject: [PATCH 07/25] Update `spacy-llm` task argument docs w.r.t. task refactoring (#12995) * Update task arguments w.r.t. task refactoring in 0.5.0. * Add disclaimer w.r.t. gated models/Llama 2. * Update website/docs/api/large-language-models.mdx * Update website/docs/api/large-language-models.mdx --- website/docs/api/large-language-models.mdx | 271 ++++++++++++--------- 1 file changed, 162 insertions(+), 109 deletions(-) diff --git a/website/docs/api/large-language-models.mdx b/website/docs/api/large-language-models.mdx index 43a95074a..845edaa1a 100644 --- a/website/docs/api/large-language-models.mdx +++ b/website/docs/api/large-language-models.mdx @@ -254,12 +254,14 @@ prompting. > max_n_words = null > ``` -| Argument | Description | -| ------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `template` | Custom prompt template to send to LLM model. Defaults to [summarization.v1.jinja](https://github.com/explosion/spacy-llm/blob/main/spacy_llm/tasks/templates/summarization.v1.jinja). ~~str~~ | -| `examples` | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~ | -| `max_n_words` | Maximum number of words to be used in summary. Note that this should not expected to work exactly. Defaults to `None`. ~~Optional[int]~~ | -| `field` | Name of extension attribute to store summary in (i. e. the summary will be available in `doc._.{field}`). Defaults to `summary`. ~~str~~ | +| Argument | Description | +| --------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `template` | Custom prompt template to send to LLM model. Defaults to [summarization.v1.jinja](https://github.com/explosion/spacy-llm/blob/main/spacy_llm/tasks/templates/summarization.v1.jinja). ~~str~~ | +| `examples` | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~ | +| `parse_responses` (NEW) | Callable for parsing LLM responses for this task. Defaults to the internal parsing method for this task. ~~Optional[TaskResponseParser[SummarizationTask]]~~ | +| `prompt_example_type` (NEW) | Type to use for fewshot examples. Defaults to `SummarizationExample`. ~~Optional[Type[FewshotExample]]~~ | +| `max_n_words` | Maximum number of words to be used in summary. Note that this should not expected to work exactly. Defaults to `None`. ~~Optional[int]~~ | +| `field` | Name of extension attribute to store summary in (i. e. the summary will be available in `doc._.{field}`). Defaults to `summary`. ~~str~~ | The summarization task prompts the model for a concise summary of the provided text. It optionally allows to limit the response to a certain number of tokens - @@ -325,16 +327,19 @@ When no examples are [specified](/usage/large-language-models#few-shot-prompts), the v3 implementation will use a dummy example in the prompt. Technically this means that the task will always perform few-shot prompting under the hood. -| Argument | Description | -| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `labels` | List of labels or str of comma-separated list of labels. ~~Union[List[str], str]~~ | -| `label_definitions` | Optional dict mapping a label to a description of that label. These descriptions are added to the prompt to help instruct the LLM on what to extract. Defaults to `None`. ~~Optional[Dict[str, str]]~~ | -| `template` | Custom prompt template to send to LLM model. Defaults to [ner.v3.jinja](https://github.com/explosion/spacy-llm/blob/main/spacy_llm/tasks/templates/ner.v3.jinja). ~~str~~ | -| `description` (NEW) | A description of what to recognize or not recognize as entities. ~~str~~ | -| `examples` | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~ | -| `normalizer` | Function that normalizes the labels as returned by the LLM. If `None`, defaults to `spacy.LowercaseNormalizer.v1`. Defaults to `None`. ~~Optional[Callable[[str], str]]~~ | -| `alignment_mode` | Alignment mode in case the LLM returns entities that do not align with token boundaries. Options are `"strict"`, `"contract"` or `"expand"`. Defaults to `"contract"`. ~~str~~ | -| `case_sensitive_matching` | Whether to search without case sensitivity. Defaults to `False`. ~~bool~~ | +| Argument | Description | +| --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `template` | Custom prompt template to send to LLM model. Defaults to [ner.v3.jinja](https://github.com/explosion/spacy-llm/blob/main/spacy_llm/tasks/templates/ner.v3.jinja). ~~str~~ | +| `examples` | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~ | +| `parse_responses` (NEW) | Callable for parsing LLM responses for this task. Defaults to the internal parsing method for this task. ~~Optional[TaskResponseParser[NERTask]]~~ | +| `prompt_example_type` (NEW) | Type to use for fewshot examples. Defaults to `NERExample`. ~~Optional[Type[FewshotExample]]~~ | +| `scorer` | Scorer function that evaluates the task performance on provided examples. Defaults to the metric used by spaCy. ~~Optional[Scorer]~~ | +| `labels` | List of labels or str of comma-separated list of labels. ~~Union[List[str], str]~~ | +| `label_definitions` | Optional dict mapping a label to a description of that label. These descriptions are added to the prompt to help instruct the LLM on what to extract. Defaults to `None`. ~~Optional[Dict[str, str]]~~ | +| `description` (NEW) | A description of what to recognize or not recognize as entities. ~~str~~ | +| `normalizer` | Function that normalizes the labels as returned by the LLM. If `None`, defaults to `spacy.LowercaseNormalizer.v1`. Defaults to `None`. ~~Optional[Callable[[str], str]]~~ | +| `alignment_mode` | Alignment mode in case the LLM returns entities that do not align with token boundaries. Options are `"strict"`, `"contract"` or `"expand"`. Defaults to `"contract"`. ~~str~~ | +| `case_sensitive_matching` | Whether to search without case sensitivity. Defaults to `False`. ~~bool~~ | Note that the `single_match` parameter, used in v1 and v2, is not supported anymore, as the CoT parsing algorithm takes care of this automatically. @@ -415,16 +420,19 @@ v1. > examples = null > ``` -| Argument | Description | -| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `labels` | List of labels or str of comma-separated list of labels. ~~Union[List[str], str]~~ | -| `label_definitions` (NEW) | Optional dict mapping a label to a description of that label. These descriptions are added to the prompt to help instruct the LLM on what to extract. Defaults to `None`. ~~Optional[Dict[str, str]]~~ | -| `template` (NEW) | Custom prompt template to send to LLM model. Defaults to [ner.v2.jinja](https://github.com/explosion/spacy-llm/blob/main/spacy_llm/tasks/templates/ner.v2.jinja). ~~str~~ | -| `examples` | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~ | -| `normalizer` | Function that normalizes the labels as returned by the LLM. If `None`, defaults to `spacy.LowercaseNormalizer.v1`. Defaults to `None`. ~~Optional[Callable[[str], str]]~~ | -| `alignment_mode` | Alignment mode in case the LLM returns entities that do not align with token boundaries. Options are `"strict"`, `"contract"` or `"expand"`. Defaults to `"contract"`. ~~str~~ | -| `case_sensitive_matching` | Whether to search without case sensitivity. Defaults to `False`. ~~bool~~ | -| `single_match` | Whether to match an entity in the LLM's response only once (the first hit) or multiple times. Defaults to `False`. ~~bool~~ | +| Argument | Description | +| --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `template` (NEW) | Custom prompt template to send to LLM model. Defaults to [ner.v2.jinja](https://github.com/explosion/spacy-llm/blob/main/spacy_llm/tasks/templates/ner.v2.jinja). ~~str~~ | +| `examples` | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~ | +| `parse_responses` (NEW) | Callable for parsing LLM responses for this task. Defaults to the internal parsing method for this task. ~~Optional[TaskResponseParser[NERTask]]~~ | +| `prompt_example_type` (NEW) | Type to use for fewshot examples. Defaults to `NERExample`. ~~Optional[Type[FewshotExample]]~~ | +| `scorer` (NEW) | Scorer function that evaluates the task performance on provided examples. Defaults to the metric used by spaCy. ~~Optional[Scorer]~~ | +| `labels` | List of labels or str of comma-separated list of labels. ~~Union[List[str], str]~~ | +| `label_definitions` (NEW) | Optional dict mapping a label to a description of that label. These descriptions are added to the prompt to help instruct the LLM on what to extract. Defaults to `None`. ~~Optional[Dict[str, str]]~~ | +| `normalizer` | Function that normalizes the labels as returned by the LLM. If `None`, defaults to `spacy.LowercaseNormalizer.v1`. Defaults to `None`. ~~Optional[Callable[[str], str]]~~ | +| `alignment_mode` | Alignment mode in case the LLM returns entities that do not align with token boundaries. Options are `"strict"`, `"contract"` or `"expand"`. Defaults to `"contract"`. ~~str~~ | +| `case_sensitive_matching` | Whether to search without case sensitivity. Defaults to `False`. ~~bool~~ | +| `single_match` | Whether to match an entity in the LLM's response only once (the first hit) or multiple times. Defaults to `False`. ~~bool~~ | The parameters `alignment_mode`, `case_sensitive_matching` and `single_match` are identical to the [v1](#ner-v1) implementation. The format of few-shot @@ -467,14 +475,17 @@ few-shot prompting. > examples = null > ``` -| Argument | Description | -| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `labels` | Comma-separated list of labels. ~~str~~ | -| `examples` | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~ | -| `normalizer` | Function that normalizes the labels as returned by the LLM. If `None`, defaults to `spacy.LowercaseNormalizer.v1`. ~~Optional[Callable[[str], str]]~~ | -| `alignment_mode` | Alignment mode in case the LLM returns entities that do not align with token boundaries. Options are `"strict"`, `"contract"` or `"expand"`. Defaults to `"contract"`. ~~str~~ | -| `case_sensitive_matching` | Whether to search without case sensitivity. Defaults to `False`. ~~bool~~ | -| `single_match` | Whether to match an entity in the LLM's response only once (the first hit) or multiple times. Defaults to `False`. ~~bool~~ | +| Argument | Description | +| --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `examples` | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~ | +| `parse_responses` (NEW) | Callable for parsing LLM responses for this task. Defaults to the internal parsing method for this task. ~~Optional[TaskResponseParser[NERTask]]~~ | +| `prompt_example_type` (NEW) | Type to use for fewshot examples. Defaults to `NERExample`. ~~Optional[Type[FewshotExample]]~~ | +| `scorer` (NEW) | Scorer function that evaluates the task performance on provided examples. Defaults to the metric used by spaCy. ~~Optional[Scorer]~~ | +| `labels` | Comma-separated list of labels. ~~str~~ | +| `normalizer` | Function that normalizes the labels as returned by the LLM. If `None`, defaults to `spacy.LowercaseNormalizer.v1`. ~~Optional[Callable[[str], str]]~~ | +| `alignment_mode` | Alignment mode in case the LLM returns entities that do not align with token boundaries. Options are `"strict"`, `"contract"` or `"expand"`. Defaults to `"contract"`. ~~str~~ | +| `case_sensitive_matching` | Whether to search without case sensitivity. Defaults to `False`. ~~bool~~ | +| `single_match` | Whether to match an entity in the LLM's response only once (the first hit) or multiple times. Defaults to `False`. ~~bool~~ | The NER task implementation doesn't currently ask the LLM for specific offsets, but simply expects a list of strings that represent the enties in the document. @@ -539,17 +550,20 @@ support overlapping entities and store its annotations in `doc.spans`. > examples = null > ``` -| Argument | Description | -| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `labels` | List of labels or str of comma-separated list of labels. ~~Union[List[str], str]~~ | -| `label_definitions` | Optional dict mapping a label to a description of that label. These descriptions are added to the prompt to help instruct the LLM on what to extract. Defaults to `None`. ~~Optional[Dict[str, str]]~~ | -| `template` | Custom prompt template to send to LLM model. Defaults to [`spancat.v3.jinja`](https://github.com/explosion/spacy-llm/blob/main/spacy_llm/tasks/templates/spancat.v3.jinja). ~~str~~ | -| `description` (NEW) | A description of what to recognize or not recognize as entities. ~~str~~ | -| `spans_key` | Key of the `Doc.spans` dict to save the spans under. Defaults to `"sc"`. ~~str~~ | -| `examples` | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~ | -| `normalizer` | Function that normalizes the labels as returned by the LLM. If `None`, defaults to `spacy.LowercaseNormalizer.v1`. ~~Optional[Callable[[str], str]]~~ | -| `alignment_mode` | Alignment mode in case the LLM returns entities that do not align with token boundaries. Options are `"strict"`, `"contract"` or `"expand"`. Defaults to `"contract"`. ~~str~~ | -| `case_sensitive_matching` | Whether to search without case sensitivity. Defaults to `False`. ~~bool~~ | +| Argument | Description | +| --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `template` | Custom prompt template to send to LLM model. Defaults to [`spancat.v3.jinja`](https://github.com/explosion/spacy-llm/blob/main/spacy_llm/tasks/templates/spancat.v3.jinja). ~~str~~ | +| `examples` | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~ | +| `parse_responses` (NEW) | Callable for parsing LLM responses for this task. Defaults to the internal parsing method for this task. ~~Optional[TaskResponseParser[SpanCatTask]]~~ | +| `prompt_example_type` (NEW) | Type to use for fewshot examples. Defaults to `SpanCatExample`. ~~Optional[Type[FewshotExample]]~~ | +| `scorer` (NEW) | Scorer function that evaluates the task performance on provided examples. Defaults to the metric used by spaCy. ~~Optional[Scorer]~~ | +| `labels` | List of labels or str of comma-separated list of labels. ~~Union[List[str], str]~~ | +| `label_definitions` | Optional dict mapping a label to a description of that label. These descriptions are added to the prompt to help instruct the LLM on what to extract. Defaults to `None`. ~~Optional[Dict[str, str]]~~ | +| `description` (NEW) | A description of what to recognize or not recognize as entities. ~~str~~ | +| `spans_key` | Key of the `Doc.spans` dict to save the spans under. Defaults to `"sc"`. ~~str~~ | +| `normalizer` | Function that normalizes the labels as returned by the LLM. If `None`, defaults to `spacy.LowercaseNormalizer.v1`. ~~Optional[Callable[[str], str]]~~ | +| `alignment_mode` | Alignment mode in case the LLM returns entities that do not align with token boundaries. Options are `"strict"`, `"contract"` or `"expand"`. Defaults to `"contract"`. ~~str~~ | +| `case_sensitive_matching` | Whether to search without case sensitivity. Defaults to `False`. ~~bool~~ | Note that the `single_match` parameter, used in v1 and v2, is not supported anymore, as the CoT parsing algorithm takes care of this automatically. @@ -568,17 +582,20 @@ support overlapping entities and store its annotations in `doc.spans`. > examples = null > ``` -| Argument | Description | -| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `labels` | List of labels or str of comma-separated list of labels. ~~Union[List[str], str]~~ | -| `label_definitions` (NEW) | Optional dict mapping a label to a description of that label. These descriptions are added to the prompt to help instruct the LLM on what to extract. Defaults to `None`. ~~Optional[Dict[str, str]]~~ | -| `template` (NEW) | Custom prompt template to send to LLM model. Defaults to [`spancat.v2.jinja`](https://github.com/explosion/spacy-llm/blob/main/spacy_llm/tasks/templates/spancat.v2.jinja). ~~str~~ | -| `spans_key` | Key of the `Doc.spans` dict to save the spans under. Defaults to `"sc"`. ~~str~~ | -| `examples` | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~ | -| `normalizer` | Function that normalizes the labels as returned by the LLM. If `None`, defaults to `spacy.LowercaseNormalizer.v1`. ~~Optional[Callable[[str], str]]~~ | -| `alignment_mode` | Alignment mode in case the LLM returns entities that do not align with token boundaries. Options are `"strict"`, `"contract"` or `"expand"`. Defaults to `"contract"`. ~~str~~ | -| `case_sensitive_matching` | Whether to search without case sensitivity. Defaults to `False`. ~~bool~~ | -| `single_match` | Whether to match an entity in the LLM's response only once (the first hit) or multiple times. Defaults to `False`. ~~bool~~ | +| Argument | Description | +| --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `template` (NEW) | Custom prompt template to send to LLM model. Defaults to [`spancat.v2.jinja`](https://github.com/explosion/spacy-llm/blob/main/spacy_llm/tasks/templates/spancat.v2.jinja). ~~str~~ | +| `examples` | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~ | +| `parse_responses` (NEW) | Callable for parsing LLM responses for this task. Defaults to the internal parsing method for this task. ~~Optional[TaskResponseParser[SpanCatTask]]~~ | +| `prompt_example_type` (NEW) | Type to use for fewshot examples. Defaults to `SpanCatExample`. ~~Optional[Type[FewshotExample]]~~ | +| `scorer` (NEW) | Scorer function that evaluates the task performance on provided examples. Defaults to the metric used by spaCy. ~~Optional[Scorer]~~ | +| `labels` | List of labels or str of comma-separated list of labels. ~~Union[List[str], str]~~ | +| `label_definitions` (NEW) | Optional dict mapping a label to a description of that label. These descriptions are added to the prompt to help instruct the LLM on what to extract. Defaults to `None`. ~~Optional[Dict[str, str]]~~ | +| `spans_key` | Key of the `Doc.spans` dict to save the spans under. Defaults to `"sc"`. ~~str~~ | +| `normalizer` | Function that normalizes the labels as returned by the LLM. If `None`, defaults to `spacy.LowercaseNormalizer.v1`. ~~Optional[Callable[[str], str]]~~ | +| `alignment_mode` | Alignment mode in case the LLM returns entities that do not align with token boundaries. Options are `"strict"`, `"contract"` or `"expand"`. Defaults to `"contract"`. ~~str~~ | +| `case_sensitive_matching` | Whether to search without case sensitivity. Defaults to `False`. ~~bool~~ | +| `single_match` | Whether to match an entity in the LLM's response only once (the first hit) or multiple times. Defaults to `False`. ~~bool~~ | Except for the `spans_key` parameter, the SpanCat v2 task reuses the configuration from the NER v2 task. Refer to [its documentation](#ner-v2) for @@ -599,15 +616,18 @@ v1 NER task to support overlapping entities and store its annotations in > examples = null > ``` -| Argument | Description | -| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `labels` | Comma-separated list of labels. ~~str~~ | -| `spans_key` | Key of the `Doc.spans` dict to save the spans under. Defaults to `"sc"`. ~~str~~ | -| `examples` | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~ | -| `normalizer` | Function that normalizes the labels as returned by the LLM. If `None`, defaults to `spacy.LowercaseNormalizer.v1`. ~~Optional[Callable[[str], str]]~~ | -| `alignment_mode` | Alignment mode in case the LLM returns entities that do not align with token boundaries. Options are `"strict"`, `"contract"` or `"expand"`. Defaults to `"contract"`. ~~str~~ | -| `case_sensitive_matching` | Whether to search without case sensitivity. Defaults to `False`. ~~bool~~ | -| `single_match` | Whether to match an entity in the LLM's response only once (the first hit) or multiple times. Defaults to `False`. ~~bool~~ | +| Argument | Description | +| --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `examples` | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~ | +| `parse_responses` (NEW) | Callable for parsing LLM responses for this task. Defaults to the internal parsing method for this task. ~~Optional[TaskResponseParser[SpanCatTask]]~~ | +| `prompt_example_type` (NEW) | Type to use for fewshot examples. Defaults to `SpanCatExample`. ~~Optional[Type[FewshotExample]]~~ | +| `scorer` (NEW) | Scorer function that evaluates the task performance on provided examples. Defaults to the metric used by spaCy. ~~Optional[Scorer]~~ | +| `labels` | Comma-separated list of labels. ~~str~~ | +| `spans_key` | Key of the `Doc.spans` dict to save the spans under. Defaults to `"sc"`. ~~str~~ | +| `normalizer` | Function that normalizes the labels as returned by the LLM. If `None`, defaults to `spacy.LowercaseNormalizer.v1`. ~~Optional[Callable[[str], str]]~~ | +| `alignment_mode` | Alignment mode in case the LLM returns entities that do not align with token boundaries. Options are `"strict"`, `"contract"` or `"expand"`. Defaults to `"contract"`. ~~str~~ | +| `case_sensitive_matching` | Whether to search without case sensitivity. Defaults to `False`. ~~bool~~ | +| `single_match` | Whether to match an entity in the LLM's response only once (the first hit) or multiple times. Defaults to `False`. ~~bool~~ | Except for the `spans_key` parameter, the SpanCat v1 task reuses the configuration from the NER v1 task. Refer to [its documentation](#ner-v1) for @@ -636,16 +656,19 @@ prompt. > examples = null > ``` -| Argument | Description | -| ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `labels` | List of labels or str of comma-separated list of labels. ~~Union[List[str], str]~~ | -| `label_definitions` (NEW) | Dictionary of label definitions. Included in the prompt, if set. Defaults to `None`. ~~Optional[Dict[str, str]]~~ | -| `template` | Custom prompt template to send to LLM model. Defaults to [`textcat.v3.jinja`](https://github.com/explosion/spacy-llm/blob/main/spacy_llm/tasks/templates/textcat.v3.jinja). ~~str~~ | -| `examples` | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~ | -| `normalizer` | Function that normalizes the labels as returned by the LLM. If `None`, falls back to `spacy.LowercaseNormalizer.v1`. Defaults to `None`. ~~Optional[Callable[[str], str]]~~ | -| `exclusive_classes` | If set to `True`, only one label per document should be valid. If set to `False`, one document can have multiple labels. Defaults to `False`. ~~bool~~ | -| `allow_none` | When set to `True`, allows the LLM to not return any of the given label. The resulting dict in `doc.cats` will have `0.0` scores for all labels. Defaults to `True`. ~~bool~~ | -| `verbose` | If set to `True`, warnings will be generated when the LLM returns invalid responses. Defaults to `False`. ~~bool~~ | +| Argument | Description | +| --------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `template` | Custom prompt template to send to LLM model. Defaults to [`textcat.v3.jinja`](https://github.com/explosion/spacy-llm/blob/main/spacy_llm/tasks/templates/textcat.v3.jinja). ~~str~~ | +| `examples` | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~ | +| `parse_responses` (NEW) | Callable for parsing LLM responses for this task. Defaults to the internal parsing method for this task. ~~Optional[TaskResponseParser[SpanCatTask]]~~ | +| `prompt_example_type` (NEW) | Type to use for fewshot examples. Defaults to `TextCatExample`. ~~Optional[Type[FewshotExample]]~~ | +| `scorer` (NEW) | Scorer function that evaluates the task performance on provided examples. Defaults to the metric used by spaCy. ~~Optional[Scorer]~~ | +| `labels` | List of labels or str of comma-separated list of labels. ~~Union[List[str], str]~~ | +| `label_definitions` (NEW) | Dictionary of label definitions. Included in the prompt, if set. Defaults to `None`. ~~Optional[Dict[str, str]]~~ | +| `normalizer` | Function that normalizes the labels as returned by the LLM. If `None`, falls back to `spacy.LowercaseNormalizer.v1`. Defaults to `None`. ~~Optional[Callable[[str], str]]~~ | +| `exclusive_classes` | If set to `True`, only one label per document should be valid. If set to `False`, one document can have multiple labels. Defaults to `False`. ~~bool~~ | +| `allow_none` | When set to `True`, allows the LLM to not return any of the given label. The resulting dict in `doc.cats` will have `0.0` scores for all labels. Defaults to `True`. ~~bool~~ | +| `verbose` | If set to `True`, warnings will be generated when the LLM returns invalid responses. Defaults to `False`. ~~bool~~ | The formatting of few-shot examples is the same as those for the [v1](#textcat-v1) implementation. @@ -663,15 +686,18 @@ V2 includes all v1 functionality, with an improved prompt template. > examples = null > ``` -| Argument | Description | -| ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `labels` | List of labels or str of comma-separated list of labels. ~~Union[List[str], str]~~ | -| `template` (NEW) | Custom prompt template to send to LLM model. Defaults to [`textcat.v2.jinja`](https://github.com/explosion/spacy-llm/blob/main/spacy_llm/tasks/templates/textcat.v2.jinja). ~~str~~ | -| `examples` | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~ | -| `normalizer` | Function that normalizes the labels as returned by the LLM. If `None`, falls back to `spacy.LowercaseNormalizer.v1`. ~~Optional[Callable[[str], str]]~~ | -| `exclusive_classes` | If set to `True`, only one label per document should be valid. If set to `False`, one document can have multiple labels. Defaults to `False`. ~~bool~~ | -| `allow_none` | When set to `True`, allows the LLM to not return any of the given label. The resulting dict in `doc.cats` will have `0.0` scores for all labels. Defaults to `True`. ~~bool~~ | -| `verbose` | If set to `True`, warnings will be generated when the LLM returns invalid responses. Defaults to `False`. ~~bool~~ | +| Argument | Description | +| --------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `template` (NEW) | Custom prompt template to send to LLM model. Defaults to [`textcat.v2.jinja`](https://github.com/explosion/spacy-llm/blob/main/spacy_llm/tasks/templates/textcat.v2.jinja). ~~str~~ | +| `examples` | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~ | +| `parse_responses` (NEW) | Callable for parsing LLM responses for this task. Defaults to the internal parsing method for this task. ~~Optional[TaskResponseParser[SpanCatTask]]~~ | +| `prompt_example_type` (NEW) | Type to use for fewshot examples. Defaults to `TextCatExample`. ~~Optional[Type[FewshotExample]]~~ | +| `scorer` (NEW) | Scorer function that evaluates the task performance on provided examples. Defaults to the metric used by spaCy. ~~Optional[Scorer]~~ | +| `labels` | List of labels or str of comma-separated list of labels. ~~Union[List[str], str]~~ | +| `normalizer` | Function that normalizes the labels as returned by the LLM. If `None`, falls back to `spacy.LowercaseNormalizer.v1`. ~~Optional[Callable[[str], str]]~~ | +| `exclusive_classes` | If set to `True`, only one label per document should be valid. If set to `False`, one document can have multiple labels. Defaults to `False`. ~~bool~~ | +| `allow_none` | When set to `True`, allows the LLM to not return any of the given label. The resulting dict in `doc.cats` will have `0.0` scores for all labels. Defaults to `True`. ~~bool~~ | +| `verbose` | If set to `True`, warnings will be generated when the LLM returns invalid responses. Defaults to `False`. ~~bool~~ | The formatting of few-shot examples is the same as those for the [v1](#textcat-v1) implementation. @@ -690,14 +716,17 @@ prompting. > examples = null > ``` -| Argument | Description | -| ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `labels` | Comma-separated list of labels. ~~str~~ | -| `examples` | Optional function that generates examples for few-shot learning. Deafults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~ | -| `normalizer` | Function that normalizes the labels as returned by the LLM. If `None`, falls back to `spacy.LowercaseNormalizer.v1`. ~~Optional[Callable[[str], str]]~~ | -| `exclusive_classes` | If set to `True`, only one label per document should be valid. If set to `False`, one document can have multiple labels. Deafults to `False`. ~~bool~~ | -| `allow_none` | When set to `True`, allows the LLM to not return any of the given label. The resulting dict in `doc.cats` will have `0.0` scores for all labels. Deafults to `True`. ~~bool~~ | -| `verbose` | If set to `True`, warnings will be generated when the LLM returns invalid responses. Deafults to `False`. ~~bool~~ | +| Argument | Description | +| --------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | Optional function that generates examples for few-shot learning. Deafults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~ | +| `parse_responses` (NEW) | Callable for parsing LLM responses for this task. Defaults to the internal parsing method for this task. ~~Optional[TaskResponseParser[SpanCatTask]]~~ | +| `prompt_example_type` (NEW) | Type to use for fewshot examples. Defaults to `TextCatExample`. ~~Optional[Type[FewshotExample]]~~ | +| `scorer` (NEW) | Scorer function that evaluates the task performance on provided examples. Defaults to the metric used by spaCy. ~~Optional[Scorer]~~ | +| `labels` | Comma-separated list of labels. ~~str~~ | +| `normalizer` | Function that normalizes the labels as returned by the LLM. If `None`, falls back to `spacy.LowercaseNormalizer.v1`. ~~Optional[Callable[[str], str]]~~ | +| `exclusive_classes` | If set to `True`, only one label per document should be valid. If set to `False`, one document can have multiple labels. Defaults to `False`. ~~bool~~ | +| `allow_none` | When set to `True`, allows the LLM to not return any of the given label. The resulting dict in `doc.cats` will have `0.0` scores for all labels. Defaults to `True`. ~~bool~~ | +| `verbose` | If set to `True`, warnings will be generated when the LLM returns invalid responses. Defaults to `False`. ~~bool~~ | To perform [few-shot learning](/usage/large-language-models#few-shot-prompts), you can write down a few examples in a separate file, and provide these to be @@ -740,14 +769,17 @@ on an upstream NER component for entities extraction. > labels = ["LivesIn", "Visits"] > ``` -| Argument | Description | -| ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `labels` | List of labels or str of comma-separated list of labels. ~~Union[List[str], str]~~ | -| `template` | Custom prompt template to send to LLM model. Defaults to [`rel.v3.jinja`](https://github.com/explosion/spacy-llm/blob/main/spacy_llm/tasks/templates/rel.v1.jinja). ~~str~~ | -| `label_definitions` | Dictionary providing a description for each relation label. Defaults to `None`. ~~Optional[Dict[str, str]]~~ | -| `examples` | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~ | -| `normalizer` | Function that normalizes the labels as returned by the LLM. If `None`, falls back to `spacy.LowercaseNormalizer.v1`. Defaults to `None`. ~~Optional[Callable[[str], str]]~~ | -| `verbose` | If set to `True`, warnings will be generated when the LLM returns invalid responses. Defaults to `False`. ~~bool~~ | +| Argument | Description | +| --------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `template` | Custom prompt template to send to LLM model. Defaults to [`rel.v3.jinja`](https://github.com/explosion/spacy-llm/blob/main/spacy_llm/tasks/templates/rel.v1.jinja). ~~str~~ | +| `examples` | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~ | +| `parse_responses` (NEW) | Callable for parsing LLM responses for this task. Defaults to the internal parsing method for this task. ~~Optional[TaskResponseParser[RELTask]]~~ | +| `prompt_example_type` (NEW) | Type to use for fewshot examples. Defaults to `RELExample`. ~~Optional[Type[FewshotExample]]~~ | +| `scorer` (NEW) | Scorer function that evaluates the task performance on provided examples. Defaults to the metric used by spaCy. ~~Optional[Scorer]~~ | +| `labels` | List of labels or str of comma-separated list of labels. ~~Union[List[str], str]~~ | +| `label_definitions` | Dictionary providing a description for each relation label. Defaults to `None`. ~~Optional[Dict[str, str]]~~ | +| `normalizer` | Function that normalizes the labels as returned by the LLM. If `None`, falls back to `spacy.LowercaseNormalizer.v1`. Defaults to `None`. ~~Optional[Callable[[str], str]]~~ | +| `verbose` | If set to `True`, warnings will be generated when the LLM returns invalid responses. Defaults to `False`. ~~bool~~ | To perform [few-shot learning](/usage/large-language-models#few-shot-prompts), you can write down a few examples in a separate file, and provide these to be @@ -793,10 +825,13 @@ This task supports both zero-shot and few-shot prompting. > examples = null > ``` -| Argument | Description | -| ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `template` | Custom prompt template to send to LLM model. Defaults to [lemma.v1.jinja](https://github.com/explosion/spacy-llm/blob/main/spacy_llm/tasks/templates/lemma.v1.jinja). ~~str~~ | -| `examples` | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~ | +| Argument | Description | +| --------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `template` | Custom prompt template to send to LLM model. Defaults to [lemma.v1.jinja](https://github.com/explosion/spacy-llm/blob/main/spacy_llm/tasks/templates/lemma.v1.jinja). ~~str~~ | +| `examples` | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~ | +| `parse_responses` (NEW) | Callable for parsing LLM responses for this task. Defaults to the internal parsing method for this task. ~~Optional[TaskResponseParser[LemmaTask]]~~ | +| `prompt_example_type` (NEW) | Type to use for fewshot examples. Defaults to `LemmaExample`. ~~Optional[Type[FewshotExample]]~~ | +| `scorer` (NEW) | Scorer function that evaluates the task performance on provided examples. Defaults to the metric used by spaCy. ~~Optional[Scorer]~~ | The task prompts the LLM to lemmatize the passed text and return the lemmatized version as a list of tokens and their corresponding lemma. E. g. the text @@ -870,11 +905,14 @@ This task supports both zero-shot and few-shot prompting. > examples = null > ``` -| Argument | Description | -| ---------- | ------------------------------------------------------------------------------------------------------------------------------------------ | -| `template` | Custom prompt template to send to LLM model. Defaults to [sentiment.v1.jinja](./spacy_llm/tasks/templates/sentiment.v1.jinja). ~~str~~ | -| `examples` | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~ | -| `field` | Name of extension attribute to store summary in (i. e. the summary will be available in `doc._.{field}`). Defaults to `sentiment`. ~~str~~ | +| Argument | Description | +| --------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `template` | Custom prompt template to send to LLM model. Defaults to [sentiment.v1.jinja](./spacy_llm/tasks/templates/sentiment.v1.jinja). ~~str~~ | +| `examples` | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~ | +| `parse_responses` (NEW) | Callable for parsing LLM responses for this task. Defaults to the internal parsing method for this task. ~~Optional[TaskResponseParser[SentimentTask]]~~ | +| `prompt_example_type` (NEW) | Type to use for fewshot examples. Defaults to `SentimentExample`. ~~Optional[Type[FewshotExample]]~~ | +| `scorer` (NEW) | Scorer function that evaluates the task performance on provided examples. Defaults to the metric used by spaCy. ~~Optional[Scorer]~~ | +| `field` | Name of extension attribute to store summary in (i. e. the summary will be available in `doc._.{field}`). Defaults to `sentiment`. ~~str~~ | To perform [few-shot learning](/usage/large-language-models#few-shot-prompts), you can write down a few examples in a separate file, and provide these to be @@ -1042,6 +1080,21 @@ Currently, these models are provided as part of the core library: | `spacy.StableLM.v1` | Stability AI | `["stablelm-base-alpha-3b", "stablelm-base-alpha-7b", "stablelm-tuned-alpha-3b", "stablelm-tuned-alpha-7b"]` | https://huggingface.co/stabilityai | | `spacy.OpenLLaMA.v1` | OpenLM Research | `["open_llama_3b", "open_llama_7b", "open_llama_7b_v2", "open_llama_13b"]` | https://huggingface.co/openlm-research | + + +Some models available on Hugging Face (HF), such as Llama 2, are _gated models_. +That means that users have to fulfill certain requirements to be allowed access +to these models. In the case of Llama 2 you'll need to request agree to Meta's +Terms of Service while logged in with your HF account. After Meta grants you +permission to use Llama 2, you'll be able to download and use the model. + +This requires that you are logged in with your HF account on your local +machine - check out the HF quick start documentation. In a nutshell, you'll need +to create an access token on HF and log in to HF using your access token, e. g. +with `huggingface-cli login`. + + + Note that Hugging Face will download the model the first time you use it - you can [define the cached directory](https://huggingface.co/docs/huggingface_hub/main/en/guides/manage-cache) From 6e54360a3d068c2b85b45902f8885b8db043372f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 5 Oct 2023 08:50:22 +0200 Subject: [PATCH 08/25] Remove pathy dependency, update docs for cloudpathlib in Weasel (#13035) --- requirements.txt | 1 - setup.cfg | 1 - spacy/cli/_util.py | 4 ---- website/docs/api/cli.mdx | 6 +++--- website/docs/usage/projects.mdx | 6 +++--- 5 files changed, 6 insertions(+), 12 deletions(-) diff --git a/requirements.txt b/requirements.txt index a8ba956a1..3050624f9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,6 @@ wasabi>=0.9.1,<1.2.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 typer>=0.3.0,<0.10.0 -pathy>=0.10.0 smart-open>=5.2.1,<7.0.0 weasel>=0.1.0,<0.4.0 # Third party dependencies diff --git a/setup.cfg b/setup.cfg index 75f2e3a15..ab9e39e0c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -56,7 +56,6 @@ install_requires = weasel>=0.1.0,<0.4.0 # Third-party dependencies typer>=0.3.0,<0.10.0 - pathy>=0.10.0 smart-open>=5.2.1,<7.0.0 tqdm>=4.38.0,<5.0.0 numpy>=1.15.0; python_version < "3.9" diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index bc6c53cd9..fa41e6a08 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -41,10 +41,6 @@ from ..util import ( run_command, ) -if TYPE_CHECKING: - from pathy import FluidPath # noqa: F401 - - SDIST_SUFFIX = ".tar.gz" WHEEL_SUFFIX = "-py3-none-any.whl" diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index 3ec0081c9..51cae960b 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -1544,9 +1544,9 @@ obsolete files is left up to you. Remotes can be defined in the `remotes` section of the [`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses -[`Pathy`](https://github.com/justindujardin/pathy) to communicate with the -remote storages, so you can use any protocol that `Pathy` supports, including -[S3](https://aws.amazon.com/s3/), +[`cloudpathlib`](https://cloudpathlib.drivendata.org) to communicate with the +remote storages, so you can use any protocol that `cloudpathlib` supports, +including [S3](https://aws.amazon.com/s3/), [Google Cloud Storage](https://cloud.google.com/storage), and the local filesystem, although you may need to install extra dependencies to use certain protocols. diff --git a/website/docs/usage/projects.mdx b/website/docs/usage/projects.mdx index f3cca8013..b089a7ab5 100644 --- a/website/docs/usage/projects.mdx +++ b/website/docs/usage/projects.mdx @@ -656,9 +656,9 @@ locally. You can list one or more remotes in the `remotes` section of your [`project.yml`](#project-yml) by mapping a string name to the URL of the storage. Under the hood, spaCy uses -[`Pathy`](https://github.com/justindujardin/pathy) to communicate with the -remote storages, so you can use any protocol that `Pathy` supports, including -[S3](https://aws.amazon.com/s3/), +[`cloudpathlib`](https://cloudpathlib.drivendata.org) to communicate with the +remote storages, so you can use any protocol that `cloudpathlib` supports, +including [S3](https://aws.amazon.com/s3/), [Google Cloud Storage](https://cloud.google.com/storage), and the local filesystem, although you may need to install extra dependencies to use certain protocols. From 1dec138e61f41963096772cd096e1a1e07ae2ce9 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 5 Oct 2023 08:50:41 +0200 Subject: [PATCH 09/25] Update docs w.r.t. PaLM support. (#13018) --- website/docs/api/large-language-models.mdx | 7 +++++++ website/docs/usage/large-language-models.mdx | 5 +++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/website/docs/api/large-language-models.mdx b/website/docs/api/large-language-models.mdx index 845edaa1a..aac4c5108 100644 --- a/website/docs/api/large-language-models.mdx +++ b/website/docs/api/large-language-models.mdx @@ -1022,6 +1022,7 @@ Currently, these models are provided as part of the core library: | `spacy.Claude-1-3.v1` | Anthropic | `["claude-1.3", "claude-1.3-100k"]` | `"claude-1.3"` | `{}` | | `spacy.Claude-instant-1.v1` | Anthropic | `["claude-instant-1", "claude-instant-1-100k"]` | `"claude-instant-1"` | `{}` | | `spacy.Claude-instant-1-1.v1` | Anthropic | `["claude-instant-1.1", "claude-instant-1.1-100k"]` | `"claude-instant-1.1"` | `{}` | +| `spacy.PaLM.v1` | Google | `["chat-bison-001", "text-bison-001"]` | `"text-bison-001"` | `{temperature=0.0}` | To use these models, make sure that you've [set the relevant API](#api-keys) keys as environment variables. @@ -1052,6 +1053,12 @@ For Anthropic: export ANTHROPIC_API_KEY="..." ``` +For PaLM: + +```shell +export PALM_API_KEY="..." +``` + ### Models via HuggingFace {id="models-hf"} These models all take the same parameters: diff --git a/website/docs/usage/large-language-models.mdx b/website/docs/usage/large-language-models.mdx index 86f44f5ae..35117ef57 100644 --- a/website/docs/usage/large-language-models.mdx +++ b/website/docs/usage/large-language-models.mdx @@ -170,8 +170,8 @@ to be `"databricks/dolly-v2-12b"` for better performance. ### Example 3: Create the component directly in Python {id="example-3"} The `llm` component behaves as any other component does, and there are -[task-specific components](/api/large-language-models#config) defined to -help you hit the ground running with a reasonable built-in task implementation. +[task-specific components](/api/large-language-models#config) defined to help +you hit the ground running with a reasonable built-in task implementation. ```python import spacy @@ -484,6 +484,7 @@ provider's documentation. | [`spacy.Claude-1-0.v1`](/api/large-language-models#models-rest) | Anthropic’s `claude-1.0` model family. | | [`spacy.Claude-1-2.v1`](/api/large-language-models#models-rest) | Anthropic’s `claude-1.2` model family. | | [`spacy.Claude-1-3.v1`](/api/large-language-models#models-rest) | Anthropic’s `claude-1.3` model family. | +| [`spacy.PaLM.v1`](/api/large-language-models#models-rest) | Google’s `PaLM` model family. | | [`spacy.Dolly.v1`](/api/large-language-models#models-hf) | Dolly models through HuggingFace. | | [`spacy.Falcon.v1`](/api/large-language-models#models-hf) | Falcon models through HuggingFace. | | [`spacy.Llama2.v1`](/api/large-language-models#models-hf) | Llama2 models through HuggingFace. | From 862f8254e8498fc426a406f56d44b350d830e852 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 5 Oct 2023 13:18:27 +0200 Subject: [PATCH 10/25] Add docs on Azure OpenAI support in `spacy-llm` (#13043) * Add gpt-3.5-turbo-instruct to list of supported OpenAI models. * Update `spacy-llm` task argument docs w.r.t. task refactoring (#12995) * Update task arguments w.r.t. task refactoring in 0.5.0. * Add disclaimer w.r.t. gated models/Llama 2. * Update website/docs/api/large-language-models.mdx * Update website/docs/api/large-language-models.mdx * Update docs w.r.t. PaLM support. (#13018) * Add info on spacy.Azure.v1. * Attempt to fix netlify check fails. * Attempt to fix netlify check fails. * Attempt to fix netlify check fails. * Attempt to fix netlify check fails. * Attempt to fix netlify check fails. * Attempt to fix netlify check fails. * Attempt to fix netlify check fails. * Attempt to fix netlify check fails. * Attempt to fix netlify check fails. * Format. --- website/docs/api/large-language-models.mdx | 85 ++++++++++++-------- website/docs/usage/large-language-models.mdx | 1 + 2 files changed, 53 insertions(+), 33 deletions(-) diff --git a/website/docs/api/large-language-models.mdx b/website/docs/api/large-language-models.mdx index aac4c5108..c5d106e29 100644 --- a/website/docs/api/large-language-models.mdx +++ b/website/docs/api/large-language-models.mdx @@ -990,43 +990,62 @@ provider's API. Currently, these models are provided as part of the core library: -| Model | Provider | Supported names | Default name | Default config | -| ----------------------------- | --------- | ------------------------------------------------------------------------------------------------------------------ | ---------------------- | ------------------------------------ | -| `spacy.GPT-4.v1` | OpenAI | `["gpt-4", "gpt-4-0314", "gpt-4-32k", "gpt-4-32k-0314"]` | `"gpt-4"` | `{}` | -| `spacy.GPT-4.v2` | OpenAI | `["gpt-4", "gpt-4-0314", "gpt-4-32k", "gpt-4-32k-0314"]` | `"gpt-4"` | `{temperature=0.0}` | -| `spacy.GPT-3-5.v1` | OpenAI | `["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-3.5-turbo-0613", "gpt-3.5-turbo-0613-16k", "gpt-3.5-turbo-instruct"]` | `"gpt-3.5-turbo"` | `{}` | -| `spacy.GPT-3-5.v2` | OpenAI | `["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-3.5-turbo-0613", "gpt-3.5-turbo-0613-16k", "gpt-3.5-turbo-instruct"]` | `"gpt-3.5-turbo"` | `{temperature=0.0}` | -| `spacy.Davinci.v1` | OpenAI | `["davinci"]` | `"davinci"` | `{}` | -| `spacy.Davinci.v2` | OpenAI | `["davinci"]` | `"davinci"` | `{temperature=0.0, max_tokens=500}` | -| `spacy.Text-Davinci.v1` | OpenAI | `["text-davinci-003", "text-davinci-002"]` | `"text-davinci-003"` | `{}` | -| `spacy.Text-Davinci.v2` | OpenAI | `["text-davinci-003", "text-davinci-002"]` | `"text-davinci-003"` | `{temperature=0.0, max_tokens=1000}` | -| `spacy.Code-Davinci.v1` | OpenAI | `["code-davinci-002"]` | `"code-davinci-002"` | `{}` | -| `spacy.Code-Davinci.v2` | OpenAI | `["code-davinci-002"]` | `"code-davinci-002"` | `{temperature=0.0, max_tokens=500}` | -| `spacy.Curie.v1` | OpenAI | `["curie"]` | `"curie"` | `{}` | -| `spacy.Curie.v2` | OpenAI | `["curie"]` | `"curie"` | `{temperature=0.0, max_tokens=500}` | -| `spacy.Text-Curie.v1` | OpenAI | `["text-curie-001"]` | `"text-curie-001"` | `{}` | -| `spacy.Text-Curie.v2` | OpenAI | `["text-curie-001"]` | `"text-curie-001"` | `{temperature=0.0, max_tokens=500}` | -| `spacy.Babbage.v1` | OpenAI | `["babbage"]` | `"babbage"` | `{}` | -| `spacy.Babbage.v2` | OpenAI | `["babbage"]` | `"babbage"` | `{temperature=0.0, max_tokens=500}` | -| `spacy.Text-Babbage.v1` | OpenAI | `["text-babbage-001"]` | `"text-babbage-001"` | `{}` | -| `spacy.Text-Babbage.v2` | OpenAI | `["text-babbage-001"]` | `"text-babbage-001"` | `{temperature=0.0, max_tokens=500}` | -| `spacy.Ada.v1` | OpenAI | `["ada"]` | `"ada"` | `{}` | -| `spacy.Ada.v2` | OpenAI | `["ada"]` | `"ada"` | `{temperature=0.0, max_tokens=500}` | -| `spacy.Text-Ada.v1` | OpenAI | `["text-ada-001"]` | `"text-ada-001"` | `{}` | -| `spacy.Text-Ada.v2` | OpenAI | `["text-ada-001"]` | `"text-ada-001"` | `{temperature=0.0, max_tokens=500}` | -| `spacy.Command.v1` | Cohere | `["command", "command-light", "command-light-nightly", "command-nightly"]` | `"command"` | `{}` | -| `spacy.Claude-2.v1` | Anthropic | `["claude-2", "claude-2-100k"]` | `"claude-2"` | `{}` | -| `spacy.Claude-1.v1` | Anthropic | `["claude-1", "claude-1-100k"]` | `"claude-1"` | `{}` | -| `spacy.Claude-1-0.v1` | Anthropic | `["claude-1.0"]` | `"claude-1.0"` | `{}` | -| `spacy.Claude-1-2.v1` | Anthropic | `["claude-1.2"]` | `"claude-1.2"` | `{}` | -| `spacy.Claude-1-3.v1` | Anthropic | `["claude-1.3", "claude-1.3-100k"]` | `"claude-1.3"` | `{}` | -| `spacy.Claude-instant-1.v1` | Anthropic | `["claude-instant-1", "claude-instant-1-100k"]` | `"claude-instant-1"` | `{}` | -| `spacy.Claude-instant-1-1.v1` | Anthropic | `["claude-instant-1.1", "claude-instant-1.1-100k"]` | `"claude-instant-1.1"` | `{}` | -| `spacy.PaLM.v1` | Google | `["chat-bison-001", "text-bison-001"]` | `"text-bison-001"` | `{temperature=0.0}` | +| Model | Provider | Supported names | Default name | Default config | +| ----------------------------- | ----------------- | ------------------------------------------------------------------------------------------------------------------ | ---------------------- | ------------------------------------ | +| `spacy.GPT-4.v1` | OpenAI | `["gpt-4", "gpt-4-0314", "gpt-4-32k", "gpt-4-32k-0314"]` | `"gpt-4"` | `{}` | +| `spacy.GPT-4.v2` | OpenAI | `["gpt-4", "gpt-4-0314", "gpt-4-32k", "gpt-4-32k-0314"]` | `"gpt-4"` | `{temperature=0.0}` | +| `spacy.GPT-3-5.v1` | OpenAI | `["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-3.5-turbo-0613", "gpt-3.5-turbo-0613-16k", "gpt-3.5-turbo-instruct"]` | `"gpt-3.5-turbo"` | `{}` | +| `spacy.GPT-3-5.v2` | OpenAI | `["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-3.5-turbo-0613", "gpt-3.5-turbo-0613-16k", "gpt-3.5-turbo-instruct"]` | `"gpt-3.5-turbo"` | `{temperature=0.0}` | +| `spacy.Davinci.v1` | OpenAI | `["davinci"]` | `"davinci"` | `{}` | +| `spacy.Davinci.v2` | OpenAI | `["davinci"]` | `"davinci"` | `{temperature=0.0, max_tokens=500}` | +| `spacy.Text-Davinci.v1` | OpenAI | `["text-davinci-003", "text-davinci-002"]` | `"text-davinci-003"` | `{}` | +| `spacy.Text-Davinci.v2` | OpenAI | `["text-davinci-003", "text-davinci-002"]` | `"text-davinci-003"` | `{temperature=0.0, max_tokens=1000}` | +| `spacy.Code-Davinci.v1` | OpenAI | `["code-davinci-002"]` | `"code-davinci-002"` | `{}` | +| `spacy.Code-Davinci.v2` | OpenAI | `["code-davinci-002"]` | `"code-davinci-002"` | `{temperature=0.0, max_tokens=500}` | +| `spacy.Curie.v1` | OpenAI | `["curie"]` | `"curie"` | `{}` | +| `spacy.Curie.v2` | OpenAI | `["curie"]` | `"curie"` | `{temperature=0.0, max_tokens=500}` | +| `spacy.Text-Curie.v1` | OpenAI | `["text-curie-001"]` | `"text-curie-001"` | `{}` | +| `spacy.Text-Curie.v2` | OpenAI | `["text-curie-001"]` | `"text-curie-001"` | `{temperature=0.0, max_tokens=500}` | +| `spacy.Babbage.v1` | OpenAI | `["babbage"]` | `"babbage"` | `{}` | +| `spacy.Babbage.v2` | OpenAI | `["babbage"]` | `"babbage"` | `{temperature=0.0, max_tokens=500}` | +| `spacy.Text-Babbage.v1` | OpenAI | `["text-babbage-001"]` | `"text-babbage-001"` | `{}` | +| `spacy.Text-Babbage.v2` | OpenAI | `["text-babbage-001"]` | `"text-babbage-001"` | `{temperature=0.0, max_tokens=500}` | +| `spacy.Ada.v1` | OpenAI | `["ada"]` | `"ada"` | `{}` | +| `spacy.Ada.v2` | OpenAI | `["ada"]` | `"ada"` | `{temperature=0.0, max_tokens=500}` | +| `spacy.Text-Ada.v1` | OpenAI | `["text-ada-001"]` | `"text-ada-001"` | `{}` | +| `spacy.Text-Ada.v2` | OpenAI | `["text-ada-001"]` | `"text-ada-001"` | `{temperature=0.0, max_tokens=500}` | +| `spacy.Azure.v1` | Microsoft, OpenAI | Arbitrary values | No default | `{temperature=0.0}` | +| `spacy.Command.v1` | Cohere | `["command", "command-light", "command-light-nightly", "command-nightly"]` | `"command"` | `{}` | +| `spacy.Claude-2.v1` | Anthropic | `["claude-2", "claude-2-100k"]` | `"claude-2"` | `{}` | +| `spacy.Claude-1.v1` | Anthropic | `["claude-1", "claude-1-100k"]` | `"claude-1"` | `{}` | +| `spacy.Claude-1-0.v1` | Anthropic | `["claude-1.0"]` | `"claude-1.0"` | `{}` | +| `spacy.Claude-1-2.v1` | Anthropic | `["claude-1.2"]` | `"claude-1.2"` | `{}` | +| `spacy.Claude-1-3.v1` | Anthropic | `["claude-1.3", "claude-1.3-100k"]` | `"claude-1.3"` | `{}` | +| `spacy.Claude-instant-1.v1` | Anthropic | `["claude-instant-1", "claude-instant-1-100k"]` | `"claude-instant-1"` | `{}` | +| `spacy.Claude-instant-1-1.v1` | Anthropic | `["claude-instant-1.1", "claude-instant-1.1-100k"]` | `"claude-instant-1.1"` | `{}` | +| `spacy.PaLM.v1` | Google | `["chat-bison-001", "text-bison-001"]` | `"text-bison-001"` | `{temperature=0.0}` | To use these models, make sure that you've [set the relevant API](#api-keys) keys as environment variables. +**⚠️ A note on `spacy.Azure.v1`.** Working with Azure OpenAI is slightly +different than working with models from other providers: + +- In Azure LLMs have to be made available by creating a _deployment_ of a given + model (e. g. GPT-3.5). This deployment can have an arbitrary name. The `name` + argument, which everywhere else denotes the model name (e. g. `claude-1.0`, + `gpt-3.5`), here refers to the _deployment name_. +- Deployed Azure OpenAI models are reachable via a resource-specific base URL, + usually of the form `https://{resource}.openai.azure.com`. Hence the URL has + to be specified via the `base_url` argument. +- Azure further expects the _API version_ to be specified. The default value for + this, via the `api_version` argument, is currently `2023-05-15` but may be + updated in the future. +- Finally, since we can't infer information about the model from the deployment + name, `spacy-llm` requires the `model_type` to be set to either + `"completions"` or `"chat"`, depending on whether the deployed model is a + completion or chat model. + #### API Keys {id="api-keys"} Note that when using hosted services, you have to ensure that the proper API diff --git a/website/docs/usage/large-language-models.mdx b/website/docs/usage/large-language-models.mdx index 35117ef57..875ff33d4 100644 --- a/website/docs/usage/large-language-models.mdx +++ b/website/docs/usage/large-language-models.mdx @@ -476,6 +476,7 @@ provider's documentation. | [`spacy.Curie.v2`](/api/large-language-models#models-rest) | OpenAI’s `curie` model family. | | [`spacy.Babbage.v2`](/api/large-language-models#models-rest) | OpenAI’s `babbage` model family. | | [`spacy.Ada.v2`](/api/large-language-models#models-rest) | OpenAI’s `ada` model family. | +| [`spacy.Azure.v1`](/api/large-language-models#models-rest) | Azure's OpenAI models. | | [`spacy.Command.v1`](/api/large-language-models#models-rest) | Cohere’s `command` model family. | | [`spacy.Claude-2.v1`](/api/large-language-models#models-rest) | Anthropic’s `claude-2` model family. | | [`spacy.Claude-1.v1`](/api/large-language-models#models-rest) | Anthropic’s `claude-1` model family. | From 1162fcf0994dd7f83a744eddd0bdd31bccd5ca29 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 5 Oct 2023 14:44:38 +0200 Subject: [PATCH 11/25] Add Mistral mentions. (#13037) --- website/docs/api/large-language-models.mdx | 3 ++- website/docs/usage/large-language-models.mdx | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/website/docs/api/large-language-models.mdx b/website/docs/api/large-language-models.mdx index c5d106e29..f8404cb2e 100644 --- a/website/docs/api/large-language-models.mdx +++ b/website/docs/api/large-language-models.mdx @@ -1101,8 +1101,9 @@ Currently, these models are provided as part of the core library: | Model | Provider | Supported names | HF directory | | -------------------- | --------------- | ------------------------------------------------------------------------------------------------------------ | -------------------------------------- | | `spacy.Dolly.v1` | Databricks | `["dolly-v2-3b", "dolly-v2-7b", "dolly-v2-12b"]` | https://huggingface.co/databricks | -| `spacy.Llama2.v1` | Meta AI | `["Llama-2-7b-hf", "Llama-2-13b-hf", "Llama-2-70b-hf"]` | https://huggingface.co/meta-llama | | `spacy.Falcon.v1` | TII | `["falcon-rw-1b", "falcon-7b", "falcon-7b-instruct", "falcon-40b-instruct"]` | https://huggingface.co/tiiuae | +| `spacy.Llama2.v1` | Meta AI | `["Llama-2-7b-hf", "Llama-2-13b-hf", "Llama-2-70b-hf"]` | https://huggingface.co/meta-llama | +| `spacy.Mistral.v1` | Mistral AI | `["Mistral-7B-v0.1", "Mistral-7B-Instruct-v0.1"]` | https://huggingface.co/mistralai | | `spacy.StableLM.v1` | Stability AI | `["stablelm-base-alpha-3b", "stablelm-base-alpha-7b", "stablelm-tuned-alpha-3b", "stablelm-tuned-alpha-7b"]` | https://huggingface.co/stabilityai | | `spacy.OpenLLaMA.v1` | OpenLM Research | `["open_llama_3b", "open_llama_7b", "open_llama_7b_v2", "open_llama_13b"]` | https://huggingface.co/openlm-research | diff --git a/website/docs/usage/large-language-models.mdx b/website/docs/usage/large-language-models.mdx index 875ff33d4..94494b4e1 100644 --- a/website/docs/usage/large-language-models.mdx +++ b/website/docs/usage/large-language-models.mdx @@ -436,7 +436,7 @@ respectively. Alternatively you can use LangChain to access hosted or local models by specifying one of the models registered with the `langchain.` prefix. -_Why LangChain if there are also are a native REST and a HuggingFace interface? When should I use what?_ +_Why LangChain if there are also are native REST and HuggingFace interfaces? When should I use what?_ Third-party libraries like `langchain` focus on prompt management, integration of many different LLM APIs, and other related features such as conversational @@ -488,6 +488,7 @@ provider's documentation. | [`spacy.PaLM.v1`](/api/large-language-models#models-rest) | Google’s `PaLM` model family. | | [`spacy.Dolly.v1`](/api/large-language-models#models-hf) | Dolly models through HuggingFace. | | [`spacy.Falcon.v1`](/api/large-language-models#models-hf) | Falcon models through HuggingFace. | +| [`spacy.Mistral.v1`](/api/large-language-models#models-hf) | Mistral models through HuggingFace. | | [`spacy.Llama2.v1`](/api/large-language-models#models-hf) | Llama2 models through HuggingFace. | | [`spacy.StableLM.v1`](/api/large-language-models#models-hf) | StableLM models through HuggingFace. | | [`spacy.OpenLLaMA.v1`](/api/large-language-models#models-hf) | OpenLLaMA models through HuggingFace. | From b83f1e372490acd86c6ddcb7b9f5b10b2d50b4ab Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 6 Oct 2023 14:22:43 +0200 Subject: [PATCH 12/25] Inline displaCy visualizations in docs (#13050) [ci skip] --- .../images/displacy-long2.html | 0 website/docs/usage/101/_named-entities.mdx | 8 +- website/docs/usage/101/_pos-deps.mdx | 7 +- website/docs/usage/linguistic-features.mdx | 15 +- website/docs/usage/rule-based-matching.mdx | 7 +- website/docs/usage/saving-loading.mdx | 8 +- website/docs/usage/v3-3.mdx | 8 +- website/docs/usage/visualizers.mdx | 32 +-- ...-founded.html => displacy-dep-founded.svg} | 0 .../public/images/displacy-ent-custom.html | 80 ------- website/public/images/displacy-ent-snek.html | 59 ----- website/public/images/displacy-ent1.html | 84 ------- website/public/images/displacy-ent2.html | 86 ------- .../{displacy-long.html => displacy-long.svg} | 0 website/public/images/displacy-long2.svg | 212 ++++++++++++++++++ .../public/images/displacy-span-custom.html | 84 ------- website/public/images/displacy-span.html | 123 ---------- website/src/components/embed.js | 18 +- website/src/remark.js | 4 +- website/src/styles/embed.module.sass | 8 + 20 files changed, 273 insertions(+), 570 deletions(-) rename website/{public => docs}/images/displacy-long2.html (100%) rename website/public/images/{displacy-dep-founded.html => displacy-dep-founded.svg} (100%) delete mode 100644 website/public/images/displacy-ent-custom.html delete mode 100644 website/public/images/displacy-ent-snek.html delete mode 100644 website/public/images/displacy-ent1.html delete mode 100644 website/public/images/displacy-ent2.html rename website/public/images/{displacy-long.html => displacy-long.svg} (100%) create mode 100644 website/public/images/displacy-long2.svg delete mode 100644 website/public/images/displacy-span-custom.html delete mode 100644 website/public/images/displacy-span.html diff --git a/website/public/images/displacy-long2.html b/website/docs/images/displacy-long2.html similarity index 100% rename from website/public/images/displacy-long2.html rename to website/docs/images/displacy-long2.html diff --git a/website/docs/usage/101/_named-entities.mdx b/website/docs/usage/101/_named-entities.mdx index 9ae4134d8..da43c0ddd 100644 --- a/website/docs/usage/101/_named-entities.mdx +++ b/website/docs/usage/101/_named-entities.mdx @@ -31,8 +31,6 @@ for ent in doc.ents: Using spaCy's built-in [displaCy visualizer](/usage/visualizers), here's what our example sentence and its named entities look like: -