Merge branch 'explosion:master' into feature/candidate-generation-by-docs

commit 7a4ef51807
Author: Raphael Mitsch
Date:   2022-11-04 12:25:25 +01:00 (committed by GitHub)
16 changed files with 161 additions and 107 deletions

View File

@@ -1,9 +1,7 @@
 parameters:
   python_version: ''
-  architecture: ''
-  prefix: ''
-  gpu: false
-  num_build_jobs: 1
+  architecture: 'x64'
+  num_build_jobs: 2
 
 steps:
   - task: UsePythonVersion@0
@@ -17,16 +15,16 @@ steps:
     displayName: 'Set variables'
 
   - script: |
-      ${{ parameters.prefix }} python -m pip install -U pip setuptools
-      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
+      python -m pip install -U build pip setuptools
+      python -m pip install -U -r requirements.txt
     displayName: "Install dependencies"
 
   - script: |
-      ${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }}
-      ${{ parameters.prefix }} python setup.py sdist --formats=gztar
-    displayName: "Compile and build sdist"
+      python -m build --sdist
+    displayName: "Build sdist"
 
-  - script: python -m mypy spacy
+  - script: |
+      python -m mypy spacy
     displayName: 'Run mypy'
     condition: ne(variables['python_version'], '3.6')
@@ -35,35 +33,24 @@ steps:
       contents: "spacy"
     displayName: "Delete source directory"
 
+  - task: DeleteFiles@1
+    inputs:
+      contents: "*.egg-info"
+    displayName: "Delete egg-info directory"
+
   - script: |
-      ${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt
-      ${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt
+      python -m pip freeze > installed.txt
+      python -m pip uninstall -y -r installed.txt
     displayName: "Uninstall all packages"
 
   - bash: |
-      ${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
-      ${{ parameters.prefix }} SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
+      SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
+      SPACY_NUM_BUILD_JOBS=${{ parameters.num_build_jobs }} python -m pip install dist/$SDIST
     displayName: "Install from sdist"
 
   - script: |
-      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
-    displayName: "Install test requirements"
-
-  - script: |
-      ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
-      ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
-    displayName: "Install GPU requirements"
-    condition: eq(${{ parameters.gpu }}, true)
-
-  - script: |
-      ${{ parameters.prefix }} python -m pytest --pyargs spacy -W error
-    displayName: "Run CPU tests"
-    condition: eq(${{ parameters.gpu }}, false)
-
-  - script: |
-      ${{ parameters.prefix }} python -m pytest --pyargs spacy -W error -p spacy.tests.enable_gpu
-    displayName: "Run GPU tests"
-    condition: eq(${{ parameters.gpu }}, true)
+      python -W error -c "import spacy"
+    displayName: "Test import"
 
   - script: |
       python -m spacy download ca_core_news_sm
@@ -72,6 +59,11 @@ steps:
     displayName: 'Test download CLI'
     condition: eq(variables['python_version'], '3.8')
 
+  - script: |
+      python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+    displayName: 'Test no warnings on load (#11713)'
+    condition: eq(variables['python_version'], '3.8')
+
   - script: |
       python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
     displayName: 'Test convert CLI'
@@ -106,13 +98,22 @@ steps:
     displayName: 'Test assemble CLI vectors warning'
     condition: eq(variables['python_version'], '3.8')
 
+  - script: |
+      python -m pip install -U -r requirements.txt
+    displayName: "Install test requirements"
+
+  - script: |
+      python -m pytest --pyargs spacy -W error
+    displayName: "Run CPU tests"
+
+  - script: |
+      python -m pip install --pre thinc-apple-ops
+      python -m pytest --pyargs spacy
+    displayName: "Run CPU tests with thinc-apple-ops"
+    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
+
   - script: |
       python .github/validate_universe_json.py website/meta/universe.json
     displayName: 'Test website/meta/universe.json'
     condition: eq(variables['python_version'], '3.8')
-
-  - script: |
-      ${{ parameters.prefix }} python -m pip install --pre thinc-apple-ops
-      ${{ parameters.prefix }} python -m pytest --pyargs spacy
-    displayName: "Run CPU tests with thinc-apple-ops"
-    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.10'))

View File

@@ -15,7 +15,7 @@ jobs:
       - uses: actions/checkout@v3
         with:
           ref: ${{ github.head_ref }}
-      - uses: actions/setup-python@v3
+      - uses: actions/setup-python@v4
       - run: pip install black
       - name: Auto-format code if needed
         run: black spacy

View File

@@ -8,14 +8,14 @@ on:
 jobs:
   explosion-bot:
-    runs-on: ubuntu-18.04
+    runs-on: ubuntu-latest
     steps:
       - name: Dump GitHub context
         env:
           GITHUB_CONTEXT: ${{ toJson(github) }}
         run: echo "$GITHUB_CONTEXT"
-      - uses: actions/checkout@v1
-      - uses: actions/setup-python@v1
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
       - name: Install and run explosion-bot
         run: |
           pip install git+https://${{ secrets.EXPLOSIONBOT_TOKEN }}@github.com/explosion/explosion-bot

View File

@@ -14,7 +14,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3
         with:
           ref: ${{ matrix.branch }}
       - name: Get commits from past 24 hours
@@ -23,9 +23,9 @@ jobs:
           today=$(date '+%Y-%m-%d %H:%M:%S')
           yesterday=$(date -d "yesterday" '+%Y-%m-%d %H:%M:%S')
           if git log --after="$yesterday" --before="$today" | grep commit ; then
-            echo "::set-output name=run_tests::true"
+            echo run_tests=true >> $GITHUB_OUTPUT
           else
-            echo "::set-output name=run_tests::false"
+            echo run_tests=false >> $GITHUB_OUTPUT
           fi
       - name: Trigger buildkite build

View File

@@ -17,8 +17,8 @@ jobs:
         run: |
           echo "$GITHUB_CONTEXT"
-      - uses: actions/checkout@v1
-      - uses: actions/setup-python@v1
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
       - name: Install Bernadette app dependency and send an alert
         env:
           SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

View File

@@ -87,13 +87,13 @@ jobs:
         #   python.version: "3.10"
         Python311Linux:
           imageName: 'ubuntu-latest'
-          python.version: '3.11.0-rc.2'
+          python.version: '3.11.0'
         Python311Windows:
           imageName: 'windows-latest'
-          python.version: '3.11.0-rc.2'
+          python.version: '3.11.0'
         Python311Mac:
           imageName: 'macos-latest'
-          python.version: '3.11.0-rc.2'
+          python.version: '3.11.0'
       maxParallel: 4
     pool:
       vmImage: $(imageName)
@@ -101,20 +101,3 @@ jobs:
       - template: .github/azure-steps.yml
         parameters:
           python_version: '$(python.version)'
-          architecture: 'x64'
-
-  # - job: "TestGPU"
-  #   dependsOn: "Validate"
-  #   strategy:
-  #     matrix:
-  #       Python38LinuxX64_GPU:
-  #         python.version: '3.8'
-  #   pool:
-  #     name: "LinuxX64_GPU"
-  #   steps:
-  #     - template: .github/azure-steps.yml
-  #       parameters:
-  #         python_version: '$(python.version)'
-  #         architecture: 'x64'
-  #         gpu: true
-  #         num_build_jobs: 24

View File

@@ -72,7 +72,7 @@ subword_features = true
     "textcat",
     assigns=["doc.cats"],
     default_config={
-        "threshold": 0.5,
+        "threshold": 0.0,
         "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
         "scorer": {"@scorers": "spacy.textcat_scorer.v1"},
     },
@@ -144,7 +144,8 @@ class TextCategorizer(TrainablePipe):
         model (thinc.api.Model): The Thinc Model powering the pipeline component.
         name (str): The component instance name, used to add entries to the
             losses during training.
-        threshold (float): Cutoff to consider a prediction "positive".
+        threshold (float): Unused, not needed for single-label (exclusive
+            classes) classification.
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_cats for the attribute "cats".
@@ -154,7 +155,11 @@ class TextCategorizer(TrainablePipe):
         self.model = model
         self.name = name
         self._rehearsal_model = None
-        cfg = {"labels": [], "threshold": threshold, "positive_label": None}
+        cfg: Dict[str, Any] = {
+            "labels": [],
+            "threshold": threshold,
+            "positive_label": None,
+        }
         self.cfg = dict(cfg)
         self.scorer = scorer

View File

@@ -446,7 +446,7 @@ class Scorer:
         labels (Iterable[str]): The set of possible labels. Defaults to [].
         multi_label (bool): Whether the attribute allows multiple labels.
             Defaults to True. When set to False (exclusive labels), missing
-            gold labels are interpreted as 0.0.
+            gold labels are interpreted as 0.0 and the threshold is set to 0.0.
         positive_label (str): The positive label for a binary task with
             exclusive classes. Defaults to None.
         threshold (float): Cutoff to consider a prediction "positive". Defaults
@@ -471,6 +471,8 @@
         """
         if threshold is None:
             threshold = 0.5 if multi_label else 0.0
+        if not multi_label:
+            threshold = 0.0
         f_per_type = {label: PRFScore() for label in labels}
         auc_per_type = {label: ROCAUCScore() for label in labels}
         labels = set(labels)
@@ -505,20 +507,18 @@
                 # Get the highest-scoring for each.
                 pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
                 gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1])
-                if pred_label == gold_label and pred_score >= threshold:
+                if pred_label == gold_label:
                     f_per_type[pred_label].tp += 1
                 else:
                     f_per_type[gold_label].fn += 1
-                    if pred_score >= threshold:
-                        f_per_type[pred_label].fp += 1
+                    f_per_type[pred_label].fp += 1
             elif gold_cats:
                 gold_label, gold_score = max(gold_cats, key=lambda it: it[1])
                 if gold_score > 0:
                     f_per_type[gold_label].fn += 1
             elif pred_cats:
                 pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
-                if pred_score >= threshold:
-                    f_per_type[pred_label].fp += 1
+                f_per_type[pred_label].fp += 1
         micro_prf = PRFScore()
         for label_prf in f_per_type.values():
             micro_prf.tp += label_prf.tp

View File

@@ -370,3 +370,12 @@ def test_json_to_doc_validation_error(doc):
     doc_json.pop("tokens")
     with pytest.raises(ValueError):
         Doc(doc.vocab).from_json(doc_json, validate=True)
+
+
+def test_to_json_underscore_doc_getters(doc):
+    def get_text_length(doc):
+        return len(doc.text)
+
+    Doc.set_extension("text_length", getter=get_text_length)
+    doc_json = doc.to_json(underscore=["text_length"])
+    assert doc_json["_"]["text_length"] == get_text_length(doc)

View File

@@ -823,10 +823,10 @@ def test_textcat_loss(multi_label: bool, expected_loss: float):
     assert loss == expected_loss
 
 
-def test_textcat_threshold():
+def test_textcat_multilabel_threshold():
     # Ensure the scorer can be called with a different threshold
     nlp = English()
-    nlp.add_pipe("textcat")
+    nlp.add_pipe("textcat_multilabel")
 
     train_examples = []
     for text, annotations in TRAIN_DATA_SINGLE_LABEL:
@@ -849,7 +849,7 @@ def test_textcat_threshold():
     )
     pos_f = scores["cats_score"]
     assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
-    assert pos_f > macro_f
+    assert pos_f >= macro_f
 
 
 def test_textcat_multi_threshold():

View File

@@ -474,3 +474,50 @@ def test_prf_score():
     assert (a.precision, a.recall, a.fscore) == approx(
         (c.precision, c.recall, c.fscore)
     )
+
+
+def test_score_cats(en_tokenizer):
+    text = "some text"
+    gold_doc = en_tokenizer(text)
+    gold_doc.cats = {"POSITIVE": 1.0, "NEGATIVE": 0.0}
+    pred_doc = en_tokenizer(text)
+    pred_doc.cats = {"POSITIVE": 0.75, "NEGATIVE": 0.25}
+    example = Example(pred_doc, gold_doc)
+    # threshold is ignored for multi_label=False
+    scores1 = Scorer.score_cats(
+        [example],
+        "cats",
+        labels=list(gold_doc.cats.keys()),
+        multi_label=False,
+        positive_label="POSITIVE",
+        threshold=0.1,
+    )
+    scores2 = Scorer.score_cats(
+        [example],
+        "cats",
+        labels=list(gold_doc.cats.keys()),
+        multi_label=False,
+        positive_label="POSITIVE",
+        threshold=0.9,
+    )
+    assert scores1["cats_score"] == 1.0
+    assert scores2["cats_score"] == 1.0
+    assert scores1 == scores2
+    # threshold is relevant for multi_label=True
+    scores = Scorer.score_cats(
+        [example],
+        "cats",
+        labels=list(gold_doc.cats.keys()),
+        multi_label=True,
+        threshold=0.9,
+    )
+    assert scores["cats_macro_f"] == 0.0
+    # threshold is relevant for multi_label=True
+    scores = Scorer.score_cats(
+        [example],
+        "cats",
+        labels=list(gold_doc.cats.keys()),
+        multi_label=True,
+        threshold=0.1,
+    )
+    assert scores["cats_macro_f"] == 0.5

View File

@@ -1668,6 +1668,20 @@ cdef class Doc:
         if underscore:
             user_keys = set()
+            # Handle doc attributes with .get to include values from getters
+            # and not only values stored in user_data, for backwards
+            # compatibility
+            for attr in underscore:
+                if self.has_extension(attr):
+                    if "_" not in data:
+                        data["_"] = {}
+                    value = self._.get(attr)
+                    if not srsly.is_json_serializable(value):
+                        raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
+                    data["_"][attr] = value
+                    user_keys.add(attr)
+            # Token and span attributes only include values stored in user_data
+            # and not values generated by getters
             if self.user_data:
                 for data_key, value in self.user_data.copy().items():
                     if type(data_key) == tuple and len(data_key) >= 4 and data_key[0] == "._.":
@@ -1678,20 +1692,15 @@ cdef class Doc:
                             user_keys.add(attr)
                             if not srsly.is_json_serializable(value):
                                 raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
-                            # Check if doc attribute
-                            if start is None:
-                                if "_" not in data:
-                                    data["_"] = {}
-                                data["_"][attr] = value
-                            # Check if token attribute
-                            elif end is None:
+                            # Token attribute
+                            if start is not None and end is None:
                                 if "underscore_token" not in data:
                                     data["underscore_token"] = {}
                                 if attr not in data["underscore_token"]:
                                     data["underscore_token"][attr] = []
                                 data["underscore_token"][attr].append({"start": start, "value": value})
-                            # Else span attribute
-                            else:
+                            # Span attribute
+                            elif start is not None and end is not None:
                                 if "underscore_span" not in data:
                                     data["underscore_span"] = {}
                                 if attr not in data["underscore_span"]:

View File

@@ -443,9 +443,9 @@ def load_model_from_package(
     name: str,
     *,
     vocab: Union["Vocab", bool] = True,
-    disable: Union[str, Iterable[str]] = SimpleFrozenList(),
-    enable: Union[str, Iterable[str]] = SimpleFrozenList(),
-    exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
+    disable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
+    enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
+    exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
     config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
 ) -> "Language":
     """Load a model from an installed package.
@@ -619,9 +619,9 @@ def load_model_from_init_py(
     init_file: Union[Path, str],
     *,
     vocab: Union["Vocab", bool] = True,
-    disable: Union[str, Iterable[str]] = SimpleFrozenList(),
-    enable: Union[str, Iterable[str]] = SimpleFrozenList(),
-    exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
+    disable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
+    enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
+    exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
     config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
 ) -> "Language":
     """Helper function to use in the `load()` method of a model package's

View File

@@ -229,16 +229,17 @@ The reported `{attr}_score` depends on the classification properties:
 > print(scores["cats_macro_auc"])
 > ```
 
 | Name | Description |
-| ---------------- | -------------------------------------------------------------------------------------------- |
+| ---------------- | ------------------------------------------------------------------------------------------------------------------------ |
 | `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
 | `attr` | The attribute to score. ~~str~~ |
 | _keyword-only_ | |
 | `getter` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. ~~Callable[[Doc, str], Dict[str, float]]~~ |
 | labels | The set of possible labels. Defaults to `[]`. ~~Iterable[str]~~ |
-| `multi_label` | Whether the attribute allows multiple labels. Defaults to `True`. ~~bool~~ |
+| `multi_label` | Whether the attribute allows multiple labels. Defaults to `True`. When set to `False` (exclusive labels), missing gold labels are interpreted as `0.0` and the threshold is set to `0.0`. ~~bool~~ |
 | `positive_label` | The positive label for a binary task with exclusive classes. Defaults to `None`. ~~Optional[str]~~ |
+| `threshold` | Cutoff to consider a prediction "positive". Defaults to `0.5` for multi-label, and `0.0` (i.e. whatever's highest scoring) otherwise. ~~float~~ |
 | **RETURNS** | A dictionary containing the scores, with inapplicable scores as `None`. ~~Dict[str, Optional[float]]~~ |
 
 ## Scorer.score_links {#score_links tag="staticmethod" new="3"}

View File

@@ -63,7 +63,6 @@ architectures and their arguments and hyperparameters.
 > ```python
 > from spacy.pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL
 > config = {
->     "threshold": 0.5,
 >     "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
 > }
 > nlp.add_pipe("textcat", config=config)
@@ -82,7 +81,7 @@ architectures and their arguments and hyperparameters.
 | Setting | Description |
 | ----------- | ----------- |
-| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ |
+| `threshold` | Cutoff to consider a prediction "positive", relevant for `textcat_multilabel` when calculating accuracy scores. ~~float~~ |
 | `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ |
 | `scorer` | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ |
@@ -123,7 +122,7 @@ shortcut for this and instantiate the component using its string name and
 | `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
 | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
 | _keyword-only_ | |
-| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ |
+| `threshold` | Cutoff to consider a prediction "positive", relevant for `textcat_multilabel` when calculating accuracy scores. ~~float~~ |
 | `scorer` | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ |
 
 ## TextCategorizer.\_\_call\_\_ {#call tag="method"}

View File

@@ -1792,7 +1792,7 @@ the entity `Span` for example `._.orgs` or `._.prev_orgs` and
 > [`Doc.retokenize`](/api/doc#retokenize) context manager:
 >
 > ```python
-> with doc.retokenize() as retokenize:
+> with doc.retokenize() as retokenizer:
 >     for ent in doc.ents:
 >         retokenizer.merge(ent)
 > ```