mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	Merge branch 'explosion:master' into feature/candidate-generation-by-docs
This commit is contained in:
		
						commit
						7a4ef51807
					
				
							
								
								
									
										75
									
								
								.github/azure-steps.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										75
									
								
								.github/azure-steps.yml
									
									
									
									
										vendored
									
									
								
							| 
						 | 
				
			
			@ -1,9 +1,7 @@
 | 
			
		|||
parameters:
 | 
			
		||||
  python_version: ''
 | 
			
		||||
  architecture: ''
 | 
			
		||||
  prefix: ''
 | 
			
		||||
  gpu: false
 | 
			
		||||
  num_build_jobs: 1
 | 
			
		||||
  architecture: 'x64'
 | 
			
		||||
  num_build_jobs: 2
 | 
			
		||||
 | 
			
		||||
steps:
 | 
			
		||||
  - task: UsePythonVersion@0
 | 
			
		||||
| 
						 | 
				
			
			@ -17,16 +15,16 @@ steps:
 | 
			
		|||
    displayName: 'Set variables'
 | 
			
		||||
 | 
			
		||||
  - script: |
 | 
			
		||||
      ${{ parameters.prefix }} python -m pip install -U pip setuptools
 | 
			
		||||
      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
 | 
			
		||||
      python -m pip install -U build pip setuptools
 | 
			
		||||
      python -m pip install -U -r requirements.txt
 | 
			
		||||
    displayName: "Install dependencies"
 | 
			
		||||
 | 
			
		||||
  - script: |
 | 
			
		||||
      ${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }}
 | 
			
		||||
      ${{ parameters.prefix }} python setup.py sdist --formats=gztar
 | 
			
		||||
    displayName: "Compile and build sdist"
 | 
			
		||||
      python -m build --sdist
 | 
			
		||||
    displayName: "Build sdist"
 | 
			
		||||
 | 
			
		||||
  - script: python -m mypy spacy
 | 
			
		||||
  - script: |
 | 
			
		||||
      python -m mypy spacy
 | 
			
		||||
    displayName: 'Run mypy'
 | 
			
		||||
    condition: ne(variables['python_version'], '3.6')
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -35,35 +33,24 @@ steps:
 | 
			
		|||
      contents: "spacy"
 | 
			
		||||
    displayName: "Delete source directory"
 | 
			
		||||
 | 
			
		||||
  - task: DeleteFiles@1
 | 
			
		||||
    inputs:
 | 
			
		||||
      contents: "*.egg-info"
 | 
			
		||||
    displayName: "Delete egg-info directory"
 | 
			
		||||
 | 
			
		||||
  - script: |
 | 
			
		||||
      ${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt
 | 
			
		||||
      ${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt
 | 
			
		||||
      python -m pip freeze > installed.txt
 | 
			
		||||
      python -m pip uninstall -y -r installed.txt
 | 
			
		||||
    displayName: "Uninstall all packages"
 | 
			
		||||
 | 
			
		||||
  - bash: |
 | 
			
		||||
      ${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
 | 
			
		||||
      ${{ parameters.prefix }} SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
 | 
			
		||||
      SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
 | 
			
		||||
      SPACY_NUM_BUILD_JOBS=${{ parameters.num_build_jobs }} python -m pip install dist/$SDIST
 | 
			
		||||
    displayName: "Install from sdist"
 | 
			
		||||
 | 
			
		||||
  - script: |
 | 
			
		||||
      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
 | 
			
		||||
    displayName: "Install test requirements"
 | 
			
		||||
 | 
			
		||||
  - script: |
 | 
			
		||||
      ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
 | 
			
		||||
      ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
 | 
			
		||||
    displayName: "Install GPU requirements"
 | 
			
		||||
    condition: eq(${{ parameters.gpu }}, true)
 | 
			
		||||
 | 
			
		||||
  - script: |
 | 
			
		||||
      ${{ parameters.prefix }} python -m pytest --pyargs spacy -W error
 | 
			
		||||
    displayName: "Run CPU tests"
 | 
			
		||||
    condition: eq(${{ parameters.gpu }}, false)
 | 
			
		||||
 | 
			
		||||
  - script: |
 | 
			
		||||
      ${{ parameters.prefix }} python -m pytest --pyargs spacy -W error -p spacy.tests.enable_gpu
 | 
			
		||||
    displayName: "Run GPU tests"
 | 
			
		||||
    condition: eq(${{ parameters.gpu }}, true)
 | 
			
		||||
      python -W error -c "import spacy"
 | 
			
		||||
    displayName: "Test import"
 | 
			
		||||
 | 
			
		||||
  - script: |
 | 
			
		||||
      python -m spacy download ca_core_news_sm
 | 
			
		||||
| 
						 | 
				
			
			@ -72,6 +59,11 @@ steps:
 | 
			
		|||
    displayName: 'Test download CLI'
 | 
			
		||||
    condition: eq(variables['python_version'], '3.8')
 | 
			
		||||
 | 
			
		||||
  - script: |
 | 
			
		||||
      python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
 | 
			
		||||
    displayName: 'Test no warnings on load (#11713)'
 | 
			
		||||
    condition: eq(variables['python_version'], '3.8')
 | 
			
		||||
 | 
			
		||||
  - script: |
 | 
			
		||||
      python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
 | 
			
		||||
    displayName: 'Test convert CLI'
 | 
			
		||||
| 
						 | 
				
			
			@ -106,13 +98,22 @@ steps:
 | 
			
		|||
    displayName: 'Test assemble CLI vectors warning'
 | 
			
		||||
    condition: eq(variables['python_version'], '3.8')
 | 
			
		||||
 | 
			
		||||
  - script: |
 | 
			
		||||
      python -m pip install -U -r requirements.txt
 | 
			
		||||
    displayName: "Install test requirements"
 | 
			
		||||
 | 
			
		||||
  - script: |
 | 
			
		||||
      python -m pytest --pyargs spacy -W error
 | 
			
		||||
    displayName: "Run CPU tests"
 | 
			
		||||
 | 
			
		||||
  - script: |
 | 
			
		||||
      python -m pip install --pre thinc-apple-ops
 | 
			
		||||
      python -m pytest --pyargs spacy
 | 
			
		||||
    displayName: "Run CPU tests with thinc-apple-ops"
 | 
			
		||||
    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
 | 
			
		||||
 | 
			
		||||
  - script: |
 | 
			
		||||
      python .github/validate_universe_json.py website/meta/universe.json
 | 
			
		||||
    displayName: 'Test website/meta/universe.json'
 | 
			
		||||
    condition: eq(variables['python_version'], '3.8')
 | 
			
		||||
 | 
			
		||||
  - script: |
 | 
			
		||||
      ${{ parameters.prefix }} python -m pip install --pre thinc-apple-ops
 | 
			
		||||
      ${{ parameters.prefix }} python -m pytest --pyargs spacy
 | 
			
		||||
    displayName: "Run CPU tests with thinc-apple-ops"
 | 
			
		||||
    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.10'))
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										2
									
								
								.github/workflows/autoblack.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/workflows/autoblack.yml
									
									
									
									
										vendored
									
									
								
							| 
						 | 
				
			
			@ -15,7 +15,7 @@ jobs:
 | 
			
		|||
      - uses: actions/checkout@v3
 | 
			
		||||
        with:
 | 
			
		||||
            ref: ${{ github.head_ref }}
 | 
			
		||||
      - uses: actions/setup-python@v3
 | 
			
		||||
      - uses: actions/setup-python@v4
 | 
			
		||||
      - run: pip install black
 | 
			
		||||
      - name: Auto-format code if needed
 | 
			
		||||
        run: black spacy
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										6
									
								
								.github/workflows/explosionbot.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										6
									
								
								.github/workflows/explosionbot.yml
									
									
									
									
										vendored
									
									
								
							| 
						 | 
				
			
			@ -8,14 +8,14 @@ on:
 | 
			
		|||
 | 
			
		||||
jobs:
 | 
			
		||||
  explosion-bot:
 | 
			
		||||
    runs-on: ubuntu-18.04
 | 
			
		||||
    runs-on: ubuntu-latest
 | 
			
		||||
    steps:
 | 
			
		||||
      - name: Dump GitHub context
 | 
			
		||||
        env:
 | 
			
		||||
          GITHUB_CONTEXT: ${{ toJson(github) }}
 | 
			
		||||
        run: echo "$GITHUB_CONTEXT"
 | 
			
		||||
      - uses: actions/checkout@v1
 | 
			
		||||
      - uses: actions/setup-python@v1
 | 
			
		||||
      - uses: actions/checkout@v3
 | 
			
		||||
      - uses: actions/setup-python@v4
 | 
			
		||||
      - name: Install and run explosion-bot
 | 
			
		||||
        run: |
 | 
			
		||||
          pip install git+https://${{ secrets.EXPLOSIONBOT_TOKEN }}@github.com/explosion/explosion-bot
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										6
									
								
								.github/workflows/slowtests.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										6
									
								
								.github/workflows/slowtests.yml
									
									
									
									
										vendored
									
									
								
							| 
						 | 
				
			
			@ -14,7 +14,7 @@ jobs:
 | 
			
		|||
    runs-on: ubuntu-latest
 | 
			
		||||
    steps:
 | 
			
		||||
      - name: Checkout
 | 
			
		||||
        uses: actions/checkout@v1
 | 
			
		||||
        uses: actions/checkout@v3
 | 
			
		||||
        with:
 | 
			
		||||
          ref: ${{ matrix.branch }}
 | 
			
		||||
      - name: Get commits from past 24 hours
 | 
			
		||||
| 
						 | 
				
			
			@ -23,9 +23,9 @@ jobs:
 | 
			
		|||
          today=$(date '+%Y-%m-%d %H:%M:%S')
 | 
			
		||||
          yesterday=$(date -d "yesterday" '+%Y-%m-%d %H:%M:%S')
 | 
			
		||||
          if git log --after="$yesterday" --before="$today" | grep commit ; then
 | 
			
		||||
            echo "::set-output name=run_tests::true"
 | 
			
		||||
            echo run_tests=true >> $GITHUB_OUTPUT
 | 
			
		||||
          else
 | 
			
		||||
            echo "::set-output name=run_tests::false"
 | 
			
		||||
            echo run_tests=false >> $GITHUB_OUTPUT
 | 
			
		||||
          fi
 | 
			
		||||
 | 
			
		||||
      - name: Trigger buildkite build
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										4
									
								
								.github/workflows/spacy_universe_alert.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										4
									
								
								.github/workflows/spacy_universe_alert.yml
									
									
									
									
										vendored
									
									
								
							| 
						 | 
				
			
			@ -17,8 +17,8 @@ jobs:
 | 
			
		|||
        run: |
 | 
			
		||||
          echo "$GITHUB_CONTEXT"
 | 
			
		||||
 | 
			
		||||
      - uses: actions/checkout@v1
 | 
			
		||||
      - uses: actions/setup-python@v1
 | 
			
		||||
      - uses: actions/checkout@v3
 | 
			
		||||
      - uses: actions/setup-python@v4
 | 
			
		||||
      - name: Install Bernadette app dependency and send an alert
 | 
			
		||||
        env:
 | 
			
		||||
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -87,13 +87,13 @@ jobs:
 | 
			
		|||
        #          python.version: "3.10"
 | 
			
		||||
        Python311Linux:
 | 
			
		||||
          imageName: 'ubuntu-latest'
 | 
			
		||||
          python.version: '3.11.0-rc.2'
 | 
			
		||||
          python.version: '3.11.0'
 | 
			
		||||
        Python311Windows:
 | 
			
		||||
          imageName: 'windows-latest'
 | 
			
		||||
          python.version: '3.11.0-rc.2'
 | 
			
		||||
          python.version: '3.11.0'
 | 
			
		||||
        Python311Mac:
 | 
			
		||||
          imageName: 'macos-latest'
 | 
			
		||||
          python.version: '3.11.0-rc.2'
 | 
			
		||||
          python.version: '3.11.0'
 | 
			
		||||
      maxParallel: 4
 | 
			
		||||
    pool:
 | 
			
		||||
      vmImage: $(imageName)
 | 
			
		||||
| 
						 | 
				
			
			@ -101,20 +101,3 @@ jobs:
 | 
			
		|||
      - template: .github/azure-steps.yml
 | 
			
		||||
        parameters:
 | 
			
		||||
          python_version: '$(python.version)'
 | 
			
		||||
          architecture: 'x64'
 | 
			
		||||
 | 
			
		||||
#  - job: "TestGPU"
 | 
			
		||||
#    dependsOn: "Validate"
 | 
			
		||||
#    strategy:
 | 
			
		||||
#      matrix:
 | 
			
		||||
#        Python38LinuxX64_GPU:
 | 
			
		||||
#          python.version: '3.8'
 | 
			
		||||
#    pool:
 | 
			
		||||
#      name: "LinuxX64_GPU"
 | 
			
		||||
#    steps:
 | 
			
		||||
#      - template: .github/azure-steps.yml
 | 
			
		||||
#        parameters:
 | 
			
		||||
#          python_version: '$(python.version)'
 | 
			
		||||
#          architecture: 'x64'
 | 
			
		||||
#          gpu: true
 | 
			
		||||
#          num_build_jobs: 24
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -72,7 +72,7 @@ subword_features = true
 | 
			
		|||
    "textcat",
 | 
			
		||||
    assigns=["doc.cats"],
 | 
			
		||||
    default_config={
 | 
			
		||||
        "threshold": 0.5,
 | 
			
		||||
        "threshold": 0.0,
 | 
			
		||||
        "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
 | 
			
		||||
        "scorer": {"@scorers": "spacy.textcat_scorer.v1"},
 | 
			
		||||
    },
 | 
			
		||||
| 
						 | 
				
			
			@ -144,7 +144,8 @@ class TextCategorizer(TrainablePipe):
 | 
			
		|||
        model (thinc.api.Model): The Thinc Model powering the pipeline component.
 | 
			
		||||
        name (str): The component instance name, used to add entries to the
 | 
			
		||||
            losses during training.
 | 
			
		||||
        threshold (float): Cutoff to consider a prediction "positive".
 | 
			
		||||
        threshold (float): Unused, not needed for single-label (exclusive
 | 
			
		||||
            classes) classification.
 | 
			
		||||
        scorer (Optional[Callable]): The scoring method. Defaults to
 | 
			
		||||
                Scorer.score_cats for the attribute "cats".
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -154,7 +155,11 @@ class TextCategorizer(TrainablePipe):
 | 
			
		|||
        self.model = model
 | 
			
		||||
        self.name = name
 | 
			
		||||
        self._rehearsal_model = None
 | 
			
		||||
        cfg = {"labels": [], "threshold": threshold, "positive_label": None}
 | 
			
		||||
        cfg: Dict[str, Any] = {
 | 
			
		||||
            "labels": [],
 | 
			
		||||
            "threshold": threshold,
 | 
			
		||||
            "positive_label": None,
 | 
			
		||||
        }
 | 
			
		||||
        self.cfg = dict(cfg)
 | 
			
		||||
        self.scorer = scorer
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -446,7 +446,7 @@ class Scorer:
 | 
			
		|||
        labels (Iterable[str]): The set of possible labels. Defaults to [].
 | 
			
		||||
        multi_label (bool): Whether the attribute allows multiple labels.
 | 
			
		||||
            Defaults to True. When set to False (exclusive labels), missing
 | 
			
		||||
            gold labels are interpreted as 0.0.
 | 
			
		||||
            gold labels are interpreted as 0.0 and the threshold is set to 0.0.
 | 
			
		||||
        positive_label (str): The positive label for a binary task with
 | 
			
		||||
            exclusive classes. Defaults to None.
 | 
			
		||||
        threshold (float): Cutoff to consider a prediction "positive". Defaults
 | 
			
		||||
| 
						 | 
				
			
			@ -471,6 +471,8 @@ class Scorer:
 | 
			
		|||
        """
 | 
			
		||||
        if threshold is None:
 | 
			
		||||
            threshold = 0.5 if multi_label else 0.0
 | 
			
		||||
        if not multi_label:
 | 
			
		||||
            threshold = 0.0
 | 
			
		||||
        f_per_type = {label: PRFScore() for label in labels}
 | 
			
		||||
        auc_per_type = {label: ROCAUCScore() for label in labels}
 | 
			
		||||
        labels = set(labels)
 | 
			
		||||
| 
						 | 
				
			
			@ -505,20 +507,18 @@ class Scorer:
 | 
			
		|||
                # Get the highest-scoring for each.
 | 
			
		||||
                pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
 | 
			
		||||
                gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1])
 | 
			
		||||
                if pred_label == gold_label and pred_score >= threshold:
 | 
			
		||||
                if pred_label == gold_label:
 | 
			
		||||
                    f_per_type[pred_label].tp += 1
 | 
			
		||||
                else:
 | 
			
		||||
                    f_per_type[gold_label].fn += 1
 | 
			
		||||
                    if pred_score >= threshold:
 | 
			
		||||
                        f_per_type[pred_label].fp += 1
 | 
			
		||||
                    f_per_type[pred_label].fp += 1
 | 
			
		||||
            elif gold_cats:
 | 
			
		||||
                gold_label, gold_score = max(gold_cats, key=lambda it: it[1])
 | 
			
		||||
                if gold_score > 0:
 | 
			
		||||
                    f_per_type[gold_label].fn += 1
 | 
			
		||||
            elif pred_cats:
 | 
			
		||||
                pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
 | 
			
		||||
                if pred_score >= threshold:
 | 
			
		||||
                    f_per_type[pred_label].fp += 1
 | 
			
		||||
                f_per_type[pred_label].fp += 1
 | 
			
		||||
        micro_prf = PRFScore()
 | 
			
		||||
        for label_prf in f_per_type.values():
 | 
			
		||||
            micro_prf.tp += label_prf.tp
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -370,3 +370,12 @@ def test_json_to_doc_validation_error(doc):
 | 
			
		|||
    doc_json.pop("tokens")
 | 
			
		||||
    with pytest.raises(ValueError):
 | 
			
		||||
        Doc(doc.vocab).from_json(doc_json, validate=True)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_to_json_underscore_doc_getters(doc):
 | 
			
		||||
    def get_text_length(doc):
 | 
			
		||||
        return len(doc.text)
 | 
			
		||||
 | 
			
		||||
    Doc.set_extension("text_length", getter=get_text_length)
 | 
			
		||||
    doc_json = doc.to_json(underscore=["text_length"])
 | 
			
		||||
    assert doc_json["_"]["text_length"] == get_text_length(doc)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -823,10 +823,10 @@ def test_textcat_loss(multi_label: bool, expected_loss: float):
 | 
			
		|||
    assert loss == expected_loss
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_textcat_threshold():
 | 
			
		||||
def test_textcat_multilabel_threshold():
 | 
			
		||||
    # Ensure the scorer can be called with a different threshold
 | 
			
		||||
    nlp = English()
 | 
			
		||||
    nlp.add_pipe("textcat")
 | 
			
		||||
    nlp.add_pipe("textcat_multilabel")
 | 
			
		||||
 | 
			
		||||
    train_examples = []
 | 
			
		||||
    for text, annotations in TRAIN_DATA_SINGLE_LABEL:
 | 
			
		||||
| 
						 | 
				
			
			@ -849,7 +849,7 @@ def test_textcat_threshold():
 | 
			
		|||
    )
 | 
			
		||||
    pos_f = scores["cats_score"]
 | 
			
		||||
    assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
 | 
			
		||||
    assert pos_f > macro_f
 | 
			
		||||
    assert pos_f >= macro_f
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_textcat_multi_threshold():
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -474,3 +474,50 @@ def test_prf_score():
 | 
			
		|||
    assert (a.precision, a.recall, a.fscore) == approx(
 | 
			
		||||
        (c.precision, c.recall, c.fscore)
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_score_cats(en_tokenizer):
 | 
			
		||||
    text = "some text"
 | 
			
		||||
    gold_doc = en_tokenizer(text)
 | 
			
		||||
    gold_doc.cats = {"POSITIVE": 1.0, "NEGATIVE": 0.0}
 | 
			
		||||
    pred_doc = en_tokenizer(text)
 | 
			
		||||
    pred_doc.cats = {"POSITIVE": 0.75, "NEGATIVE": 0.25}
 | 
			
		||||
    example = Example(pred_doc, gold_doc)
 | 
			
		||||
    # threshold is ignored for multi_label=False
 | 
			
		||||
    scores1 = Scorer.score_cats(
 | 
			
		||||
        [example],
 | 
			
		||||
        "cats",
 | 
			
		||||
        labels=list(gold_doc.cats.keys()),
 | 
			
		||||
        multi_label=False,
 | 
			
		||||
        positive_label="POSITIVE",
 | 
			
		||||
        threshold=0.1,
 | 
			
		||||
    )
 | 
			
		||||
    scores2 = Scorer.score_cats(
 | 
			
		||||
        [example],
 | 
			
		||||
        "cats",
 | 
			
		||||
        labels=list(gold_doc.cats.keys()),
 | 
			
		||||
        multi_label=False,
 | 
			
		||||
        positive_label="POSITIVE",
 | 
			
		||||
        threshold=0.9,
 | 
			
		||||
    )
 | 
			
		||||
    assert scores1["cats_score"] == 1.0
 | 
			
		||||
    assert scores2["cats_score"] == 1.0
 | 
			
		||||
    assert scores1 == scores2
 | 
			
		||||
    # threshold is relevant for multi_label=True
 | 
			
		||||
    scores = Scorer.score_cats(
 | 
			
		||||
        [example],
 | 
			
		||||
        "cats",
 | 
			
		||||
        labels=list(gold_doc.cats.keys()),
 | 
			
		||||
        multi_label=True,
 | 
			
		||||
        threshold=0.9,
 | 
			
		||||
    )
 | 
			
		||||
    assert scores["cats_macro_f"] == 0.0
 | 
			
		||||
    # threshold is relevant for multi_label=True
 | 
			
		||||
    scores = Scorer.score_cats(
 | 
			
		||||
        [example],
 | 
			
		||||
        "cats",
 | 
			
		||||
        labels=list(gold_doc.cats.keys()),
 | 
			
		||||
        multi_label=True,
 | 
			
		||||
        threshold=0.1,
 | 
			
		||||
    )
 | 
			
		||||
    assert scores["cats_macro_f"] == 0.5
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1668,6 +1668,20 @@ cdef class Doc:
 | 
			
		|||
 | 
			
		||||
        if underscore:
 | 
			
		||||
            user_keys = set()
 | 
			
		||||
            # Handle doc attributes with .get to include values from getters
 | 
			
		||||
            # and not only values stored in user_data, for backwards
 | 
			
		||||
            # compatibility
 | 
			
		||||
            for attr in underscore:
 | 
			
		||||
                if self.has_extension(attr):
 | 
			
		||||
                    if "_" not in data:
 | 
			
		||||
                        data["_"] = {}
 | 
			
		||||
                    value = self._.get(attr)
 | 
			
		||||
                    if not srsly.is_json_serializable(value):
 | 
			
		||||
                        raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
 | 
			
		||||
                    data["_"][attr] = value
 | 
			
		||||
                    user_keys.add(attr)
 | 
			
		||||
            # Token and span attributes only include values stored in user_data
 | 
			
		||||
            # and not values generated by getters
 | 
			
		||||
            if self.user_data:
 | 
			
		||||
                for data_key, value in self.user_data.copy().items():
 | 
			
		||||
                    if type(data_key) == tuple and len(data_key) >= 4 and data_key[0] == "._.":
 | 
			
		||||
| 
						 | 
				
			
			@ -1678,20 +1692,15 @@ cdef class Doc:
 | 
			
		|||
                            user_keys.add(attr)
 | 
			
		||||
                            if not srsly.is_json_serializable(value):
 | 
			
		||||
                                raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
 | 
			
		||||
                            # Check if doc attribute
 | 
			
		||||
                            if start is None:
 | 
			
		||||
                                if "_" not in data:
 | 
			
		||||
                                    data["_"] = {}
 | 
			
		||||
                                data["_"][attr] = value
 | 
			
		||||
                            # Check if token attribute
 | 
			
		||||
                            elif end is None:
 | 
			
		||||
                            # Token attribute
 | 
			
		||||
                            if start is not None and end is None:
 | 
			
		||||
                                if "underscore_token" not in data:
 | 
			
		||||
                                    data["underscore_token"] = {}
 | 
			
		||||
                                if attr not in data["underscore_token"]:
 | 
			
		||||
                                    data["underscore_token"][attr] = []
 | 
			
		||||
                                data["underscore_token"][attr].append({"start": start, "value": value})
 | 
			
		||||
                            # Else span attribute
 | 
			
		||||
                            else:
 | 
			
		||||
                            # Span attribute
 | 
			
		||||
                            elif start is not None and end is not None:
 | 
			
		||||
                                if "underscore_span" not in data:
 | 
			
		||||
                                    data["underscore_span"] = {}
 | 
			
		||||
                                if attr not in data["underscore_span"]:
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -443,9 +443,9 @@ def load_model_from_package(
 | 
			
		|||
    name: str,
 | 
			
		||||
    *,
 | 
			
		||||
    vocab: Union["Vocab", bool] = True,
 | 
			
		||||
    disable: Union[str, Iterable[str]] = SimpleFrozenList(),
 | 
			
		||||
    enable: Union[str, Iterable[str]] = SimpleFrozenList(),
 | 
			
		||||
    exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
 | 
			
		||||
    disable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
 | 
			
		||||
    enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
 | 
			
		||||
    exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
 | 
			
		||||
    config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
 | 
			
		||||
) -> "Language":
 | 
			
		||||
    """Load a model from an installed package.
 | 
			
		||||
| 
						 | 
				
			
			@ -619,9 +619,9 @@ def load_model_from_init_py(
 | 
			
		|||
    init_file: Union[Path, str],
 | 
			
		||||
    *,
 | 
			
		||||
    vocab: Union["Vocab", bool] = True,
 | 
			
		||||
    disable: Union[str, Iterable[str]] = SimpleFrozenList(),
 | 
			
		||||
    enable: Union[str, Iterable[str]] = SimpleFrozenList(),
 | 
			
		||||
    exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
 | 
			
		||||
    disable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
 | 
			
		||||
    enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
 | 
			
		||||
    exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
 | 
			
		||||
    config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
 | 
			
		||||
) -> "Language":
 | 
			
		||||
    """Helper function to use in the `load()` method of a model package's
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -229,16 +229,17 @@ The reported `{attr}_score` depends on the classification properties:
 | 
			
		|||
> print(scores["cats_macro_auc"])
 | 
			
		||||
> ```
 | 
			
		||||
 | 
			
		||||
| Name             | Description                                                                                                                                        |
 | 
			
		||||
| ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------- |
 | 
			
		||||
| `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~                                |
 | 
			
		||||
| `attr`           | The attribute to score. ~~str~~                                                                                                                    |
 | 
			
		||||
| _keyword-only_   |                                                                                                                                                    |
 | 
			
		||||
| `getter`         | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. ~~Callable[[Doc, str], Dict[str, float]]~~ |
 | 
			
		||||
| labels           | The set of possible labels. Defaults to `[]`. ~~Iterable[str]~~                                                                                    |
 | 
			
		||||
| `multi_label`    | Whether the attribute allows multiple labels. Defaults to `True`. ~~bool~~                                                                         |
 | 
			
		||||
| `positive_label` | The positive label for a binary task with exclusive classes. Defaults to `None`. ~~Optional[str]~~                                                 |
 | 
			
		||||
| **RETURNS**      | A dictionary containing the scores, with inapplicable scores as `None`. ~~Dict[str, Optional[float]]~~                                             |
 | 
			
		||||
| Name             | Description                                                                                                                                                                                        |
 | 
			
		||||
| ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | 
			
		||||
| `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~                                                                                |
 | 
			
		||||
| `attr`           | The attribute to score. ~~str~~                                                                                                                                                                    |
 | 
			
		||||
| _keyword-only_   |                                                                                                                                                                                                    |
 | 
			
		||||
| `getter`         | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. ~~Callable[[Doc, str], Dict[str, float]]~~                                                 |
 | 
			
		||||
| labels           | The set of possible labels. Defaults to `[]`. ~~Iterable[str]~~                                                                                                                                    |
 | 
			
		||||
| `multi_label`    | Whether the attribute allows multiple labels. Defaults to `True`. When set to `False` (exclusive labels), missing gold labels are interpreted as `0.0` and the threshold is set to `0.0`. ~~bool~~ |
 | 
			
		||||
| `positive_label` | The positive label for a binary task with exclusive classes. Defaults to `None`. ~~Optional[str]~~                                                                                                 |
 | 
			
		||||
| `threshold`      | Cutoff to consider a prediction "positive". Defaults to `0.5` for multi-label, and `0.0` (i.e. whatever's highest scoring) otherwise. ~~float~~                                                    |
 | 
			
		||||
| **RETURNS**      | A dictionary containing the scores, with inapplicable scores as `None`. ~~Dict[str, Optional[float]]~~                                                                                             |
 | 
			
		||||
 | 
			
		||||
## Scorer.score_links {#score_links tag="staticmethod" new="3"}
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -63,7 +63,6 @@ architectures and their arguments and hyperparameters.
 | 
			
		|||
> ```python
 | 
			
		||||
> from spacy.pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL
 | 
			
		||||
> config = {
 | 
			
		||||
>    "threshold": 0.5,
 | 
			
		||||
>    "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
 | 
			
		||||
> }
 | 
			
		||||
> nlp.add_pipe("textcat", config=config)
 | 
			
		||||
| 
						 | 
				
			
			@ -82,7 +81,7 @@ architectures and their arguments and hyperparameters.
 | 
			
		|||
 | 
			
		||||
| Setting     | Description                                                                                                                                                      |
 | 
			
		||||
| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | 
			
		||||
| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~                                                                   |
 | 
			
		||||
| `threshold` | Cutoff to consider a prediction "positive", relevant for `textcat_multilabel` when calculating accuracy scores. ~~float~~                                        |
 | 
			
		||||
| `model`     | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ |
 | 
			
		||||
| `scorer`    | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~                                 |
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -123,7 +122,7 @@ shortcut for this and instantiate the component using its string name and
 | 
			
		|||
| `model`        | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~       |
 | 
			
		||||
| `name`         | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~                              |
 | 
			
		||||
| _keyword-only_ |                                                                                                                                  |
 | 
			
		||||
| `threshold`    | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~                                   |
 | 
			
		||||
| `threshold`    | Cutoff to consider a prediction "positive", relevant for `textcat_multilabel` when calculating accuracy scores. ~~float~~        |
 | 
			
		||||
| `scorer`       | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ |
 | 
			
		||||
 | 
			
		||||
## TextCategorizer.\_\_call\_\_ {#call tag="method"}
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1792,7 +1792,7 @@ the entity `Span` – for example `._.orgs` or `._.prev_orgs` and
 | 
			
		|||
> [`Doc.retokenize`](/api/doc#retokenize) context manager:
 | 
			
		||||
>
 | 
			
		||||
> ```python
 | 
			
		||||
> with doc.retokenize() as retokenize:
 | 
			
		||||
> with doc.retokenize() as retokenizer:
 | 
			
		||||
>   for ent in doc.ents:
 | 
			
		||||
>       retokenizer.merge(ent)
 | 
			
		||||
> ```
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue
	
	Block a user