Merge pull request #9987 from explosion/master

Update develop with commits from master
commit 067a44a417
Sofie Van Landeghem, 2022-01-05 11:49:50 +01:00 (committed via GitHub)
10 changed files with 70 additions and 44 deletions

View File

@@ -25,7 +25,7 @@ def debug_config_cli(
show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.")
# fmt: on
):
"""Debug a config.cfg file and show validation errors. The command will
"""Debug a config file and show validation errors. The command will
create all objects in the tree and validate them. Note that some config
validation errors are blocking and will prevent the rest of the config from
being resolved. This means that you may not see all validation errors at
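
For reference, a minimal invocation of the command documented here (the config path is illustrative):

$ python -m spacy debug config ./config.cfg --show-variables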

View File

@@ -203,6 +203,7 @@ def debug_data(
has_low_data_warning = False
has_no_neg_warning = False
has_ws_ents_error = False
+has_boundary_cross_ents_warning = False
msg.divider("Named Entity Recognition")
msg.info(f"{len(model_labels)} label(s)")
@@ -242,12 +243,20 @@ def debug_data(
msg.warn(f"No examples for texts WITHOUT new label '{label}'")
has_no_neg_warning = True
+if gold_train_data["boundary_cross_ents"]:
+    msg.warn(
+        f"{gold_train_data['boundary_cross_ents']} entity span(s) crossing sentence boundaries"
+    )
+    has_boundary_cross_ents_warning = True
if not has_low_data_warning:
msg.good("Good amount of examples for all labels")
if not has_no_neg_warning:
msg.good("Examples without occurrences available for all labels")
if not has_ws_ents_error:
msg.good("No entities consisting of or starting/ending with whitespace")
+if not has_boundary_cross_ents_warning:
+    msg.good("No entities crossing sentence boundaries")
if has_low_data_warning:
msg.text(
@@ -565,6 +574,7 @@ def _compile_gold(
"words": Counter(),
"roots": Counter(),
"ws_ents": 0,
"boundary_cross_ents": 0,
"n_words": 0,
"n_misaligned_words": 0,
"words_missing_vectors": Counter(),
@@ -602,6 +612,8 @@ def _compile_gold(
if label.startswith(("B-", "U-")):
combined_label = label.split("-")[1]
data["ner"][combined_label] += 1
+if gold[i].is_sent_start and label.startswith(("I-", "L-")):
+    data["boundary_cross_ents"] += 1
elif label == "-":
data["ner"]["-"] += 1
if "textcat" in factory_names or "textcat_multilabel" in factory_names:

View File

@@ -27,7 +27,7 @@ class Optimizations(str, Enum):
@init_cli.command("config")
def init_config_cli(
# fmt: off
-output_file: Path = Arg(..., help="File to save config.cfg to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
+output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
lang: str = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),
pipeline: str = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
@@ -37,7 +37,7 @@ def init_config_cli(
# fmt: on
):
"""
-Generate a starter config.cfg for training. Based on your requirements
+Generate a starter config file for training. Based on your requirements
specified via the CLI arguments, this command generates a config with the
optimal settings for your use case. This includes the choice of architecture,
pretrained weights and related hyperparameters.
@@ -66,15 +66,15 @@ def init_config_cli(
@init_cli.command("fill-config")
def init_fill_config_cli(
# fmt: off
-base_path: Path = Arg(..., help="Base config to fill", exists=True, dir_okay=False),
-output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True),
+base_path: Path = Arg(..., help="Path to base config to fill", exists=True, dir_okay=False),
+output_file: Path = Arg("-", help="Path to output .cfg file (or - for stdout)", allow_dash=True),
pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes"),
code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
# fmt: on
):
"""
-Fill partial config.cfg with default values. Will add all missing settings
+Fill partial config file with default values. Will add all missing settings
from the default config and will create all objects, check the registered
functions for their default values and update the base config. This command
can be used with a config generated via the training quickstart widget:
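
For reference, representative invocations of the two commands documented in this file (file names and pipeline choice are illustrative):

$ python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency
$ python -m spacy init fill-config base_config.cfg config.cfg --diff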

View File

@@ -68,12 +68,14 @@ seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
-# Controls early-stopping. 0 disables early stopping.
+# Controls early-stopping, i.e., the number of steps to continue without
+# improvement before stopping. 0 disables early stopping.
patience = 1600
# Number of epochs. 0 means unlimited. If >= 0, train corpus is loaded once in
# memory and shuffled within the training loop. -1 means stream train corpus
# rather than loading in memory with no shuffling within the training loop.
max_epochs = 0
+# Maximum number of update steps to train for. 0 means an unlimited number of steps.
max_steps = 20000
eval_frequency = 200
# Control how scores are printed and checkpoints are evaluated.
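
As a rough sketch of the patience semantics described in the updated comment (a hypothetical early-stopping check, not spaCy's actual training loop):

def should_stop_early(step: int, best_step: int, patience: int) -> bool:
    # Stop once `patience` steps have elapsed since the best checkpoint;
    # patience = 0 disables early stopping entirely.
    return patience > 0 and (step - best_step) >= patience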

View File

@@ -642,7 +642,7 @@ class Errors(metaclass=ErrorsWithCodes):
E912 = ("Failed to initialize lemmatizer. Missing lemmatizer table(s) found "
"for mode '{mode}'. Required tables: {tables}. Found: {found}.")
E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
"config.cfg or override it on the CLI?")
".cfg file or override it on the CLI?")
E914 = ("Executing {name} callback failed. Expected the function to "
"return the nlp object but got: {value}. Maybe you forgot to return "
"the modified object in your function?")

View File

@@ -19,7 +19,7 @@ class Lexeme:
@property
def vector_norm(self) -> float: ...
vector: Floats1d
-rank: str
+rank: int
sentiment: float
@property
def orth_(self) -> str: ...
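
The corrected annotation matches runtime behavior: Lexeme.rank is the integer row of the lexeme's vector in the vocabulary's vectors table. A quick check, assuming a vectors-bearing pipeline such as en_core_web_md is installed:

import spacy

nlp = spacy.load("en_core_web_md")
lex = nlp.vocab["apple"]
# `rank` is the row index of this lexeme's vector, as an int.
print(type(lex.rank))  # <class 'int'>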

View File

@@ -63,7 +63,7 @@ OOV_RANK = numpy.iinfo(numpy.uint64).max
DEFAULT_OOV_PROB = -20
LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
-# Default order of sections in the config.cfg. Not all sections need to exist,
+# Default order of sections in the config file. Not all sections need to exist,
# and additional sections are added at the end, in alphabetical order.
CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"]
# fmt: on
@@ -465,7 +465,7 @@ def load_model_from_path(
"""Load a model from a data directory path. Creates Language class with
pipeline from config.cfg and then calls from_disk() with path.
-model_path (Path): Mmodel path.
+model_path (Path): Model path.
meta (Dict[str, Any]): Optional model meta.
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
a new Vocab object will be created.
@@ -642,8 +642,8 @@ def load_config(
sys.stdin.read(), overrides=overrides, interpolate=interpolate
)
else:
-if not config_path or not config_path.exists() or not config_path.is_file():
-    raise IOError(Errors.E053.format(path=config_path, name="config.cfg"))
+if not config_path or not config_path.is_file():
+    raise IOError(Errors.E053.format(path=config_path, name="config file"))
return config.from_disk(
config_path, overrides=overrides, interpolate=interpolate
)
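
For reference, a minimal call to this helper (the path and override key are illustrative):

from spacy import util

# Raises IOError (E053) if the path is missing or not a regular file.
config = util.load_config(
    "config.cfg",
    overrides={"training.max_steps": 100},
    interpolate=True,
)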

View File

@@ -148,8 +148,8 @@ $ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [
### init fill-config {#init-fill-config new="3"}
-Auto-fill a partial [`config.cfg` file](/usage/training#config) file with **all
-default values**, e.g. a config generated with the
+Auto-fill a partial [.cfg file](/usage/training#config) with **all default
+values**, e.g. a config generated with the
[quickstart widget](/usage/training#quickstart). Config files used for training
should always be complete and not contain any hidden defaults or missing values,
so this command helps you create your final training config. In order to find
@@ -175,7 +175,7 @@ $ python -m spacy init fill-config [base_path] [output_file] [--diff]
| Name | Description |
| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `base_path` | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~ |
-| `output_file` | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. ~~Path (positional)~~ |
+| `output_file` | Path to output `.cfg` file or "-" to write to stdout so you can pipe it to a file. Defaults to "-" (stdout). ~~Path (positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~ |
| `--diff`, `-D` | Print a visual diff highlighting the changes. ~~bool (flag)~~ |
@@ -208,7 +208,7 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr
| `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
| `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
| `--prune`, `-p` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ |
-| `--mode`, `-m` | Vectors mode: `default` or [`floret`](https://github.com/explosion/floret). Defaults to `default`. ~~Optional[str] \(option)~~ |
+| `--mode`, `-m` | Vectors mode: `default` or [`floret`](https://github.com/explosion/floret). Defaults to `default`. ~~Optional[str] \(option)~~ |
| `--name`, `-n` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ |
| `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
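
A representative invocation matching this table (language, paths, and pruning size are illustrative):

$ python -m spacy init vectors en ./word_vectors.txt.gz ./en_vectors_model --prune 20000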

View File

@@ -535,7 +535,7 @@ As of spaCy v3.0, the `meta.json` **isn't** used to construct the language class
and pipeline anymore and only contains meta information for reference and for
creating a Python package with [`spacy package`](/api/cli#package). How to set
up the `nlp` object is now defined in the
-[`config.cfg`](/api/data-formats#config), which includes detailed information
+[config file](/api/data-formats#config), which includes detailed information
about the pipeline components and their model architectures, and all other
settings and hyperparameters used to train the pipeline. It's the **single
source of truth** used for loading a pipeline.
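
As a minimal illustration of what that single source of truth contains (an abridged, hypothetical excerpt, not a complete config):

[nlp]
lang = "en"
pipeline = ["tok2vec","ner"]

[components.ner]
factory = "ner"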

View File

@@ -1,5 +1,43 @@
{
"resources": [
+{
+    "id": "spacypdfreader",
+    "title": "spacypdfreader",
+    "category": ["pipeline"],
+    "tags": ["PDF"],
+    "slogan": "Easy PDF to text to spaCy text extraction in Python.",
+    "description": "*spacypdfreader* is a Python library that allows you to convert PDF files directly into *spaCy* `Doc` objects. The library provides several built-in parsers, or you can bring your own. `Doc` objects are annotated with several custom attributes including: `token._.page_number`, `doc._.page_range`, `doc._.first_page`, `doc._.last_page`, `doc._.pdf_file_name`, and `doc._.page(int)`.",
+    "github": "SamEdwardes/spacypdfreader",
+    "pip": "spacypdfreader",
+    "url": "https://samedwardes.github.io/spacypdfreader/",
+    "code_language": "python",
+    "author": "Sam Edwardes",
+    "author_links": {
+        "twitter": "TheReaLSamlam",
+        "github": "SamEdwardes",
+        "website": "https://samedwardes.com"
+    },
+    "code_example": [
+        "import spacy",
+        "from spacypdfreader import pdf_reader",
+        "",
+        "nlp = spacy.load('en_core_web_sm')",
+        "doc = pdf_reader('tests/data/test_pdf_01.pdf', nlp)",
+        "",
+        "# Get the page number of any token.",
+        "print(doc[0]._.page_number) # 1",
+        "print(doc[-1]._.page_number) # 4",
+        "",
+        "# Get page meta data about the PDF document.",
+        "print(doc._.pdf_file_name) # 'tests/data/test_pdf_01.pdf'",
+        "print(doc._.page_range) # (1, 4)",
+        "print(doc._.first_page) # 1",
+        "print(doc._.last_page) # 4",
+        "",
+        "# Get all of the text from a specific PDF page.",
+        "print(doc._.page(4)) # 'able to display the destination page (unless...'"
+    ]
+},
{
"id": "nlpcloud",
"title": "NLPCloud.io",
@@ -26,32 +64,6 @@
"category": ["apis", "nonpython", "standalone"],
"tags": ["api", "deploy", "production"]
},
-{
-    "id": "denomme",
-    "title": "denomme : Multilingual Name Detector",
-    "slogan": "Multilingual Name Detection",
-    "description": "A SpaCy extension for Spans to extract multilingual names out of documents trained on XLM-roberta backbone",
-    "github": "meghanabhange/denomme",
-    "pip": "denomme https://denomme.s3.us-east-2.amazonaws.com/xx_denomme-0.3.1/dist/xx_denomme-0.3.1.tar.gz",
-    "code_example": [
-        "from spacy.lang.xx import MultiLanguage",
-        "from denomme.name import person_name_component",
-        "nlp = MultiLanguage()",
-        "nlp.add_pipe('denomme')",
-        "doc = nlp('Hi my name is Meghana S.R Bhange and I want to talk Asha')",
-        "print(doc._.person_name)",
-        "# ['Meghana S.R Bhange', 'Asha']"
-    ],
-    "thumb": "https://i.ibb.co/jwGVWPZ/rainbow-bohemian-logo-removebg-preview.png",
-    "code_language": "python",
-    "author": "Meghana Bhange",
-    "author_links": {
-        "github": "meghanabhange",
-        "twitter": "_aspiringcat"
-    },
-    "category": ["standalone"],
-    "tags": ["person-name-detection"]
-},
{
"id": "eMFDscore",
"title": "eMFDscore : Extended Moral Foundation Dictionary Scoring for Python",