read entity_ruler patterns with srsly.read_jsonl.v1

This commit is contained in:
svlandeg 2020-10-05 22:50:14 +02:00
parent 193e0d5a98
commit ff9ac39c88
7 changed files with 44 additions and 12 deletions

View File

@ -1410,7 +1410,9 @@ class Language:
kwargs = component_cfg.get(name, {}) kwargs = component_cfg.get(name, {})
# Allow component_cfg to overwrite the top-level kwargs. # Allow component_cfg to overwrite the top-level kwargs.
kwargs.setdefault("batch_size", batch_size) kwargs.setdefault("batch_size", batch_size)
if hasattr(proc, "pipe"): # non-trainable components may have a pipe() implementation that refers to dummy
# predict and set_annotations methods
if hasattr(proc, "pipe") and hasattr(proc, "is_trainable") and proc.is_trainable():
f = functools.partial(proc.pipe, **kwargs) f = functools.partial(proc.pipe, **kwargs)
else: else:
# Apply the function, but yield the doc # Apply the function, but yield the doc

View File

@ -1,4 +1,4 @@
from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable, Sequence
from collections import defaultdict from collections import defaultdict
from pathlib import Path from pathlib import Path
import srsly import srsly
@ -190,19 +190,18 @@ class EntityRuler(Pipe):
get_examples: Callable[[], Iterable[Example]], get_examples: Callable[[], Iterable[Example]],
*, *,
nlp: Optional[Language] = None, nlp: Optional[Language] = None,
patterns_path: Optional[Path] = None patterns: Optional[Sequence[PatternType]] = None,
): ):
"""Initialize the pipe for training. """Initialize the pipe for training.
get_examples (Callable[[], Iterable[Example]]): Function that get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects. returns a representative sample of gold-standard Example objects.
nlp (Language): The current nlp object the component is part of. nlp (Language): The current nlp object the component is part of.
patterns_path: Path to serialized patterns. patterns (Optional[Sequence[PatternType]]): The list of patterns.
DOCS: https://nightly.spacy.io/api/entityruler#initialize DOCS: https://nightly.spacy.io/api/entityruler#initialize
""" """
if patterns_path: if patterns:
patterns = srsly.read_jsonl(patterns_path)
self.add_patterns(patterns) self.add_patterns(patterns)

View File

@ -437,7 +437,9 @@ cdef class Parser(Pipe):
for name, component in nlp.pipeline: for name, component in nlp.pipeline:
if component is self: if component is self:
break break
if hasattr(component, "pipe"): # non-trainable components may have a pipe() implementation that refers to dummy
# predict and set_annotations methods
if hasattr(component, "pipe") and hasattr(component, "is_trainable") and component.is_trainable():
doc_sample = list(component.pipe(doc_sample, batch_size=8)) doc_sample = list(component.pipe(doc_sample, batch_size=8))
else: else:
doc_sample = [component(doc) for doc in doc_sample] doc_sample = [component(doc) for doc in doc_sample]

View File

@ -119,7 +119,7 @@ def validate_init_settings(
if types don't match or required values are missing. if types don't match or required values are missing.
func (Callable): The initialize method of a given component etc. func (Callable): The initialize method of a given component etc.
settings (Dict[str, Any]): The settings from the repsective [initialize] block. settings (Dict[str, Any]): The settings from the respective [initialize] block.
section (str): Initialize section, for error message. section (str): Initialize section, for error message.
name (str): Name of the block in the section. name (str): Name of the block in the section.
exclude (Iterable[str]): Parameter names to exclude from schema. exclude (Iterable[str]): Parameter names to exclude from schema.

View File

@ -121,7 +121,7 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
assert doc.has_annotation("LEMMA") assert doc.has_annotation("LEMMA")
assert doc.has_annotation("MORPH") assert doc.has_annotation("MORPH")
nlp.remove_pipe("attribute_ruler") nlp.remove_pipe("attribute_ruler")
# initialize with patterns from asset # initialize with patterns from misc registry
nlp.config["initialize"]["components"]["attribute_ruler"] = { nlp.config["initialize"]["components"]["attribute_ruler"] = {
"patterns": {"@misc": "attribute_ruler_patterns"} "patterns": {"@misc": "attribute_ruler_patterns"}
} }

View File

@ -1,4 +1,6 @@
import pytest import pytest
from spacy import registry
from spacy.tokens import Span from spacy.tokens import Span
from spacy.language import Language from spacy.language import Language
from spacy.pipeline import EntityRuler from spacy.pipeline import EntityRuler
@ -11,6 +13,7 @@ def nlp():
@pytest.fixture @pytest.fixture
@registry.misc("entity_ruler_patterns")
def patterns(): def patterns():
return [ return [
{"label": "HELLO", "pattern": "hello world"}, {"label": "HELLO", "pattern": "hello world"},
@ -42,6 +45,29 @@ def test_entity_ruler_init(nlp, patterns):
assert doc.ents[1].label_ == "BYE" assert doc.ents[1].label_ == "BYE"
def test_entity_ruler_init_patterns(nlp, patterns):
    """Check that an entity ruler picks up its patterns during initialization.

    Two routes are exercised: passing ``patterns`` directly to
    ``initialize()``, and resolving them through the ``[initialize]`` config
    block, which points at the ``entity_ruler_patterns`` entry of the misc
    registry. In both cases the ruler starts with no labels, ends up with
    four, and tags the sample text with HELLO and BYE entities.
    """
    sample_text = "hello world bye bye"

    # Route 1: hand the patterns straight to initialize().
    entity_ruler = nlp.add_pipe("entity_ruler")
    assert len(entity_ruler.labels) == 0
    entity_ruler.initialize(lambda: [], patterns=patterns)
    assert len(entity_ruler.labels) == 4
    parsed = nlp(sample_text)
    assert parsed.ents[0].label_ == "HELLO"
    assert parsed.ents[1].label_ == "BYE"

    # Start from a clean pipeline before trying the config-driven route.
    nlp.remove_pipe("entity_ruler")

    # Route 2: let nlp.initialize() resolve the patterns from the misc registry.
    init_settings = {"patterns": {"@misc": "entity_ruler_patterns"}}
    nlp.config["initialize"]["components"]["entity_ruler"] = init_settings
    entity_ruler = nlp.add_pipe("entity_ruler")
    assert len(entity_ruler.labels) == 0
    nlp.initialize()
    assert len(entity_ruler.labels) == 4
    parsed = nlp(sample_text)
    assert parsed.ents[0].label_ == "HELLO"
    assert parsed.ents[1].label_ == "BYE"
def test_entity_ruler_existing(nlp, patterns): def test_entity_ruler_existing(nlp, patterns):
ruler = nlp.add_pipe("entity_ruler") ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns) ruler.add_patterns(patterns)

View File

@ -82,13 +82,16 @@ Initialize the component with patterns from a file.
> >
> ```python > ```python
> entity_ruler = nlp.add_pipe("entity_ruler") > entity_ruler = nlp.add_pipe("entity_ruler")
> entity_ruler.initialize(lambda: [], nlp=nlp, patterns_path=patterns_path) > entity_ruler.initialize(lambda: [], nlp=nlp, patterns=patterns)
> ``` > ```
> >
> ```ini > ```ini
> ### config.cfg > ### config.cfg
> [initialize.components.entity_ruler] > [initialize.components.entity_ruler]
> patterns_path = "data/patterns/patterns.jsonl" >
> [initialize.components.entity_ruler.patterns]
> @readers = "srsly.read_jsonl.v1"
> path = "corpus/entity_ruler_patterns.jsonl"
> ``` > ```
| Name | Description | | Name | Description |
@ -96,7 +99,7 @@ Initialize the component with patterns from a file.
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Not used by the `EntityRuler`. ~~Callable[[], Iterable[Example]]~~ | | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Not used by the `EntityRuler`. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | | | _keyword-only_ | |
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | | `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
| `labels` | Path to the .json file holding the serialized patterns. ~~Path~~ | | `patterns` | The list of patterns. Defaults to `None`. ~~Optional[Sequence[Dict[str, Union[str, List[Dict[str, Any]]]]]]~~ |
## EntityRuler.\_\_len\_\_ {#len tag="method"} ## EntityRuler.\_\_len\_\_ {#len tag="method"}