read entity_ruler patterns with srsly.read_jsonl.v1

This commit is contained in:
svlandeg 2020-10-05 22:50:14 +02:00
parent 193e0d5a98
commit ff9ac39c88
7 changed files with 44 additions and 12 deletions

View File

@ -1410,7 +1410,9 @@ class Language:
kwargs = component_cfg.get(name, {})
# Allow component_cfg to overwrite the top-level kwargs.
kwargs.setdefault("batch_size", batch_size)
if hasattr(proc, "pipe"):
# non-trainable components may have a pipe() implementation that refers to dummy
# predict and set_annotations methods
if hasattr(proc, "pipe") and hasattr(proc, "is_trainable") and proc.is_trainable():
f = functools.partial(proc.pipe, **kwargs)
else:
# Apply the function, but yield the doc

View File

@ -1,4 +1,4 @@
from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable
from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable, Sequence
from collections import defaultdict
from pathlib import Path
import srsly
@ -190,19 +190,18 @@ class EntityRuler(Pipe):
get_examples: Callable[[], Iterable[Example]],
*,
nlp: Optional[Language] = None,
patterns_path: Optional[Path] = None
patterns: Optional[Sequence[PatternType]] = None,
):
"""Initialize the pipe for training.
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
nlp (Language): The current nlp object the component is part of.
patterns_path: Path to serialized patterns.
patterns (Optional[Sequence[PatternType]]): The list of patterns.
DOCS: https://nightly.spacy.io/api/entityruler#initialize
"""
if patterns_path:
patterns = srsly.read_jsonl(patterns_path)
if patterns:
self.add_patterns(patterns)

View File

@ -437,7 +437,9 @@ cdef class Parser(Pipe):
for name, component in nlp.pipeline:
if component is self:
break
if hasattr(component, "pipe"):
# non-trainable components may have a pipe() implementation that refers to dummy
# predict and set_annotations methods
if hasattr(component, "pipe") and hasattr(component, "is_trainable") and component.is_trainable():
doc_sample = list(component.pipe(doc_sample, batch_size=8))
else:
doc_sample = [component(doc) for doc in doc_sample]

View File

@ -119,7 +119,7 @@ def validate_init_settings(
if types don't match or required values are missing.
func (Callable): The initialize method of a given component etc.
settings (Dict[str, Any]): The settings from the repsective [initialize] block.
settings (Dict[str, Any]): The settings from the respective [initialize] block.
section (str): Initialize section, for error message.
name (str): Name of the block in the section.
exclude (Iterable[str]): Parameter names to exclude from schema.

View File

@ -121,7 +121,7 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
assert doc.has_annotation("LEMMA")
assert doc.has_annotation("MORPH")
nlp.remove_pipe("attribute_ruler")
# initialize with patterns from asset
# initialize with patterns from misc registry
nlp.config["initialize"]["components"]["attribute_ruler"] = {
"patterns": {"@misc": "attribute_ruler_patterns"}
}

View File

@ -1,4 +1,6 @@
import pytest
from spacy import registry
from spacy.tokens import Span
from spacy.language import Language
from spacy.pipeline import EntityRuler
@ -11,6 +13,7 @@ def nlp():
@pytest.fixture
@registry.misc("entity_ruler_patterns")
def patterns():
return [
{"label": "HELLO", "pattern": "hello world"},
@ -42,6 +45,29 @@ def test_entity_ruler_init(nlp, patterns):
assert doc.ents[1].label_ == "BYE"
def test_entity_ruler_init_patterns(nlp, patterns):
# initialize with patterns
ruler = nlp.add_pipe("entity_ruler")
assert len(ruler.labels) == 0
ruler.initialize(lambda: [], patterns=patterns)
assert len(ruler.labels) == 4
doc = nlp("hello world bye bye")
assert doc.ents[0].label_ == "HELLO"
assert doc.ents[1].label_ == "BYE"
nlp.remove_pipe("entity_ruler")
# initialize with patterns from misc registry
nlp.config["initialize"]["components"]["entity_ruler"] = {
"patterns": {"@misc": "entity_ruler_patterns"}
}
ruler = nlp.add_pipe("entity_ruler")
assert len(ruler.labels) == 0
nlp.initialize()
assert len(ruler.labels) == 4
doc = nlp("hello world bye bye")
assert doc.ents[0].label_ == "HELLO"
assert doc.ents[1].label_ == "BYE"
def test_entity_ruler_existing(nlp, patterns):
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)

View File

@ -82,13 +82,16 @@ Initialize the component with patterns from a file.
>
> ```python
> entity_ruler = nlp.add_pipe("entity_ruler")
> entity_ruler.initialize(lambda: [], nlp=nlp, patterns_path=patterns_path)
> entity_ruler.initialize(lambda: [], nlp=nlp, patterns=patterns)
> ```
>
> ```ini
> ### config.cfg
> [initialize.components.entity_ruler]
> patterns_path = "data/patterns/patterns.jsonl"
>
> [initialize.components.entity_ruler.patterns]
> @readers = "srsly.read_jsonl.v1"
> path = "corpus/entity_ruler_patterns.jsonl"
> ```
| Name | Description |
@ -96,7 +99,7 @@ Initialize the component with patterns from a file.
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Not used by the `EntityRuler`. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | |
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
| `labels` | Path to the .json file holding the serialized patterns. ~~Path~~ |
| `patterns` | The list of patterns. Defaults to `None`. ~~Optional[Sequence[Dict[str, Union[str, List[Dict[str, Any]]]]]]~~ |
## EntityRuler.\_\_len\_\_ {#len tag="method"}