mirror of
https://github.com/explosion/spaCy.git
synced 2025-03-03 10:55:52 +03:00
read entity_ruler patterns with srsly.read_jsonl.v1
This commit is contained in:
parent
193e0d5a98
commit
ff9ac39c88
|
@ -1410,7 +1410,9 @@ class Language:
|
||||||
kwargs = component_cfg.get(name, {})
|
kwargs = component_cfg.get(name, {})
|
||||||
# Allow component_cfg to overwrite the top-level kwargs.
|
# Allow component_cfg to overwrite the top-level kwargs.
|
||||||
kwargs.setdefault("batch_size", batch_size)
|
kwargs.setdefault("batch_size", batch_size)
|
||||||
if hasattr(proc, "pipe"):
|
# non-trainable components may have a pipe() implementation that refers to dummy
|
||||||
|
# predict and set_annotations methods
|
||||||
|
if hasattr(proc, "pipe") and hasattr(proc, "is_trainable") and proc.is_trainable():
|
||||||
f = functools.partial(proc.pipe, **kwargs)
|
f = functools.partial(proc.pipe, **kwargs)
|
||||||
else:
|
else:
|
||||||
# Apply the function, but yield the doc
|
# Apply the function, but yield the doc
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable
|
from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable, Sequence
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import srsly
|
import srsly
|
||||||
|
@ -190,19 +190,18 @@ class EntityRuler(Pipe):
|
||||||
get_examples: Callable[[], Iterable[Example]],
|
get_examples: Callable[[], Iterable[Example]],
|
||||||
*,
|
*,
|
||||||
nlp: Optional[Language] = None,
|
nlp: Optional[Language] = None,
|
||||||
patterns_path: Optional[Path] = None
|
patterns: Optional[Sequence[PatternType]] = None,
|
||||||
):
|
):
|
||||||
"""Initialize the pipe for training.
|
"""Initialize the pipe for training.
|
||||||
|
|
||||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||||
returns a representative sample of gold-standard Example objects.
|
returns a representative sample of gold-standard Example objects.
|
||||||
nlp (Language): The current nlp object the component is part of.
|
nlp (Language): The current nlp object the component is part of.
|
||||||
patterns_path: Path to serialized patterns.
|
patterns (Optional[Sequence[PatternType]]): The list of patterns.
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/entityruler#initialize
|
DOCS: https://nightly.spacy.io/api/entityruler#initialize
|
||||||
"""
|
"""
|
||||||
if patterns_path:
|
if patterns:
|
||||||
patterns = srsly.read_jsonl(patterns_path)
|
|
||||||
self.add_patterns(patterns)
|
self.add_patterns(patterns)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -437,7 +437,9 @@ cdef class Parser(Pipe):
|
||||||
for name, component in nlp.pipeline:
|
for name, component in nlp.pipeline:
|
||||||
if component is self:
|
if component is self:
|
||||||
break
|
break
|
||||||
if hasattr(component, "pipe"):
|
# non-trainable components may have a pipe() implementation that refers to dummy
|
||||||
|
# predict and set_annotations methods
|
||||||
|
if hasattr(component, "pipe") and hasattr(component, "is_trainable") and component.is_trainable():
|
||||||
doc_sample = list(component.pipe(doc_sample, batch_size=8))
|
doc_sample = list(component.pipe(doc_sample, batch_size=8))
|
||||||
else:
|
else:
|
||||||
doc_sample = [component(doc) for doc in doc_sample]
|
doc_sample = [component(doc) for doc in doc_sample]
|
||||||
|
|
|
@ -119,7 +119,7 @@ def validate_init_settings(
|
||||||
if types don't match or required values are missing.
|
if types don't match or required values are missing.
|
||||||
|
|
||||||
func (Callable): The initialize method of a given component etc.
|
func (Callable): The initialize method of a given component etc.
|
||||||
settings (Dict[str, Any]): The settings from the repsective [initialize] block.
|
settings (Dict[str, Any]): The settings from the respective [initialize] block.
|
||||||
section (str): Initialize section, for error message.
|
section (str): Initialize section, for error message.
|
||||||
name (str): Name of the block in the section.
|
name (str): Name of the block in the section.
|
||||||
exclude (Iterable[str]): Parameter names to exclude from schema.
|
exclude (Iterable[str]): Parameter names to exclude from schema.
|
||||||
|
|
|
@ -121,7 +121,7 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
|
||||||
assert doc.has_annotation("LEMMA")
|
assert doc.has_annotation("LEMMA")
|
||||||
assert doc.has_annotation("MORPH")
|
assert doc.has_annotation("MORPH")
|
||||||
nlp.remove_pipe("attribute_ruler")
|
nlp.remove_pipe("attribute_ruler")
|
||||||
# initialize with patterns from asset
|
# initialize with patterns from misc registry
|
||||||
nlp.config["initialize"]["components"]["attribute_ruler"] = {
|
nlp.config["initialize"]["components"]["attribute_ruler"] = {
|
||||||
"patterns": {"@misc": "attribute_ruler_patterns"}
|
"patterns": {"@misc": "attribute_ruler_patterns"}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,4 +1,6 @@
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from spacy import registry
|
||||||
from spacy.tokens import Span
|
from spacy.tokens import Span
|
||||||
from spacy.language import Language
|
from spacy.language import Language
|
||||||
from spacy.pipeline import EntityRuler
|
from spacy.pipeline import EntityRuler
|
||||||
|
@ -11,6 +13,7 @@ def nlp():
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
@registry.misc("entity_ruler_patterns")
|
||||||
def patterns():
|
def patterns():
|
||||||
return [
|
return [
|
||||||
{"label": "HELLO", "pattern": "hello world"},
|
{"label": "HELLO", "pattern": "hello world"},
|
||||||
|
@ -42,6 +45,29 @@ def test_entity_ruler_init(nlp, patterns):
|
||||||
assert doc.ents[1].label_ == "BYE"
|
assert doc.ents[1].label_ == "BYE"
|
||||||
|
|
||||||
|
|
||||||
|
def test_entity_ruler_init_patterns(nlp, patterns):
|
||||||
|
# initialize with patterns
|
||||||
|
ruler = nlp.add_pipe("entity_ruler")
|
||||||
|
assert len(ruler.labels) == 0
|
||||||
|
ruler.initialize(lambda: [], patterns=patterns)
|
||||||
|
assert len(ruler.labels) == 4
|
||||||
|
doc = nlp("hello world bye bye")
|
||||||
|
assert doc.ents[0].label_ == "HELLO"
|
||||||
|
assert doc.ents[1].label_ == "BYE"
|
||||||
|
nlp.remove_pipe("entity_ruler")
|
||||||
|
# initialize with patterns from misc registry
|
||||||
|
nlp.config["initialize"]["components"]["entity_ruler"] = {
|
||||||
|
"patterns": {"@misc": "entity_ruler_patterns"}
|
||||||
|
}
|
||||||
|
ruler = nlp.add_pipe("entity_ruler")
|
||||||
|
assert len(ruler.labels) == 0
|
||||||
|
nlp.initialize()
|
||||||
|
assert len(ruler.labels) == 4
|
||||||
|
doc = nlp("hello world bye bye")
|
||||||
|
assert doc.ents[0].label_ == "HELLO"
|
||||||
|
assert doc.ents[1].label_ == "BYE"
|
||||||
|
|
||||||
|
|
||||||
def test_entity_ruler_existing(nlp, patterns):
|
def test_entity_ruler_existing(nlp, patterns):
|
||||||
ruler = nlp.add_pipe("entity_ruler")
|
ruler = nlp.add_pipe("entity_ruler")
|
||||||
ruler.add_patterns(patterns)
|
ruler.add_patterns(patterns)
|
||||||
|
|
|
@ -82,13 +82,16 @@ Initialize the component with patterns from a file.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> entity_ruler = nlp.add_pipe("entity_ruler")
|
> entity_ruler = nlp.add_pipe("entity_ruler")
|
||||||
> entity_ruler.initialize(lambda: [], nlp=nlp, patterns_path=patterns_path)
|
> entity_ruler.initialize(lambda: [], nlp=nlp, patterns=patterns)
|
||||||
> ```
|
> ```
|
||||||
>
|
>
|
||||||
> ```ini
|
> ```ini
|
||||||
> ### config.cfg
|
> ### config.cfg
|
||||||
> [initialize.components.entity_ruler]
|
> [initialize.components.entity_ruler]
|
||||||
> patterns_path = "data/patterns/patterns.jsonl"
|
>
|
||||||
|
> [initialize.components.entity_ruler.patterns]
|
||||||
|
> @readers = "srsly.read_jsonl.v1"
|
||||||
|
> path = "corpus/entity_ruler_patterns.jsonl"
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
|
@ -96,7 +99,7 @@ Initialize the component with patterns from a file.
|
||||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Not used by the `EntityRuler`. ~~Callable[[], Iterable[Example]]~~ |
|
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Not used by the `EntityRuler`. ~~Callable[[], Iterable[Example]]~~ |
|
||||||
| _keyword-only_ | |
|
| _keyword-only_ | |
|
||||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||||
| `labels` | Path to the .json file holding the serialized patterns. ~~Path~~ |
|
| `patterns` | The list of patterns. Defaults to `None`. ~~Optional[Sequence[Dict[str, Union[str, List[Dict[str, Any]]]]]]~~ |
|
||||||
|
|
||||||
## EntityRuler.\_\_len\_\_ {#len tag="method"}
|
## EntityRuler.\_\_len\_\_ {#len tag="method"}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user