mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-22 15:24:11 +03:00
38 lines
1.0 KiB
Python
38 lines
1.0 KiB
Python
#!/usr/bin/env python
# coding: utf8
"""Demonstrate adding a rule-based component that forces some tokens to not
be entities, before the NER tagger is applied. This is used to hotfix the issue
in https://github.com/explosion/spaCy/issues/2870, present as of spaCy v2.0.16.

Compatible with: spaCy v2.0.0+
Last tested with: v2.1.0
"""
|
|
from __future__ import unicode_literals
|
|
|
|
import spacy
|
|
from spacy.attrs import ENT_IOB
|
|
|
|
|
|
def fix_space_tags(doc):
    """Pre-set the entity IOB value of every whitespace token to 'O'.

    Intended to run as a pipeline component ahead of the NER tagger, so that
    whitespace tokens are already marked as outside any entity.

    doc: The document being processed; it is modified in place.
    RETURNS: The same `doc`, so the function can be used as a pipeline
        component.
    """
    # ENT_IOB value codes: 0 means no value set, 1 is 'I', 2 is 'O'.
    outside_tag = 2
    iob_values = doc.to_array([ENT_IOB])
    whitespace_positions = [
        position for position, tok in enumerate(doc) if tok.is_space
    ]
    for position in whitespace_positions:
        iob_values[position] = outside_tag
    # The values are written back as a single column with one row per token.
    iob_column = iob_values.reshape((len(doc), 1))
    doc.from_array([ENT_IOB], iob_column)
    return doc
|
|
|
|
|
|
def main():
    """Print the entities found in a sample sentence before and after the fix.

    Loads the "en_core_web_sm" model, runs it on the sample text, then inserts
    `fix_space_tags` into the pipeline ahead of the "ner" component and runs
    the same text again so both results can be compared on stdout.
    """
    # NOTE(review): the fix only touches tokens where `is_space` is true; this
    # sentence as written contains only single spaces -- confirm the original
    # sample text was not whitespace-collapsed.
    sample = "This is some crazy test where I dont need an Apple Watch to make things bug"
    pipeline = spacy.load("en_core_web_sm")

    print("Before", pipeline(sample).ents)

    # Register the rule-based component so it runs before the NER tagger.
    pipeline.add_pipe(fix_space_tags, name="fix-ner", before="ner")

    print("After", pipeline(sample).ents)
|
|
|
|
|
|
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|