Fix bad HTML example (see #2376) and turn it into section on matcher + components

Avoid problems caused by merging while matching (e.g. index errors). Creating a Matcher component also better reflects the recommended best practices.
This commit is contained in:
ines 2018-05-26 17:57:02 +02:00
parent 8adb967e0c
commit fb923b31ea

View File

@ -260,41 +260,6 @@ p
doc = nlp(u"This is a text about Google I/O 2015.") doc = nlp(u"This is a text about Google I/O 2015.")
matches = matcher(doc) matches = matcher(doc)
p
| In addition to mentions of "Google I/O", your data also contains some
| annoying pre-processing artefacts, like leftover HTML line breaks
| (e.g. #[code <br>] or #[code <BR/>]). While you're at it,
| you want to merge those into one token and flag them, to make sure you
| can easily ignore them later. So you add a second pattern and pass in a
| function #[code merge_and_flag]:
+code-exec.
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Token
# load the 'en_core_web_sm' model and create a matcher using its vocab
nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)
# register a new token extension to flag bad HTML
# (token._.bad_html defaults to False)
Token.set_extension('bad_html', default=False)
def merge_and_flag(matcher, doc, i, matches):
    """Merge the i-th match into a single token and flag it as bad HTML.

    This is the function passed to matcher.add for the 'BAD_HTML' patterns.

    NOTE: it merges tokens while matching is still in progress, which can
    cause problems such as index errors.
    """
    _, first, last = matches[i]
    html_span = doc[first:last]
    # merge (and mark it as a stop word, just in case)
    html_span.merge(is_stop=True)
    for tok in html_span:
        # mark token as bad HTML
        tok._.bad_html = True
    print(html_span.text)
# register both line-break patterns under the ID 'BAD_HTML' and pass in
# merge_and_flag as the function to invoke on a match
matcher.add('BAD_HTML', merge_and_flag,
[{'ORTH': '<'}, {'LOWER': 'br'}, {'ORTH': '>'}],
[{'ORTH': '<'}, {'LOWER': 'br/'}, {'ORTH': '>'}])
doc = nlp(u"Hello<br>world!")
# run the matcher over the processed text
matches = matcher(doc)
# print each token's text together with its bad_html flag
for token in doc:
print(token.text, token._.bad_html)
+aside("Tip: Visualizing matches") +aside("Tip: Visualizing matches")
| When working with entities, you can use #[+api("top-level#displacy") displaCy] | When working with entities, you can use #[+api("top-level#displacy") displaCy]
| to quickly generate a NER visualization from your updated #[code Doc], | to quickly generate a NER visualization from your updated #[code Doc],
@ -315,7 +280,7 @@ p
| that was matched, and invoke it. | that was matched, and invoke it.
+code. +code.
doc = nlp(LOTS_OF_TEXT) doc = nlp(YOUR_TEXT_HERE)
matcher(doc) matcher(doc)
p p
@ -348,6 +313,69 @@ p
| A list of #[code (match_id, start, end)] tuples, describing the | A list of #[code (match_id, start, end)] tuples, describing the
| matches. A match tuple describes a span #[code doc[start:end]]. | matches. A match tuple describes a span #[code doc[start:end]].
+h(3, "matcher-pipeline") Using custom pipeline components
p
| Let's say your data also contains some annoying pre-processing artefacts,
| like leftover HTML line breaks (e.g. #[code <br>] or
| #[code <BR/>]). To make your text easier to analyse, you want to
| merge each of them into a single token and flag them, to make sure you
| can ignore them later. Ideally, this should all be done automatically
| as you process the text. You can achieve this by adding a
| #[+a("/usage/processing-pipelines#custom-components") custom pipeline component]
| that's called on each #[code Doc] object, merges each leftover HTML span
| into a single token and sets a custom attribute #[code bad_html] on it.
+code-exec.
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Token
# we're using a class because the component needs to be initialised with
# the shared vocab via the nlp object
class BadHTMLMerger(object):
    """Pipeline component that merges leftover HTML line breaks.

    Each match is merged into a single token, marked as a stop word and
    flagged via the custom attribute token._.bad_html.
    """

    def __init__(self, nlp):
        """Register the token extension and set up the matcher.

        nlp: The nlp object; its vocab is shared with the matcher.
        """
        # register a new token extension to flag bad HTML, but only if it
        # doesn't exist yet, so the component can be created more than once
        # without re-registering an existing extension
        if not Token.has_extension('bad_html'):
            Token.set_extension('bad_html', default=False)
        self.matcher = Matcher(nlp.vocab)
        self.matcher.add('BAD_HTML', None,
                         [{'ORTH': '<'}, {'LOWER': 'br'}, {'ORTH': '>'}],
                         [{'ORTH': '<'}, {'LOWER': 'br/'}, {'ORTH': '>'}])

    def __call__(self, doc):
        """Merge and flag all bad HTML matches in the doc.

        doc: The Doc object to process.
        RETURNS: The same Doc, with matched spans merged and flagged.
        """
        # this method is invoked when the component is called on a Doc
        matches = self.matcher(doc)
        # collect all matched spans first, so nothing is merged while the
        # matches are still being read
        spans = [doc[start:end] for match_id, start, end in matches]
        for span in spans:
            span.merge(is_stop=True)  # merge (and mark it as a stop word)
            for token in span:
                token._.bad_html = True  # mark token as bad HTML
        return doc
# load the model and create the component with the nlp object
nlp = spacy.load('en_core_web_sm')
html_merger = BadHTMLMerger(nlp)
nlp.add_pipe(html_merger, last=True) # add component as the last step of the pipeline
# process a text that contains two leftover line breaks
doc = nlp(u"Hello<br>world! <br/> This is a test.")
# print each token's text together with its bad_html flag
for token in doc:
print(token.text, token._.bad_html)
p
| Instead of hard-coding the patterns into the component, you could also
| make it take a path to a JSON file containing the patterns. This lets
| you reuse the component with different patterns, depending on your
| application:
+code.
# hypothetical usage: the BadHTMLMerger shown above only accepts nlp, so
# its __init__ would first need to be extended to take a path argument
html_merger = BadHTMLMerger(nlp, path='/path/to/patterns.json')
+infobox
| For more details and examples of how to
| #[strong create custom pipeline components] and
| #[strong extension attributes], see the
| #[+a("/usage/processing-pipelines") usage guide].
+h(3, "regex") Using regular expressions +h(3, "regex") Using regular expressions
p p