From a23f487b06f3ed1ba1166e32bd1afadbca8b3e1f Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 22 May 2017 18:48:20 +0200 Subject: [PATCH 1/3] Tidy up displaCy and add "manual" option Also don't require title in EntityRenderer --- spacy/displacy/__init__.py | 29 ++++++++++++++++------------- spacy/displacy/render.py | 2 +- website/docs/api/displacy.jade | 18 ++++++++++++++++++ website/docs/usage/visualizers.jade | 24 ++++++++---------------- 4 files changed, 43 insertions(+), 30 deletions(-) diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index f338a2e6c..b27370909 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -10,27 +10,28 @@ _html = {} IS_JUPYTER = is_in_jupyter() -def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER, options={}): +def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER, + options={}, manual=False): """Render displaCy visualisation. docs (list or Doc): Document(s) to visualise. style (unicode): Visualisation style, 'dep' or 'ent'. page (bool): Render markup as full HTML page. minify (bool): Minify HTML markup. - jupyter (bool): Experimental, use Jupyter's display() to output markup. + jupyter (bool): Experimental, use Jupyter's `display()` to output markup. options (dict): Visualiser-specific options, e.g. colors. + manual (bool): Don't parse `Doc` and instead, expect a dict or list of dicts. RETURNS (unicode): Rendered HTML markup. 
""" - if isinstance(docs, Doc): - docs = [docs] - if style == 'dep': - renderer = DependencyRenderer(options=options) - parsed = [parse_deps(doc, options) for doc in docs] - elif style == 'ent': - renderer = EntityRenderer(options=options) - parsed = [parse_ents(doc, options) for doc in docs] - else: + factories = {'dep': (DependencyRenderer, parse_deps), + 'ent': (EntityRenderer, parse_ents)} + if style not in factories: raise ValueError("Unknown style: %s" % style) + if isinstance(docs, Doc) or isinstance(docs, dict): + docs = [docs] + renderer, converter = factories[style] + renderer = renderer(options=options) + parsed = [converter(doc, options) for doc in docs] if not manual else docs _html['parsed'] = renderer.render(parsed, page=page, minify=minify).strip() html = _html['parsed'] if jupyter: # return HTML rendered by IPython display() @@ -39,7 +40,8 @@ def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER, opti return html -def serve(docs, style='dep', page=True, minify=False, options={}, port=5000): +def serve(docs, style='dep', page=True, minify=False, options={}, manual=False, + port=5000): """Serve displaCy visualisation. docs (list or Doc): Document(s) to visualise. @@ -47,10 +49,11 @@ def serve(docs, style='dep', page=True, minify=False, options={}, port=5000): page (bool): Render markup as full HTML page. minify (bool): Minify HTML markup. options (dict): Visualiser-specific options, e.g. colors. + manual (bool): Don't parse `Doc` and instead, expect a dict or list of dicts. port (int): Port to serve visualisation. """ from wsgiref import simple_server - render(docs, style=style, page=page, minify=minify, options=options) + render(docs, style=style, page=page, minify=minify, options=options, manual=manual) httpd = simple_server.make_server('0.0.0.0', port, app) prints("Using the '%s' visualizer" % style, title="Serving on port %d..." 
% port) httpd.serve_forever() diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index 6a786437a..e9b792881 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -175,7 +175,7 @@ class EntityRenderer(object): minify (bool): Minify HTML markup. RETURNS (unicode): Rendered HTML markup. """ - rendered = [self.render_ents(p['text'], p['ents'], p['title']) for p in parsed] + rendered = [self.render_ents(p['text'], p['ents'], p.get('title', None)) for p in parsed] if page: docs = ''.join([TPL_FIGURE.format(content=doc) for doc in rendered]) markup = TPL_PAGE.format(content=docs) diff --git a/website/docs/api/displacy.jade b/website/docs/api/displacy.jade index a14671b4a..a5352ade8 100644 --- a/website/docs/api/displacy.jade +++ b/website/docs/api/displacy.jade @@ -54,6 +54,15 @@ p +cell #[+a("#options") Visualizer-specific options], e.g. colors. +cell #[code {}] + +row + +cell #[code manual] + +cell bool + +cell + | Don't parse #[code Doc] and instead, expect a dict or list of + | dicts. #[+a("/docs/usage/visualizers#manual-usage") See here] + | for formats and examples. + +cell #[code False] + +row +cell #[code port] +cell int @@ -111,6 +120,15 @@ p Render a dependency parse tree or named entity visualization. +cell #[+a("#options") Visualizer-specific options], e.g. colors. +cell #[code {}] + +row + +cell #[code manual] + +cell bool + +cell + | Don't parse #[code Doc] and instead, expect a dict or list of + | dicts. #[+a("/docs/usage/visualizers#manual-usage") See here] + | for formats and examples. + +cell #[code False] + +footrow +cell returns +cell unicode diff --git a/website/docs/usage/visualizers.jade b/website/docs/usage/visualizers.jade index ea675e70c..93a4b5567 100644 --- a/website/docs/usage/visualizers.jade +++ b/website/docs/usage/visualizers.jade @@ -287,24 +287,17 @@ p | #[+a("http://www.nltk.org") NLTK] or | #[+a("https://github.com/tensorflow/models/tree/master/syntaxnet") SyntaxNet]. 
| Simply convert the dependency parse or recognised entities to displaCy's - | format and import #[code DependencyRenderer] or #[code EntityRenderer] - | from #[code spacy.displacy.render]. A renderer class can be is initialised - | with a dictionary of options. To generate the visualization markup, call - | the renderer's #[code render()] method on a list of dictionaries (one - | per visualization). - + | format and set #[code manual=True] on either #[code render()] or + | #[code serve()]. +aside-code("Example"). - from spacy.displacy.render import EntityRenderer - ex = [{'text': 'But Google is starting from behind.', 'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}], 'title': None}] - renderer = EntityRenderer() - html = renderer.render(ex) + html = displacy.render(ex, style='ent', manual=True) -+code("DependencyRenderer input"). - [{ ++code("DEP input"). + { 'words': [ {'text': 'This', 'tag': 'DT'}, {'text': 'is', 'tag': 'VBZ'}, @@ -314,11 +307,10 @@ p {'start': 0, 'end': 1, 'label': 'nsubj', 'dir': 'left'}, {'start': 2, 'end': 3, 'label': 'det', 'dir': 'left'}, {'start': 1, 'end': 3, 'label': 'attr', 'dir': 'right'}] - }] + } -+code("EntityRenderer input"). - [{ ++code("ENT input"). + { 'text': 'But Google is starting from behind.', 'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}], 'title': None - }] From 701cba1524ed9aaf501d979a461358dcb25d22a6 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 22 May 2017 18:53:14 +0200 Subject: [PATCH 2/3] Update models documentation with notes --- website/docs/usage/models.jade | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/website/docs/usage/models.jade b/website/docs/usage/models.jade index eb63cd0bb..2dec5197e 100644 --- a/website/docs/usage/models.jade +++ b/website/docs/usage/models.jade @@ -101,6 +101,9 @@ p | directory. 
You can then use #[code spacy.load()] to load it via its | package name, create a #[+a("#usage-link") shortcut link] to assign it a | custom name, or #[+a("usage-import") import it] explicitly as a module. + | If you need to download models as part of an automated process, we + | recommend using pip with a direct link, instead of relying on spaCy's + | #[+api("cli#download") #[code download]] command. +h(3, "download-manual") Manual download and installation @@ -162,6 +165,14 @@ p | The #[+api("cli#link") #[code link]] command will create a symlink | in the #[code spacy/data] directory. ++aside("Why does spaCy use symlinks?") + | Symlinks were originally introduced to maintain backwards compatibility, + | as older versions expected model data to live within #[code spacy/data]. + | However, we decided to keep using them in v2.0 instead of opting for + | a config file. There'll always be a need for assigning and saving custom + | model names or IDs. And your system already comes with a native solution + | to mapping unicode aliases to file paths: symbolic links. + +code(false, "bash"). python -m spacy link [package name or path] [shortcut] [--force] @@ -179,7 +190,7 @@ p python -m spacy link /Users/you/model my_amazing_model +infobox("Important note") - | In order to create a symlink, your user needs the required permissions. + | In order to create a symlink, your user needs the #[strong required permissions]. | If you've installed spaCy to a system directory and don't have admin | privileges, the #[code spacy link] command may fail. The easiest solution | is to re-run the command as admin, or use a #[code virtualenv]. 
For more @@ -189,16 +200,26 @@ p +h(3, "usage-import") Importing models as modules p - | If you've installed a model via pip, you can also #[code import] it - | directly and then call its #[code load()] method with no arguments: + | If you've installed a model via spaCy's downloader, or directly via pip, + | you can also #[code import] it and then call its #[code load()] method + | with no arguments: +code. - import spacy import en_core_web_md nlp = en_core_web_md.load() doc = nlp(u'This is a sentence.') +p + | How you choose to load your models ultimately depends on personal + | preference. However, #[strong for larger code bases], we usually recommend + | native imports, as this will make it easier to integrate models with your + | existing build process, continuous integration workflow and testing + | framework. It'll also prevent you from ever trying to load a model that + | is not installed, as your code will raise an #[code ImportError] + | immediately, instead of failing somewhere down the line when calling + | #[code spacy.load()]. + +h(2, "own-models") Using your own models p From 4cd26bcb83ab0abebb6c85a2f4812909753d5eae Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 22 May 2017 19:04:02 +0200 Subject: [PATCH 3/3] Update docs on rule-based matching and add examples --- website/docs/usage/rule-based-matching.jade | 150 +++++++++++++++++--- 1 file changed, 129 insertions(+), 21 deletions(-) diff --git a/website/docs/usage/rule-based-matching.jade b/website/docs/usage/rule-based-matching.jade index ef26f69b6..ae9e4d086 100644 --- a/website/docs/usage/rule-based-matching.jade +++ b/website/docs/usage/rule-based-matching.jade @@ -20,13 +20,13 @@ p +list("numbers") +item - | A token whose #[strong lower-case form matches "hello"], e.g. "Hello" + | A token whose #[strong lowercase form matches "hello"], e.g. "Hello" | or "HELLO". +item | A token whose #[strong #[code is_punct] flag is set to #[code True]], | i.e. any punctuation. 
+item - | A token whose #[strong lower-case form matches "world"], e.g. "World" + | A token whose #[strong lowercase form matches "world"], e.g. "World" | or "WORLD". +code. @@ -95,10 +95,6 @@ p nlp = spacy.load('en') matcher = Matcher(nlp.vocab) - matcher.add('GoogleIO', on_match=add_event_ent, - [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}], - [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}, {'IS_DIGIT': True}]) - # Get the ID of the 'EVENT' entity type. This is required to set an entity. EVENT = nlp.vocab.strings['EVENT'] @@ -108,6 +104,10 @@ p match_id, start, end = matches[i] doc.ents += ((EVENT, start, end),) + matcher.add('GoogleIO', on_match=add_event_ent, + [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}], + [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}, {'IS_DIGIT': True}]) + p | In addition to mentions of "Google I/O", your data also contains some | annoying pre-processing artefacts, like leftover HTML line breaks @@ -117,10 +117,6 @@ p | function #[code merge_and_flag]: +code. - matcher.add('BAD_HTML', on_match=merge_and_flag, - [{'ORTH': '<'}, {'LOWER': 'br'}, {'ORTH': '>'}], - [{'ORTH': '<'}, {'LOWER': 'br/'}, {'ORTH': '>'}]) - # Add a new custom flag to the vocab, which is always False by default. # BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span. 
BAD_HTML_FLAG = doc.vocab.add_flag(lambda text: False) @@ -131,6 +127,10 @@ p span.merge(is_stop=True) # merge (and mark it as a stop word, just in case) span.set_flag(BAD_HTML_FLAG, True) # set BAD_HTML_FLAG + matcher.add('BAD_HTML', on_match=merge_and_flag, + [{'ORTH': '<'}, {'LOWER': 'br'}, {'ORTH': '>'}], + [{'ORTH': '<'}, {'LOWER': 'br/'}, {'ORTH': '>'}]) + +aside("Tip: Visualizing matches") | When working with entities, you can use #[+api("displacy") displaCy] | to quickly generate a NER visualization from your updated #[code Doc], @@ -146,18 +146,16 @@ p p | We can now call the matcher on our documents. The patterns will be - | matched in the order they occur in the text. + | matched in the order they occur in the text. The matcher will then + | iterate over the matches, look up the callback for the match ID + | that was matched, and invoke it. +code. doc = nlp(LOTS_OF_TEXT) matcher(doc) -+h(3, "on_match-callback") The callback function - p - | The matcher will first collect all matches over the document. It will - | then iterate over the matches, lookup the callback for the entity ID - | that was matched, and invoke it. When the callback is invoked, it is + | When the callback is invoked, it is | passed four arguments: the matcher itself, the document, the position of | the current match, and the total list of matches. This allows you to | write callbacks that consider the entire set of matched phrases, so that @@ -185,11 +183,24 @@ p +cell | A list of #[code (match_id, start, end)] tuples, describing the | matches. A match tuple describes a span #[code doc[start:end]]. - | The #[code match_id] is the ID of the added match pattern. -+h(2, "quantifiers") Using quantifiers ++h(2, "quantifiers") Using operators and quantifiers -+table([ "Name", "Description", "Example"]) +p + | The matcher also lets you use quantifiers, specified as the #[code 'OP'] + | key. Quantifiers let you define sequences of tokens to be matched, e.g.
+ | one or more punctuation marks, or specify optional tokens. Note that there + | are no nested or scoped quantifiers – instead, you can build those + | behaviours with #[code on_match] callbacks. + ++aside("Problems with quantifiers") + | Using quantifiers may lead to unexpected results when matching + | variable-length patterns, for example if the next token would also be + | matched by the previous token. This problem should be resolved in a future + | release. For more information, see + | #[+a(gh("spaCy") + "/issues/864") this issue]. + ++table([ "OP", "Description", "Example"]) +row +cell #[code !] +cell match exactly 0 times @@ -210,6 +221,103 @@ p +cell match 0 or 1 times +cell optional, max one ++h(3, "quantifiers-example1") Quantifiers example: Using linguistic annotations + p - | There are no nested or scoped quantifiers. You can build those - | behaviours with #[code on_match] callbacks. + | Let's say you're analysing user comments and you want to find out what + | people are saying about Facebook. You want to start off by finding + | adjectives following "Facebook is" or "Facebook was". This is obviously + | a very rudimentary solution, but it'll be fast, and a great way to get an + | idea of what's in your data. Your pattern could look like this: + ++code. + [{'LOWER': 'facebook'}, {'LEMMA': 'be'}, {'POS': 'ADV', 'OP': '*'}, {'POS': 'ADJ'}] + +p + | This translates to a token whose lowercase form matches "facebook" + | (like Facebook, facebook or FACEBOOK), followed by a token with the lemma + | "be" (for example, is, was, or 's), followed by an #[strong optional] adverb, + | followed by an adjective. Using the linguistic annotations here is + | especially useful, because you can tell spaCy to match "Facebook's + | annoying", but #[strong not] "Facebook's annoying ads". The optional + | adverb makes sure you won't miss adjectives with intensifiers, like + | "pretty awful" or "very nice".
+ +p + | To get a quick overview of the results, you could collect all sentences + | containing a match and render them with the + | #[+a("/docs/usage/visualizers") displaCy visualizer]. + | In the callback function, you'll have access to the #[code start] and + | #[code end] of each match, as well as the parent #[code Doc]. This lets + | you determine the sentence containing the match, + | #[code doc[start : end].sent], and calculate the start and end of the + | matched span within the sentence. Using displaCy in + | #[+a("/docs/usage/visualizers#manual-usage") "manual" mode] lets you + | pass in a list of dictionaries containing the text and entities to render. + ++code. + from spacy import displacy + from spacy.matcher import Matcher + + nlp = spacy.load('en') + matcher = Matcher(nlp.vocab) + matched_sents = [] # collect data of matched sentences to be visualized + + def collect_sents(matcher, doc, i, matches): + match_id, start, end = matches[i] + span = doc[start : end] # matched span + sent = span.sent # sentence containing matched span + # append mock entity for match in displaCy style to matched_sents + # get the match span by offsetting the start and end of the span with the + # start and end of the sentence in the doc + match_ents = [{'start': span.start-sent.start, 'end': span.end-sent.start, + 'label': 'MATCH'}] + matched_sents.append({'text': sent.text, 'ents': match_ents }) + + pattern = [{'LOWER': 'facebook'}, {'LEMMA': 'be'}, {'POS': 'ADV', 'OP': '*'}, + {'POS': 'ADJ'}] + matcher.add('FacebookIs', collect_sents, pattern) # add pattern + matches = matcher(nlp(LOTS_OF_TEXT)) # match on your text + + # serve visualization of sentences containing match with displaCy + # set manual=True to make displaCy render straight from a dictionary + displacy.serve(matched_sents, style='ent', manual=True) + + ++h(3, "quantifiers-example2") Quantifiers example: Phone numbers + +p + | Phone numbers can have many different formats and matching them is often + | tricky.
During tokenization, spaCy will leave sequences of numbers intact + | and only split on whitespace and punctuation. This means that your match + | pattern will have to look out for number sequences of a certain length, + | surrounded by specific punctuation – depending on the + | #[+a("https://en.wikipedia.org/wiki/National_conventions_for_writing_telephone_numbers") national conventions]. + +p + | The #[code IS_DIGIT] flag is not very helpful here, because it doesn't + | tell us anything about the length. However, you can use the #[code SHAPE] + | flag, with each #[code d] representing a digit: + ++code. + [{'ORTH': '('}, {'SHAPE': 'ddd'}, {'ORTH': ')'}, {'SHAPE': 'dddd'}, + {'ORTH': '-', 'OP': '?'}, {'SHAPE': 'dddd'}] + +p + | This will match phone numbers of the format #[strong (123) 4567 8901] or + | #[strong (123) 4567-8901]. To also match formats like #[strong (123) 456 789], + | you can add a second pattern using #[code 'ddd'] in place of #[code 'dddd']. + | By hard-coding some values, you can match only certain, country-specific + | numbers. For example, here's a pattern to match the most common formats of + | #[+a("https://en.wikipedia.org/wiki/National_conventions_for_writing_telephone_numbers#Germany") international German numbers]: + ++code. + [{'ORTH': '+'}, {'ORTH': '49'}, {'ORTH': '(', 'OP': '?'}, {'SHAPE': 'dddd'}, + {'ORTH': ')', 'OP': '?'}, {'SHAPE': 'dddddd'}] + +p + | Depending on the formats your application needs to match, creating an + | extensive set of rules like this is often better than training a model. + | It'll produce more predictable results, is much easier to modify and + | extend, and doesn't require any training data – only a set of + | test cases.