From a23f487b06f3ed1ba1166e32bd1afadbca8b3e1f Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Mon, 22 May 2017 18:48:20 +0200
Subject: [PATCH 1/3] Tidy up displaCy and add "manual" option

Also don't require title in EntityRenderer
---
 spacy/displacy/__init__.py          | 29 ++++++++++++++++-------------
 spacy/displacy/render.py            |  2 +-
 website/docs/api/displacy.jade      | 18 ++++++++++++++++++
 website/docs/usage/visualizers.jade | 24 ++++++++----------------
 4 files changed, 43 insertions(+), 30 deletions(-)

diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py
index f338a2e6c..b27370909 100644
--- a/spacy/displacy/__init__.py
+++ b/spacy/displacy/__init__.py
@@ -10,27 +10,28 @@ _html = {}
 IS_JUPYTER = is_in_jupyter()
 
 
-def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER, options={}):
+def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
+          options={}, manual=False):
     """Render displaCy visualisation.
 
     docs (list or Doc): Document(s) to visualise.
     style (unicode): Visualisation style, 'dep' or 'ent'.
     page (bool): Render markup as full HTML page.
     minify (bool): Minify HTML markup.
-    jupyter (bool): Experimental, use Jupyter's display() to output markup.
+    jupyter (bool): Experimental, use Jupyter's `display()` to output markup.
     options (dict): Visualiser-specific options, e.g. colors.
+    manual (bool): Don't parse `Doc`; instead, expect a dict or list of dicts.
     RETURNS (unicode): Rendered HTML markup.
     """
-    if isinstance(docs, Doc):
-        docs = [docs]
-    if style == 'dep':
-        renderer = DependencyRenderer(options=options)
-        parsed = [parse_deps(doc, options) for doc in docs]
-    elif style == 'ent':
-        renderer = EntityRenderer(options=options)
-        parsed = [parse_ents(doc, options) for doc in docs]
-    else:
+    factories = {'dep': (DependencyRenderer, parse_deps),
+                 'ent': (EntityRenderer, parse_ents)}
+    if style not in factories:
         raise ValueError("Unknown style: %s" % style)
+    if isinstance(docs, Doc) or isinstance(docs, dict):
+        docs = [docs]
+    renderer, converter = factories[style]
+    renderer = renderer(options=options)
+    parsed = [converter(doc, options) for doc in docs] if not manual else docs
     _html['parsed'] = renderer.render(parsed, page=page, minify=minify).strip()
     html = _html['parsed']
     if jupyter: # return HTML rendered by IPython display()
@@ -39,7 +40,8 @@ def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER, opti
     return html
 
 
-def serve(docs, style='dep', page=True, minify=False, options={}, port=5000):
+def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
+          port=5000):
     """Serve displaCy visualisation.
 
     docs (list or Doc): Document(s) to visualise.
@@ -47,10 +49,11 @@ def serve(docs, style='dep', page=True, minify=False, options={}, port=5000):
     page (bool): Render markup as full HTML page.
     minify (bool): Minify HTML markup.
     options (dict): Visualiser-specific options, e.g. colors.
+    manual (bool): Don't parse `Doc`; instead, expect a dict or list of dicts.
     port (int): Port to serve visualisation.
     """
     from wsgiref import simple_server
-    render(docs, style=style, page=page, minify=minify, options=options)
+    render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
     httpd = simple_server.make_server('0.0.0.0', port, app)
     prints("Using the '%s' visualizer" % style, title="Serving on port %d..." % port)
     httpd.serve_forever()
diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py
index 6a786437a..e9b792881 100644
--- a/spacy/displacy/render.py
+++ b/spacy/displacy/render.py
@@ -175,7 +175,7 @@ class EntityRenderer(object):
         minify (bool): Minify HTML markup.
         RETURNS (unicode): Rendered HTML markup.
         """
-        rendered = [self.render_ents(p['text'], p['ents'], p['title']) for p in parsed]
+        rendered = [self.render_ents(p['text'], p['ents'], p.get('title', None)) for p in parsed]
         if page:
             docs = ''.join([TPL_FIGURE.format(content=doc) for doc in rendered])
             markup = TPL_PAGE.format(content=docs)
diff --git a/website/docs/api/displacy.jade b/website/docs/api/displacy.jade
index a14671b4a..a5352ade8 100644
--- a/website/docs/api/displacy.jade
+++ b/website/docs/api/displacy.jade
@@ -54,6 +54,15 @@ p
         +cell #[+a("#options") Visualizer-specific options], e.g. colors.
         +cell #[code {}]
 
+    +row
+        +cell #[code manual]
+        +cell bool
+        +cell
+            |  Don't parse #[code Doc]; instead, expect a dict or list of
+            |  dicts. #[+a("/docs/usage/visualizers#manual-usage") See here]
+            |  for formats and examples.
+        +cell #[code False]
+
     +row
         +cell #[code port]
         +cell int
@@ -111,6 +120,15 @@ p Render a dependency parse tree or named entity visualization.
         +cell #[+a("#options") Visualizer-specific options], e.g. colors.
         +cell #[code {}]
 
+    +row
+        +cell #[code manual]
+        +cell bool
+        +cell
+            |  Don't parse #[code Doc]; instead, expect a dict or list of
+            |  dicts. #[+a("/docs/usage/visualizers#manual-usage") See here]
+            |  for formats and examples.
+        +cell #[code False]
+
     +footrow
         +cell returns
         +cell unicode
diff --git a/website/docs/usage/visualizers.jade b/website/docs/usage/visualizers.jade
index ea675e70c..93a4b5567 100644
--- a/website/docs/usage/visualizers.jade
+++ b/website/docs/usage/visualizers.jade
@@ -287,24 +287,17 @@ p
     |  #[+a("http://www.nltk.org") NLTK] or
     |  #[+a("https://github.com/tensorflow/models/tree/master/syntaxnet") SyntaxNet].
     |  Simply convert the dependency parse or recognised entities to displaCy's
-    |  format and import #[code DependencyRenderer] or #[code EntityRenderer]
-    |  from #[code spacy.displacy.render]. A renderer class can be is initialised
-    |  with a dictionary of options. To generate the visualization markup, call
-    |  the renderer's #[code render()] method on a list of dictionaries (one
-    |  per visualization).
-
+    |  format and set #[code manual=True] on either #[code render()] or
+    |  #[code serve()].
 
 +aside-code("Example").
-    from spacy.displacy.render import EntityRenderer
-
     ex = [{'text': 'But Google is starting from behind.',
            'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}],
            'title': None}]
-    renderer = EntityRenderer()
-    html = renderer.render(ex)
+    html = displacy.render(ex, style='ent', manual=True)
 
-+code("DependencyRenderer input").
-    [{
++code("DEP input").
+    {
         'words': [
             {'text': 'This', 'tag': 'DT'},
             {'text': 'is', 'tag': 'VBZ'},
@@ -314,11 +307,10 @@ p
             {'start': 0, 'end': 1, 'label': 'nsubj', 'dir': 'left'},
             {'start': 2, 'end': 3, 'label': 'det', 'dir': 'left'},
             {'start': 1, 'end': 3, 'label': 'attr', 'dir': 'right'}]
-    }]
+    }
 
-+code("EntityRenderer input").
-    [{
++code("ENT input").
+    {
         'text': 'But Google is starting from behind.',
         'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}],
         'title': None
-    }]

From 701cba1524ed9aaf501d979a461358dcb25d22a6 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Mon, 22 May 2017 18:53:14 +0200
Subject: [PATCH 2/3] Update models documentation with notes

---
 website/docs/usage/models.jade | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/website/docs/usage/models.jade b/website/docs/usage/models.jade
index eb63cd0bb..2dec5197e 100644
--- a/website/docs/usage/models.jade
+++ b/website/docs/usage/models.jade
@@ -101,6 +101,9 @@ p
     |  directory. You can then use #[code spacy.load()] to load it via its
     |  package name, create a #[+a("#usage-link") shortcut link] to assign it a
     |  custom name, or #[+a("usage-import") import it] explicitly as a module.
+    |  If you need to download models as part of an automated process, we
+    |  recommend using pip with a direct link, instead of relying on spaCy's
+    |  #[+api("cli#download") #[code download]] command.
 
 +h(3, "download-manual") Manual download and installation
 
@@ -162,6 +165,14 @@ p
     |  The #[+api("cli#link") #[code link]] command will create a symlink
     |  in the #[code spacy/data] directory.
 
++aside("Why does spaCy use symlinks?")
+    |  Symlinks were originally introduced to maintain backwards compatibility,
+    |  as older versions expected model data to live within #[code spacy/data].
+    |  However, we decided to keep using them in v2.0 instead of opting for
+    |  a config file. There'll always be a need for assigning and saving custom
+    |  model names or IDs. And your system already comes with a native solution
+    |  to mapping unicode aliases to file paths: symbolic links.
+
 +code(false, "bash").
     python -m spacy link [package name or path] [shortcut] [--force]
 
@@ -179,7 +190,7 @@ p
     python -m spacy link /Users/you/model my_amazing_model
 
 +infobox("Important note")
-    |  In order to create a symlink, your user needs the required permissions.
+    |  In order to create a symlink, your user needs the #[strong required permissions].
     |  If you've installed spaCy to a system directory and don't have admin
     |  privileges, the #[code spacy link] command may fail. The easiest solution
     |  is to re-run the command as admin, or use a #[code virtualenv]. For more
@@ -189,16 +200,26 @@ p
 +h(3, "usage-import") Importing models as modules
 
 p
-    |  If you've installed a model via pip, you can also #[code import] it
-    |  directly and then call its #[code load()] method with no arguments:
+    |  If you've installed a model via spaCy's downloader, or directly via pip,
+    |  you can also #[code import] it and then call its #[code load()] method
+    |  with no arguments:
 
 +code.
-    import spacy
     import en_core_web_md
 
     nlp = en_core_web_md.load()
     doc = nlp(u'This is a sentence.')
 
+p
+    |  How you choose to load your models ultimately depends on personal
+    |  preference. However, #[strong for larger code bases], we usually recommend
+    |  native imports, as this will make it easier to integrate models with your
+    |  existing build process, continuous integration workflow and testing
+    |  framework. It'll also prevent you from ever trying to load a model that
+    |  is not installed, as your code will raise an #[code ImportError]
+    |  immediately, instead of failing somewhere down the line when calling
+    |  #[code spacy.load()].
+
 +h(2, "own-models") Using your own models
 
 p

From 4cd26bcb83ab0abebb6c85a2f4812909753d5eae Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Mon, 22 May 2017 19:04:02 +0200
Subject: [PATCH 3/3] Update docs on rule-based matching and add examples

---
 website/docs/usage/rule-based-matching.jade | 150 +++++++++++++++++---
 1 file changed, 129 insertions(+), 21 deletions(-)

diff --git a/website/docs/usage/rule-based-matching.jade b/website/docs/usage/rule-based-matching.jade
index ef26f69b6..ae9e4d086 100644
--- a/website/docs/usage/rule-based-matching.jade
+++ b/website/docs/usage/rule-based-matching.jade
@@ -20,13 +20,13 @@ p
 
 +list("numbers")
     +item
-        |  A token whose #[strong lower-case form matches "hello"], e.g. "Hello"
+        |  A token whose #[strong lowercase form matches "hello"], e.g. "Hello"
         |  or "HELLO".
     +item
         |  A token whose #[strong #[code is_punct] flag is set to #[code True]],
         |  i.e. any punctuation.
     +item
-        |  A token whose #[strong lower-case form matches "world"], e.g. "World"
+        |  A token whose #[strong lowercase form matches "world"], e.g. "World"
         |  or "WORLD".
 
 +code.
@@ -95,10 +95,6 @@ p
     nlp = spacy.load('en')
     matcher = Matcher(nlp.vocab)
 
-    matcher.add('GoogleIO', on_match=add_event_ent,
-                [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}],
-                [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}, {'IS_DIGIT': True}])
-
     # Get the ID of the 'EVENT' entity type. This is required to set an entity.
     EVENT = nlp.vocab.strings['EVENT']
 
@@ -108,6 +104,10 @@ p
         match_id, start, end = matches[i]
         doc.ents += ((EVENT, start, end),)
 
+    matcher.add('GoogleIO', add_event_ent,
+                [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}],
+                [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}, {'IS_DIGIT': True}])
+
 p
     |  In addition to mentions of "Google I/O", your data also contains some
     |  annoying pre-processing artefacts, like leftover HTML line breaks
@@ -117,10 +117,6 @@ p
     |  function #[code merge_and_flag]:
 
 +code.
-    matcher.add('BAD_HTML', on_match=merge_and_flag,
-                [{'ORTH': '&lt;'}, {'LOWER': 'br'}, {'ORTH': '&gt;'}],
-                [{'ORTH': '&lt;'}, {'LOWER': 'br/'}, {'ORTH': '&gt;'}])
-
     # Add a new custom flag to the vocab, which is always False by default.
     # BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span.
     BAD_HTML_FLAG = doc.vocab.add_flag(lambda text: False)
@@ -131,6 +127,10 @@ p
         span.merge(is_stop=True) # merge (and mark it as a stop word, just in case)
         span.set_flag(BAD_HTML_FLAG, True) # set BAD_HTML_FLAG
 
+    matcher.add('BAD_HTML', merge_and_flag,
+                [{'ORTH': '&lt;'}, {'LOWER': 'br'}, {'ORTH': '&gt;'}],
+                [{'ORTH': '&lt;'}, {'LOWER': 'br/'}, {'ORTH': '&gt;'}])
+
 +aside("Tip: Visualizing matches")
     |  When working with entities, you can use #[+api("displacy") displaCy]
     |  to quickly generate a NER visualization from your updated #[code Doc],
@@ -146,18 +146,16 @@ p
 
 p
     |  We can now call the matcher on our documents. The patterns will be
-    |  matched in the order they occur in the text.
+    |  matched in the order they occur in the text. The matcher will then
+    |  iterate over the matches, look up the callback for the match ID
+    |  that was matched, and invoke it.
 
 +code.
     doc = nlp(LOTS_OF_TEXT)
     matcher(doc)
 
-+h(3, "on_match-callback") The callback function
-
 p
-    |  The matcher will first collect all matches over the document. It will
-    |  then iterate over the matches, lookup the callback for the entity ID
-    |  that was matched, and invoke it. When the callback is invoked, it is
+    |  When the callback is invoked, it is
     |  passed four arguments: the matcher itself, the document, the position of
     |  the current match, and the total list of matches. This allows you to
     |  write callbacks that consider the entire set of matched phrases, so that
@@ -185,11 +183,24 @@ p
         +cell
             |  A list of #[code (match_id, start, end)] tuples, describing the
             |  matches. A match tuple describes a span #[code doc[start:end]].
-            |  The #[code match_id] is the ID of the added match pattern.
 
-+h(2, "quantifiers") Using quantifiers
++h(2, "quantifiers") Using operators and quantifiers
 
-+table([ "Name", "Description", "Example"])
+p
+    |  The matcher also lets you use quantifiers, specified as the #[code 'OP']
+    |  key. Quantifiers let you define sequences of tokens to be matched, e.g.
+    |  one or more punctuation marks, or specify optional tokens. Note that there
+    |  are no nested or scoped quantifiers – instead, you can build those
+    |  behaviours with #[code on_match] callbacks.
+
++aside("Problems with quantifiers")
+    |  Using quantifiers may lead to unexpected results when matching
+    |  variable-length patterns, for example if the next token would also be
+    |  matched by the previous token. This problem should be resolved in a future
+    |  release. For more information, see
+    |  #[+a(gh("spaCy") + "/issues/864") this issue].
+
++table([ "OP", "Description", "Example"])
     +row
         +cell #[code !]
         +cell match exactly 0 times
@@ -210,6 +221,103 @@ p
         +cell match 0 or 1 times
         +cell optional, max one
 
++h(3, "quantifiers-example1") Quantifiers example: Using linguistic annotations
+
 p
-    |  There are no nested or scoped quantifiers. You can build those
-    |  behaviours with #[code on_match] callbacks.
+    |  Let's say you're analysing user comments and you want to find out what
+    |  people are saying about Facebook. You want to start off by finding
+    |  adjectives following "Facebook is" or "Facebook was". This is obviously
+    |  a very rudimentary solution, but it'll be fast, and a great way to get
+    |  an idea of what's in your data. Your pattern could look like this:
+
++code.
+    [{'LOWER': 'facebook'}, {'LEMMA': 'be'}, {'POS': 'ADV', 'OP': '*'}, {'POS': 'ADJ'}]
+
+p
+    |  This translates to a token whose lowercase form matches "facebook"
+    |  (like Facebook, facebook or FACEBOOK), followed by a token with the lemma
+    |  "be" (for example, is, was, or 's), followed by an #[strong optional] adverb,
+    |  followed by an adjective. Using the linguistic annotations here is
+    |  especially useful, because you can tell spaCy to match "Facebook's
+    |  annoying", but #[strong not] "Facebook's annoying ads". The optional
+    |  adverb makes sure you won't miss adjectives with intensifiers, like
+    |  "pretty awful" or "very nice".
+
+p
+    |  To get a quick overview of the results, you could collect all sentences
+    |  containing a match and render them with the
+    |  #[+a("/docs/usage/visualizers") displaCy visualizer].
+    |  In the callback function, you'll have access to the #[code start] and
+    |  #[code end] of each match, as well as the parent #[code Doc]. This lets
+    |  you determine the sentence containing the match,
+    |  #[code doc[start : end].sent], and calculate the start and end of the
+    |  matched span within the sentence. Using displaCy in
+    |  #[+a("/docs/usage/visualizers#manual-usage") "manual" mode] lets you
+    |  pass in a list of dictionaries containing the text and entities to render.
+
++code.
+    from spacy import displacy
+    from spacy.matcher import Matcher
+
+    nlp = spacy.load('en')
+    matcher = Matcher(nlp.vocab)
+    matched_sents = [] # collect data of matched sentences to be visualized
+
+    def collect_sents(matcher, doc, i, matches):
+        match_id, start, end = matches[i]
+        span = doc[start : end] # matched span
+        sent = span.sent # sentence containing matched span
+        # append mock entity for match in displaCy style to matched_sents
+        # get the match span by offsetting the character start and end of the
+        # span with the character start of the sentence in the doc
+        match_ents = [{'start': span.start_char - sent.start_char,
+                       'end': span.end_char - sent.start_char, 'label': 'MATCH'}]
+        matched_sents.append({'text': sent.text, 'ents': match_ents })
+
+    pattern = [{'LOWER': 'facebook'}, {'LEMMA': 'be'}, {'POS': 'ADV', 'OP': '*'},
+               {'POS': 'ADJ'}]
+    matcher.add('FacebookIs', collect_sents, pattern) # add pattern
+    matches = matcher(nlp(LOTS_OF_TEXT)) # match on your text
+
+    # serve visualization of sentences containing match with displaCy
+    # set manual=True to make displaCy render straight from a dictionary
+    displacy.serve(matched_sents, style='ent', manual=True)
+
+
++h(3, "quantifiers-example2") Quantifiers example: Phone numbers
+
+p
+    |  Phone numbers can have many different formats and matching them is often
+    |  tricky. During tokenization, spaCy will leave sequences of numbers intact
+    |  and only split on whitespace and punctuation. This means that your match
+    |  pattern will have to look out for number sequences of a certain length,
+    |  surrounded by specific punctuation – depending on the
+    |  #[+a("https://en.wikipedia.org/wiki/National_conventions_for_writing_telephone_numbers") national conventions].
+
+p
+    |  The #[code IS_DIGIT] flag is not very helpful here, because it doesn't
+    |  tell us anything about the length. However, you can use the #[code SHAPE]
+    |  flag, with each #[code d] representing a digit:
+
++code.
+    [{'ORTH': '('}, {'SHAPE': 'ddd'}, {'ORTH': ')'}, {'SHAPE': 'dddd'},
+     {'ORTH': '-', 'OP': '?'}, {'SHAPE': 'dddd'}]
+
+p
+    |  This will match phone numbers of the format #[strong (123) 4567 8901] or
+    |  #[strong (123) 4567-8901]. To also match formats like #[strong (123) 456 789],
+    |  you can add a second pattern using #[code 'ddd'] in place of #[code 'dddd'].
+    |  By hard-coding some values, you can match only certain, country-specific
+    |  numbers. For example, here's a pattern to match the most common formats of
+    |  #[+a("https://en.wikipedia.org/wiki/National_conventions_for_writing_telephone_numbers#Germany") international German numbers]:
+
++code.
+    [{'ORTH': '+'}, {'ORTH': '49'}, {'ORTH': '(', 'OP': '?'}, {'SHAPE': 'dddd'},
+     {'ORTH': ')', 'OP': '?'}, {'SHAPE': 'dddddd'}]
+
+p
+    |  Depending on the formats your application needs to match, creating an
+    |  extensive set of rules like this is often better than training a model.
+    |  It'll produce more predictable results, is much easier to modify and
+    |  extend, and doesn't require any training data – only a set of
+    |  test cases.