Merge branch 'develop' of https://github.com/explosion/spaCy into develop

2026-03-07 05:11:27 +03:00 · 2017-05-20 13:55:12 +02:00 · 2017-05-20 13:55:12 +02:00 · a93276bb78
commit a93276bb78
parent ce9234f593 8b14476253
13 changed files with 165 additions and 56 deletions
--- a/website/_includes/_mixins.jade
+++ b/website/_includes/_mixins.jade
@ -41,7 +41,7 @@ mixin src(url)
    path - [string] path to API docs page relative to /docs/api/

 mixin api(path)
-    +a("/docs/api/" + path, true)(target="_self").u-no-border.u-inline-block
+    +a("/docs/api/" + path, true)(target="_self").u-no-border.u-inline-block.u-nowrap
        block

        |  #[+icon("book", 18).o-icon--inline.u-color-theme]
--- a/website/assets/css/_variables.sass
+++ b/website/assets/css/_variables.sass
@ -26,7 +26,7 @@ $font-code: 'Source Code Pro', Consolas, 'Andale Mono', Menlo, Monaco, Courier,

 // Colors

-$colors: ( blue: #09a3d5, red: #d9515d )
+$colors: ( blue: #09a3d5, red: #d9515d, green: #08c35e )

 $color-back: #fff !default
 $color-front: #1a1e23 !default
--- a/website/assets/css/style_green.sass
+++ b/website/assets/css/style_green.sass
@ -0,0 +1,4 @@
+//- 💫 STYLESHEET (GREEN)
+
+$theme: green
+@import style
--- a/website/assets/img/pattern_green.jpg
+++ b/website/assets/img/pattern_green.jpg
--- a/website/docs/api/_data.json
+++ b/website/docs/api/_data.json
@ -2,8 +2,13 @@
    "sidebar": {
        "Introduction": {
            "Facts & Figures": "./",
-            "Languages": "language-models",
-            "Philosophy": "philosophy"
+            "Languages": "language-models"
+        },
+        "Top-level": {
+            "spacy": "spacy",
+            "displacy": "displacy",
+            "Utility Functions": "util",
+            "Command line": "cli"
        },
        "Classes": {
            "Doc": "doc",
@ -21,9 +26,6 @@
            "GoldParse": "goldparse"
        },
        "Other": {
-            "Command line": "cli",
-            "displaCy": "displacy",
-            "Utility Functions": "util",
            "Annotation Specs": "annotation",
            "Feature Scheme": "features"
        }
@ -43,6 +45,26 @@
        "title": "Philosophy"
    },

+    "spacy": {
+        "title": "spaCy top-level functions",
+        "next": "displacy"
+    },
+
+    "displacy": {
+        "title": "displaCy",
+        "tag": "module",
+        "next": "util"
+    },
+
+    "util": {
+        "title": "Utility Functions",
+        "next": "cli"
+    },
+
+    "cli": {
+        "title": "Command Line Interface"
+    },
+
    "language": {
        "title": "Language",
        "tag": "class"
@ -113,20 +135,6 @@
        "tag": "class"
    },

-    "cli": {
-        "title": "Command Line Interface",
-        "next": "displacy"
-    },
-
-    "displacy": {
-        "title": "displaCy",
-        "tag": "module"
-    },
-
-    "util": {
-        "title": "Utility Functions"
-    },
-
    "annotation": {
        "title": "Annotation Specifications"
    },
--- a/website/docs/api/cli.jade
+++ b/website/docs/api/cli.jade
@ -92,7 +92,7 @@ p
    +row
        +cell #[code model]
        +cell positional
-        +cell Shortcut link of model (optional).
+        +cell A model, i.e. shortcut link, package name or path (optional).

    +row
        +cell #[code --markdown], #[code -md]
--- a/website/docs/api/displacy.jade
+++ b/website/docs/api/displacy.jade
@ -8,7 +8,7 @@ p
    |  #[+a("/docs/usage/visualizers") visualizing spaCy].


-+h(2, "serve") serve
++h(2, "serve") displacy.serve
    +tag method

 p
@ -60,7 +60,7 @@ p
        +cell Port to serve visualization.
        +cell #[code 5000]

-+h(2, "render") render
++h(2, "render") displacy.render
    +tag method

 p Render a dependency parse tree or named entity visualization.
--- a/website/docs/api/language.jade
+++ b/website/docs/api/language.jade
@ -63,9 +63,8 @@ p
    |  is preserved.

 +aside-code("Example").
-    tokens = nlp('An example sentence. Another example sentence.')
-    tokens[0].text, tokens[0].head.tag_
-    # ('An', 'NN')
+    doc = nlp(u'An example sentence. Another sentence.')
+    assert (doc[0].text, doc[0].head.tag_) == ('An', 'NN')

 +table(["Name", "Type", "Description"])
    +row
--- a/website/docs/api/matcher.jade
+++ b/website/docs/api/matcher.jade
@ -129,8 +129,8 @@ p
        print('Matched!', matches)

    matcher = Matcher(nlp.vocab)
-    matcher.add('HelloWorld', [{LOWER: "hello"}, {LOWER: "world"}], on_match=on_match)
-    matcher.add('GoogleMaps', [{ORTH: "Google"}, {ORTH: "Maps"}], on_match=on_match)
+    matcher.add('HelloWorld', on_match, [{LOWER: "hello"}, {LOWER: "world"}])
+    matcher.add('GoogleMaps', on_match, [{ORTH: "Google"}, {ORTH: "Maps"}])

    doc = nlp(u'HELLO WORLD on Google Maps.')
    matches = matcher(doc)
@ -141,16 +141,16 @@ p
        +cell unicode
        +cell An ID for the thing you're matching.

-    +row
-        +cell #[code *patterns]
-        +cell list
-        +cell
-            |  Match pattern. A pattern consists of a list of dicts, where each
-            |  dict describes a token.
-
    +row
        +cell #[code on_match]
        +cell function
        +cell
            |  Callback function to act on matches. Takes the arguments
            |  #[code matcher], #[code doc], #[code i] and #[code matches].
+
+    +row
+        +cell #[code *patterns]
+        +cell list
+        +cell
+            |  Match pattern. A pattern consists of a list of dicts, where each
+            |  dict describes a token.
--- a/website/docs/api/spacy.jade
+++ b/website/docs/api/spacy.jade
@ -0,0 +1,93 @@
+//- 💫 DOCS > API > SPACY
+
+include ../../_includes/_mixins
+
++h(2, "load") spacy.load
+    +tag function
+
+p
+    |  Load a model via its #[+a("/docs/usage/models#usage") shortcut link],
+    |  the name of an installed
+    |  #[+a("/docs/usage/saving-loading#generating") model package], a unicode
+    |  path or a #[code Path]-like object. spaCy will try resolving the load
+    |  argument in this order. The #[code Language] class to initialise will be
+    |  determined based on the model's settings.
+
++aside-code("Example").
+    nlp = spacy.load('en') # shortcut link
+    nlp = spacy.load('en_core_web_sm') # package
+    nlp = spacy.load('/path/to/en') # unicode path
+    nlp = spacy.load(Path('/path/to/en')) # pathlib Path
+
++infobox("⚠️ Deprecation note")
+    |  As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
+    |  will also raise an error if no model could be loaded and never just
+    |  return an empty #[code Language] object. If you need a blank language,
+    |  you need to import it explicitly: #[code from spacy.lang.en import English].
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code name]
+        +cell unicode or #[code Path]
+        +cell Model to load, i.e. shortcut link, package name or path.
+
+    +footrow
+        +cell returns
+        +cell #[code Language]
+        +cell A #[code Language] object with the loaded model.
+
++h(2, "info") spacy.info
+    +tag function
+
+p
+    |  The same as the #[+api("cli#info") #[code info] command]. Pretty-print
+    |  information about your installation, models and local setup from within
+    |  spaCy. To get the model meta data as a dictionary instead, you can
+    |  use the #[code meta] attribute on your #[code nlp] object with a
+    |  loaded model, e.g. #[code nlp.meta].
+
++aside-code("Example").
+    spacy.info()
+    spacy.info('en')
+    spacy.info('de', markdown=True)
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code model]
+        +cell unicode
+        +cell A model, i.e. shortcut link, package name or path (optional).
+
+    +row
+        +cell #[code markdown]
+        +cell bool
+        +cell Print information as Markdown.
+
+
++h(2, "explain") spacy.explain
+    +tag function
+
+p
+    |  Get a description for a given POS tag, dependency label or entity type.
+    |  For a list of available terms, see
+    |  #[+src(gh("spacy", "spacy/glossary.py")) glossary.py].
+
++aside-code("Example").
+    spacy.explain('NORP')
+    # Nationalities or religious or political groups
+
+    doc = nlp(u'Hello world')
+    for word in doc:
+        print(word.text, word.tag_, spacy.explain(word.tag_))
+    # Hello UH interjection
+    # world NN noun, singular or mass
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code term]
+        +cell unicode
+        +cell Term to explain.
+
+    +footrow
+        +cell returns
+        +cell unicode
+        +cell The explanation, or #[code None] if not found in the glossary.
--- a/website/docs/api/util.jade
+++ b/website/docs/api/util.jade
@ -14,7 +14,7 @@ p
    |  recommend having additional tests in place if your application depends on
    |  any of spaCy's utilities.

-+h(2, "get_data_path") get_data_path
++h(2, "get_data_path") util.get_data_path
    +tag function

 p
@ -32,7 +32,7 @@ p
        +cell #[code Path] / #[code None]
        +cell Data path or #[code None].

-+h(2, "set_data_path") set_data_path
++h(2, "set_data_path") util.set_data_path
    +tag function

 p
@ -49,7 +49,7 @@ p
        +cell unicode or #[code Path]
        +cell Path to new data directory.

-+h(2, "get_lang_class") get_lang_class
++h(2, "get_lang_class") util.get_lang_class
    +tag function

 p
@ -74,7 +74,7 @@ p
        +cell #[code Language]
        +cell Language class.

-+h(2, "resolve_model_path") resolve_model_path
++h(2, "resolve_model_path") util.resolve_model_path
    +tag function

 p Resolve a model name or string to a model path.
@ -94,7 +94,7 @@ p Resolve a model name or string to a model path.
        +cell #[code Path]
        +cell Path to model data directory.

-+h(2, "is_package") is_package
++h(2, "is_package") util.is_package
    +tag function

 p
@ -116,7 +116,7 @@ p
        +cell #[code bool]
        +cell #[code True] if installed package, #[code False] if not.

-+h(2, "get_model_package_path") get_model_package_path
++h(2, "get_model_package_path") util.get_model_package_path
    +tag function

 p
@ -138,7 +138,7 @@ p
        +cell #[code Path]
        +cell Path to model data directory.

-+h(2, "parse_package_meta") parse_package_meta
++h(2, "parse_package_meta") util.parse_package_meta
    +tag function

 p
@ -167,7 +167,7 @@ p
        +cell dict / #[code None]
        +cell Model meta data or #[code None].

-+h(2, "update_exc") update_exc
++h(2, "update_exc") util.update_exc
    +tag function

 p
@ -199,7 +199,7 @@ p
        +cell Combined tokenizer exceptions.


-+h(2, "prints") prints
++h(2, "prints") util.prints
    +tag function

 p
--- a/website/docs/api/vocab.jade
+++ b/website/docs/api/vocab.jade
@ -124,7 +124,7 @@ p
        +cell #[code Lexeme]
        +cell The lexeme indicated by the given ID.

-+h(2, "iter") Span.__iter__
++h(2, "iter") Vocab.__iter__
    +tag method

 p Iterate over the lexemes in the vocabulary.
--- a/website/docs/usage/rule-based-matching.jade
+++ b/website/docs/usage/rule-based-matching.jade
@ -36,7 +36,9 @@ p
    |  First, we initialise the #[code Matcher] with a vocab. The matcher must
    |  always share the same vocab with the documents it will operate on. We
    |  can now call #[+api("matcher#add") #[code matcher.add()]] with an ID and
-    |  our custom pattern:
+    |  our custom pattern. The second argument lets you pass in an optional
+    |  callback function to invoke on a successful match. For now, we set it
+    |  to #[code None].

 +code.
    import spacy
@ -45,7 +47,9 @@ p

    nlp = spacy.load('en')
    matcher = Matcher(nlp.vocab)
-    matcher.add('HelloWorld', [{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}])
+    # add match ID "HelloWorld" with no callback and one pattern
+    matcher.add('HelloWorld', None,
+                [{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}])

    doc = nlp(u'Hello, world! Hello world!')
    matches = matcher(doc)
@ -58,8 +62,9 @@ p
    |  without punctuation between "hello" and "world":

 +code.
-    matcher.add('HelloWorld', [{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}],
-                              [{LOWER: 'hello'}, {LOWER: 'world'}])
+    matcher.add('HelloWorld', None,
+                [{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}],
+                [{LOWER: 'hello'}, {LOWER: 'world'}])

 p
    |  By default, the matcher will only return the matches and
@ -92,9 +97,9 @@ p
    nlp = spacy.load('en')
    matcher = Matcher(nlp.vocab)

-    matcher.add('GoogleIO', [{ORTH: 'Google'}, {UPPER: 'I'}, {ORTH: '/'}, {UPPER: 'O'}],
-                            [{ORTH: 'Google'}, {UPPER: 'I'}, {ORTH: '/'}, {UPPER: 'O'}, {IS_DIGIT: True}],
-                            on_match=add_event_ent)
+    matcher.add('GoogleIO', add_event_ent,
+                [{ORTH: 'Google'}, {UPPER: 'I'}, {ORTH: '/'}, {UPPER: 'O'}],
+                [{ORTH: 'Google'}, {UPPER: 'I'}, {ORTH: '/'}, {UPPER: 'O'}, {IS_DIGIT: True}])

    # Get the ID of the 'EVENT' entity type. This is required to set an entity.
    EVENT = nlp.vocab.strings['EVENT']
@ -114,9 +119,9 @@ p
    |  function #[code merge_and_flag]:

 +code.
-    matcher.add('BAD_HTML', [{ORTH: '&lt;'}, {LOWER: 'br'}, {ORTH: '&gt;'}],
-                            [{ORTH: '&lt;'}, {LOWER: 'br/'}, {ORTH: '&gt;'}]
-                            on_match=merge_and_flag)
+    matcher.add('BAD_HTML', merge_and_flag,
+                [{ORTH: '&lt;'}, {LOWER: 'br'}, {ORTH: '&gt;'}],
+                [{ORTH: '&lt;'}, {LOWER: 'br/'}, {ORTH: '&gt;'}])

    # Add a new custom flag to the vocab, which is always False by default.
    # BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span.