Merge branch 'master' of https://github.com/honnibal/spaCy

2025-04-25 03:13:41 +03:00 · 2015-09-28 01:13:02 +02:00 · 2015-09-28 01:13:02 +02:00 · d1850dcbf7
commit d1850dcbf7
parent 22d1112031 6982ed74ab
11 changed files with 274 additions and 22 deletions
--- a/LICENSE.txt
+++ b/LICENSE.txt
@ -1,18 +1,23 @@
-spaCy is commercial open-source software: you can buy a commercial
-license, or you can use it under the AGPL, as described below.
+The MIT License (MIT)

 spaCy Natural Language Processing Tools
+
 Copyright (C) 2015 Matthew Honnibal

-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU Affero General Public License as
-published by the Free Software Foundation, either version 3 of the
-License, or (at your option) any later version.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:

-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU Affero General Public License for more details.
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.

-You should have received a copy of the GNU Affero General Public License
-along with this program.  If not, see <http://www.gnu.org/licenses/>.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
--- a/README.md
+++ b/README.md
@ -6,7 +6,8 @@ spaCy is a library for advanced natural language processing in Python and Cython
 Documentation and details: http://spacy.io/

 spaCy is built on the very latest research, but it isn't researchware.  It was
-designed from day 1 to be used in real products. You can buy a commercial license, or you can use it under the AGPL.
+designed from day 1 to be used in real products. It's commercial open-source
+software, released under the MIT license.


 Features
--- a/examples/matcher_example.py
+++ b/examples/matcher_example.py
@ -0,0 +1,161 @@
+from __future__ import unicode_literals, print_function
+
+import spacy.en
+import spacy.matcher
+from spacy.attrs import ORTH, TAG, LOWER, IS_ALPHA, FLAG63
+
+import plac
+
+
+def main():
+    nlp = spacy.en.English()
+    example = u"I prefer Siri to Google Now. I'll google now to find out how the google now service works."
+    before = nlp(example)
+    print("Before")
+    for ent in before.ents:
+        print(ent.text, ent.label_, [w.tag_ for w in ent])
+    # Output:
+    # Google ORG [u'NNP']
+    # google ORG [u'VB']
+    # google ORG [u'NNP']
+    nlp.matcher.add(
+        "GoogleNow", # Entity ID: Not really used at the moment.
+        "PRODUCT",   # Entity type: should be one of the types in the NER data
+        {"wiki_en": "Google_Now"}, # Arbitrary attributes. Currently unused.
+        [  # List of patterns that can be Surface Forms of the entity
+
+            # This Surface Form matches "Google Now", verbatim
+            [ # Each Surface Form is a list of Token Specifiers.
+                { # This Token Specifier matches tokens whose orth field is "Google"
+                    ORTH: "Google"
+                },
+                { # This Token Specifier matches tokens whose orth field is "Now"
+                    ORTH: "Now"
+                }
+            ],
+            [ # This Surface Form matches "google now", verbatim, and requires
+              # "google" to have the NNP tag. This helps prevent the pattern from
+              # matching cases like "I will google now to look up the time"
+                {
+                    ORTH: "google",
+                    TAG: "NNP"
+                },
+                {
+                    ORTH: "now"
+                }
+            ]
+        ]
+    )
+    after = nlp(example)
+    print("After")
+    for ent in after.ents:
+        print(ent.text, ent.label_, [w.tag_ for w in ent])
+    # Output
+    # Google Now PRODUCT [u'NNP', u'RB']
+    # google ORG [u'VB']
+    # google now PRODUCT [u'NNP', u'RB']
+    #
+    # You can customize attribute values in the lexicon, and then refer to the
+    # new attributes in your Token Specifiers.
+    # This is particularly good for word-set membership.
+    # 
+    australian_capitals = ['Brisbane', 'Sydney', 'Canberra', 'Melbourne', 'Hobart',
+                           'Darwin', 'Adelaide', 'Perth']
+    # Internally, the tokenizer immediately maps each token to a pointer to a 
+    # LexemeC struct. These structs hold various features, e.g. the integer IDs
+    # of the normalized string forms.
+    # For our purposes, the key attribute is a 64-bit integer, used as a bit field.
+    # spaCy currently only uses 12 of the bits for its built-in features, so
+    # the others are available for use. It's best to use the higher bits, as
+    # future versions of spaCy may add more flags. For instance, we might add
+    # a built-in IS_MONTH flag, taking up FLAG13. So, we bind our user-field to
+    # FLAG63 here.
+    is_australian_capital = FLAG63
+    # Now we need to set the flag value. It's False on all tokens by default,
+    # so we just need to set it to True for the tokens we want.
+    # Here we iterate over the strings, and set it on only the literal matches.
+    for string in australian_capitals:
+        lexeme = nlp.vocab[string]
+        lexeme.set_flag(is_australian_capital, True)
+    print('Sydney', nlp.vocab[u'Sydney'].check_flag(is_australian_capital))
+    print('sydney', nlp.vocab[u'sydney'].check_flag(is_australian_capital))
+    # If we want case-insensitive matching, we have to be a little bit more
+    # round-about, as there's no case-insensitive index to the vocabulary. So
+    # we have to iterate over the vocabulary.
+    # We'll be looking up attribute IDs in this set a lot, so it's good to pre-build it
+    target_ids = {nlp.vocab.strings[s.lower()] for s in australian_capitals}
+    for lexeme in nlp.vocab:
+        if lexeme.lower in target_ids:
+            lexeme.set_flag(is_australian_capital, True)
+    print('Sydney', nlp.vocab[u'Sydney'].check_flag(is_australian_capital))
+    print('sydney', nlp.vocab[u'sydney'].check_flag(is_australian_capital))
+    print('SYDNEY', nlp.vocab[u'SYDNEY'].check_flag(is_australian_capital))
+    # Output
+    # Sydney True
+    # sydney False
+    # Sydney True
+    # sydney True
+    # SYDNEY True
+    #
+    # The key thing to note here is that we're setting these attributes once,
+    # over the vocabulary --- and then reusing them at run-time. This means the
+    # amortized complexity of anything we do this way is going to be O(1). You
+    # can match over expressions that need to have sets with tens of thousands
+    # of values, e.g. "all the street names in Germany", and you'll still have
+    # O(1) complexity. Most regular expression algorithms don't scale well to
+    # this sort of problem.
+    #
+    # Now, let's use this in a pattern
+    nlp.matcher.add("AuCitySportsTeam", "ORG", {},
+        [
+            [
+                {LOWER: "the"},
+                {is_australian_capital: True},
+                {TAG: "NNS"}
+            ],
+            [
+                {LOWER: "the"},
+                {is_australian_capital: True},
+                {TAG: "NNPS"}
+            ],
+            [
+                {LOWER: "the"},
+                {IS_ALPHA: True}, # Allow a word in between, e.g. The Western Sydney
+                {is_australian_capital: True},
+                {TAG: "NNS"}
+            ],
+            [
+                {LOWER: "the"},
+                {IS_ALPHA: True}, # Allow a word in between, e.g. The Western Sydney
+                {is_australian_capital: True},
+                {TAG: "NNPS"}
+            ]
+        ])
+    doc = nlp(u'The pattern should match the Brisbane Broncos and the South Darwin Spiders, but not the Colorado Boulders')
+    for ent in doc.ents:
+        print(ent.text, ent.label_)
+    # Output
+    # the Brisbane Broncos ORG
+    # the South Darwin Spiders ORG
+
+
+# Output
+# Before
+# Google ORG [u'NNP']
+# google ORG [u'VB']
+# google ORG [u'NNP']
+# After
+# Google Now PRODUCT [u'NNP', u'RB']
+# google ORG [u'VB']
+# google now PRODUCT [u'NNP', u'RB']
+# Sydney True
+# sydney False
+# Sydney True
+# sydney True
+# SYDNEY True
+# the Brisbane Broncos ORG
+# the South Darwin Spiders ORG
+
+if __name__ == '__main__':
+    main()
+    
--- a/lang_data/en/morphs.json
+++ b/lang_data/en/morphs.json
@ -42,5 +42,19 @@
        "its":   {"L": "-PRON-", "Person": "Three", "Number": "Sing", "Gender": "Neut", "PronType": "Prs", "Poss": "Yes"},
        "our":   {"L": "-PRON-", "Person": "One",   "Number": "Plur",                   "PronType": "Prs", "Poss": "Yes"},
        "their": {"L": "-PRON-", "Person": "Three", "Number": "Plur",                   "PronType": "Prs", "Poss": "Yes"}
+    },
+
+    "VBZ": {
+        "am":  {"L": "be", "VerbForm": "Fin", "Person": "One",   "Tense": "Pres", "Mood": "Ind"},
+        "are": {"L": "be", "VerbForm": "Fin", "Person": "Two",   "Tense": "Pres", "Mood": "Ind"},
+        "is":  {"L": "be", "VerbForm": "Fin", "Person": "Three", "Tense": "Pres", "Mood": "Ind"}
+    },
+    "VBP": {
+        "are":  {"L": "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"}
+    },
+    "VBD": {
+        "was":  {"L": "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Sing"},
+        "were": {"L": "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Plur"}
    }
+
 }
--- a/website/src/jade/blog/index.jade
+++ b/website/src/jade/blog/index.jade
@ -15,6 +15,8 @@ mixin WriteTeaser(Authors, post_title)
        include ./eli5-computers-learn-reading/meta.jade
    else if post_title == "dead-code-should-be-buried"
        include ./dead-code-should-be-buried/meta.jade
+    else if post_title == "spacy-now-mit"
+        include ./spacy-now-mit/meta.jade

    - var Author = Authors[Meta.author_id]
    article.post
@ -31,7 +33,7 @@ include ../header.jade

 +WritePage(Site, Authors.spacy, Page)
    section.intro.profile
-        p A lot of work has gone into #[strong spaCy], but no magic. We plan to keep no secrets. We want you to be able to #[a(href="/license") build your business] on #[strong spaCy] &ndash; so we want you to understand it. Tell us whether you do. #[span.social #[a(href="//twitter.com/" + Site.twitter, target="_blank") Twitter] #[a(href="mailto:contact@spacy.io") Contact us]]
+        p A lot of work has gone into #[strong spaCy], but no magic. We plan to keep no secrets. We want you to be able to #[a(href="/blog/spacy-now-mit") build your business] on #[strong spaCy] &ndash; so we want you to understand it. Tell us whether you do. #[span.social #[a(href="//twitter.com/" + Site.twitter, target="_blank") Twitter] #[a(href="mailto:contact@spacy.io") Contact us]]
            nav(role='navigation')
                ul
                    li #[a.button(href='#blogs') Blog]
@ -40,6 +42,7 @@ include ../header.jade
    h2 #[a.permalink(href='#blogs', name='blogs') Blog]

    section.blogs
+            +WriteTeaser(Authors, 'spacy-now-mit')
            +WriteTeaser(Authors, 'dead-code-should-be-buried')
            +WriteTeaser(Authors, 'eli5-computers-learn-reading')
            +WriteTeaser(Authors, 'displacy')
--- a/website/src/jade/blog/spacy-now-mit/index.jade
+++ b/website/src/jade/blog/spacy-now-mit/index.jade
@ -0,0 +1,56 @@
+include ./meta.jade
+include ../../header.jade
+
+
++WritePost(Meta)
+    //# AGPL not free enough: spaCy now under MIT, offering  adaptation as a service
+
+    p Three big announcements for #[a(href="http://spacy.io") spaCy], a Python library for industrial-strength natural language processing (NLP).
+    
+    ol
+        li The founding team is doubling in size: I'd like to welcome my new co-founder, #[a(href="https://www.linkedin.com/profile/view?id=ADEAAADkZcYBnipeHOAS6HqrDBPK1IzAAVI64ds&authType=NAME_SEARCH&authToken=YYZ1&locale=en_US&srchid=3310922891443387747239&srchindex=1&srchtotal=16&trk=vsrp_people_res_name&trkInfo=VSRPsearchId%3A3310922891443387747239%2CVSRPtargetId%3A14968262%2CVSRPcmpt%3Aprimary%2CVSRPnm%3Atrue%2CauthType%3ANAME_SEARCH") Henning Peters].
+        li spaCy is now available under #[a(href="https://en.wikipedia.org/wiki/MIT_License") the MIT license]. Formerly, spaCy was dual licensed: #[a(href="https://en.wikipedia.org/wiki/Affero_General_Public_License") AGPL], or pay a fee to use it unencumbered.
+        li A new service is entering closed beta: #[em Adaptation]. We want to work with you to deliver custom statistical models, optimized for your task, using your metrics, on your data.
+    
+    h2 The old model: AGPL-or-$
+    
+    p In mid 2014, I quit my day job as an academic, and started writing spaCy. I did this because I saw that companies were trying to use the code I'd been publishing to support my experiments in natural language understanding --- even though that code was never designed to be actually #[em used]. Its mission in life was to print some annotation and exit: to demonstrate some point about how we should design these systems going forward.
+    
+    p My idea for spaCy was simple. I'd write a better library, crafted lovingly for first-rate performance and usability, ensure it had great documentation and a simple install procedure, and offer long-term, business-friendly licenses.
+    
+    p I quickly ruled out an entirely closed source model. Users are valuable, whether or not they submit patches. They find problems and suggest solutions. And there's no better advertising than adoption.
+    
+    p But I did want spaCy to be the product, the thing that I was paid to make great. I wanted a business model that maximised the value of the library. To me, this excluded a SaaS model, since I think using the technology behind an API is an inferior technical approach to having the source code, and running the library locally.
+    
+    p So I settled on a dual license model. Anyone could download and use spaCy under the AGPL. However, most companies have a blanket ban on GPL libraries, since they're usually unwilling to release their own code under the GPL. These companies could instead sign up for a commercial license, which offered them near complete freedom, to use the library and its source however they wanted.
+    
+    p Commercial licenses were available as a free 90 day trial. On release, I offered lifetime licenses for a one-time fee of $5,000. As the library improved, this was repriced to $5,000 a year, or $20,000 for 5 years. I wanted to offer the library at prices that were very low relative to engineering salaries. I felt that spaCy could easily represent many weeks of development time savings per year, over a similar open source library.
+    
+    h2 Why AGPL-or-$ wasn't quite right
+    
+    p While copyleft licenses may be maximally "free" in some philosophical sense, engineers interested in spaCy were not free to simply download and try the library at work. And that's the sort of freedom we're most interested in. You shouldn't have to get management to sign a legal agreement to try out some code you read about on the internet.
+    
+    p Even though the trial was free, and the terms were pretty simple, a commercial license agreement was still a major barrier to adoption. When looking around for a new solution, there are always endless avenues to explore, almost all of which turn out to be dead ends. There's not a lot of room in this process for potential solutions that ask you to do additional leg-work.
+    
+    p Another huge problem is that neither of spaCy's licenses was suitable for most open-source developers. The ecosystem around copyleft licenses such as AGPL is tiny in comparison to the ecosystem around permissive licenses such as MIT. This cut spaCy off from a large community of potential users, making it much less useful than it should be.
+    
+    p I knew when I settled on the AGPL-or-$ idea that it was an unusual model. I expected to face the usual novelty problems: I'd have more explaining to do, perceptions might be unfavorable etc. Instead I think the novelty made this model intrinsically worse. It doesn't integrate well into the rest of the ecosystem.
+    
+    h2 spaCy now MIT licensed
+    
+    p spaCy is now available under the MIT license. Essentially, everyone now gets a free version of what used to be the commercial license (but in a standard form, that you don't have to bug management and legal to okay).
+
+    
+    p Anyone can now use spaCy in closed-source applications, however you like, without paying any license fees.
+    
+    p Any open-source libraries that want to build on spaCy, can.
+    
+    h2 Adaptation as a service
+    
+    p spaCy provides a suite of general-purpose natural language understanding components. In development, we measure and optimize the accuracy of these components against manually labelled data. But these annotations are a means to an end. They're only useful when you make use of them &ndash; when you put them to work in your product. So that's how we want to define success. We want to optimize spaCy for the metrics you care about, and we only want to be paid if we can improve them.
+
+    p There are lots of ways we can deliver an improvement. The simplest is traditional training and consulting, which is particularly effective for NLP since it's such a deep and narrow niche. There is also a set of general, reusable strategies for making spaCy work better on your data. Instead of the general purpose statistical model, you could get a model optimized specifically for your use case.
+    
+    p The details of all of this will vary, on a case-by-case basis. It will often be useful to gather a variety of statistics about how spaCy performs on your text, and we might spend time improving them. But these accuracy statistics are not the bottom-line. The numbers that really matter are the ones that get you paid. That's the needle we want to move.
+
+    p To apply for the closed beta, #[a(href="mailto:contact@spacy.io") send us an email] explaining what you're doing, and how you evaluate success.
--- a/website/src/jade/blog/spacy-now-mit/meta.jade
+++ b/website/src/jade/blog/spacy-now-mit/meta.jade
@ -0,0 +1,8 @@
+- var Meta = {}
+- Meta.author_id = "matt"
+- Meta.headline = "AGPL Not Free Enough: Switching to MIT"
+- Meta.description = "Three big announcements: we're changing license, to MIT from AGPL; a new co-founder is coming on board, Henning Peters; and we're launching a new service, to adapt spaCy's statistical models to your task."
+- Meta.date = "2015-09-28"
+- Meta.url = "/blog/spacy-now-mit"
+- Meta.links = []
+
--- a/website/src/jade/header.jade
+++ b/website/src/jade/header.jade
@ -115,7 +115,7 @@ mixin WritePage(Site, Author, Page)
                li(class={active: Page.active.home}): a(href="/") Home
                li(class={active: Page.active.docs}): a(href="/docs") Docs
                li: a(href="/displacy", target="_blank") Demo
-                li(class={active: Page.active.license}): a(href="/license") License
+                //li(class={active: Page.active.license}): a(href="/license") License
                li(class={active: Page.active.blog}): a(href="/blog") Blog
            main#content
                block
--- a/website/src/jade/home/_installation.jade
+++ b/website/src/jade/home/_installation.jade
@ -90,7 +90,7 @@ h4 What's New?

 details
    summary
-        h4 2015-09-21 v0.92: Bug fixes to word vectors. Rename .repvec to .vector. Rename .string attribute.
+        h4 2015-09-21 v0.93: Bug fixes to word vectors. Rename .repvec to .vector. Rename .string attribute.

    ul
        li Bug fixes for word vectors.
--- a/website/src/jade/home/index.jade
+++ b/website/src/jade/home/index.jade
@ -29,10 +29,10 @@ include ../header.jade
        li: a.button(href="#example-use") Examples
        li: a.button(href="#install")
          | Install
-          <span class="button-caption">v0.92</span>
+          <span class="button-caption">v0.93</span>

  article.page.landing-page
    +Section("Comparisons and Benchmarks", "comparisons", "./_comparisons.jade")
    +Section("Online Demo", "online-demo", "./_online_demo.jade")
    +Section("Usage by Example", "example-use", "./_usage_examples.jade")
-    +Section("Install v0.92", "install", "./_installation.jade")
+    +Section("Install v0.93", "install", "./_installation.jade")
--- a/website/src/jade/tutorials/load-new-word-vectors/index.jade
+++ b/website/src/jade/tutorials/load-new-word-vectors/index.jade
@ -1,5 +1,5 @@
 include ./meta.jade
-include ../header.jade
+include ../../header.jade

 +WritePost(Meta)

@ -12,9 +12,9 @@ include ../header.jade

    pre
        code
-            word_key1 0.92 0.45 -0.9 0.0
-            word_key2 0.3 0.1 0.6 0.3
-            ...
+            | word_key1 0.92 0.45 -0.9 0.0
+            | word_key2 0.3 0.1 0.6 0.3
+            | ...

    p That is, each line is a single entry. Each entry consists of a key string, followed by a sequence of floats. Each entry should have the same number of floats.

@ -69,3 +69,7 @@ include ../header.jade
    p All tokens which have the #[code orth] attribute #[em apples] will inherit the updated vector.

    p Note that the updated vectors won't persist after exit, unless you persist them yourself, and then replace the #[code vec.bin] file as described above.
+
+    p A popular source of word vectors is the #[a(href="http://nlp.stanford.edu/projects/glove/") GloVe word vectors] project, particularly the vectors calculated off the #[a(href="https://commoncrawl.org/") Common Crawl]. Note that the provided vector file has a few entries which are not valid UTF-8 strings. These should be filtered out.
+
+    p Future versions of spaCy will allow you to provide a file-like object, instead of a location of a #[code bz2] file.