From bf92625edead3707038fbd00e0b249ebe1f04855 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Fri, 26 Apr 2019 13:19:50 +0200
Subject: [PATCH] Update from master

---
 .github/contributors/bjascob.md  | 106 ++++++++++++++++++++++++++++
 spacy/cli/evaluate.py            |   2 +-
 spacy/cli/pretrain.py            |  43 +++++++-----
 spacy/cli/train.py               |  45 +++++++-----
 spacy/displacy/__init__.py       |   7 +-
 spacy/lang/th/__init__.py        |  10 ++-
 spacy/lang/th/norm_exceptions.py | 114 +++++++++++++++++++++++++++++++
 website/meta/universe.json       |  22 ++++++
 8 files changed, 310 insertions(+), 39 deletions(-)
 create mode 100644 .github/contributors/bjascob.md
 create mode 100644 spacy/lang/th/norm_exceptions.py

diff --git a/.github/contributors/bjascob.md b/.github/contributors/bjascob.md
new file mode 100644
index 000000000..4870c494a
--- /dev/null
+++ b/.github/contributors/bjascob.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Brad Jascob          |
+| Company name (if applicable)   | n/a                  |
+| Title or role (if applicable)  | Software Engineer    |
+| Date                           | 04/25/2019           |
+| GitHub username                | bjascob              |
+| Website (optional)             | n/a                  |
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index df391d730..468698e2f 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -17,7 +17,7 @@ from .. import displacy
     gpu_id=("Use GPU", "option", "g", int),
     displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str),
     displacy_limit=("Limit of parses to render as HTML", "option", "dl", int),
-    return_scores=("Return dict containing model scores", "flag", "r", bool),
+    return_scores=("Return dict containing model scores", "flag", "R", bool),
 )
 def evaluate(
     model,
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 0b316b47c..ef91937a6 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -34,7 +34,8 @@ from .. import util
     max_length=("Max words per example.", "option", "xw", int),
     min_length=("Min words per example.", "option", "nw", int),
     seed=("Seed for random number generators", "option", "s", float),
-    nr_iter=("Number of iterations to pretrain", "option", "i", int),
+    n_iter=("Number of iterations to pretrain", "option", "i", int),
+    n_save_every=("Save model every X batches.", "option", "se", int),
 )
 def pretrain(
     texts_loc,
@@ -46,11 +47,12 @@ def pretrain(
     loss_func="cosine",
     use_vectors=False,
     dropout=0.2,
-    nr_iter=1000,
+    n_iter=1000,
     batch_size=3000,
     max_length=500,
     min_length=5,
     seed=0,
+    n_save_every=None,
 ):
     """
     Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
@@ -115,9 +117,26 @@ def pretrain(
     msg.divider("Pre-training tok2vec layer")
     row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
     msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
-    for epoch in range(nr_iter):
-        for batch in util.minibatch_by_words(
-            ((text, None) for text in texts), size=batch_size
+
+    def _save_model(epoch, is_temp=False):
+        is_temp_str = ".temp" if is_temp else ""
+        with model.use_params(optimizer.averages):
+            with (output_dir / ("model%d%s.bin" % (epoch, is_temp_str))).open(
+                "wb"
+            ) as file_:
+                file_.write(model.tok2vec.to_bytes())
+            log = {
+                "nr_word": tracker.nr_word,
+                "loss": tracker.loss,
+                "epoch_loss": tracker.epoch_loss,
+                "epoch": epoch,
+            }
+            with (output_dir / "log.jsonl").open("a") as file_:
+                file_.write(srsly.json_dumps(log) + "\n")
+
+    for epoch in range(n_iter):
+        for batch_id, batch in enumerate(
+            util.minibatch_by_words(((text, None) for text in texts), size=batch_size)
         ):
             docs = make_docs(
                 nlp,
@@ -133,17 +152,9 @@ def pretrain(
                 msg.row(progress, **row_settings)
                 if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
                     break
-        with model.use_params(optimizer.averages):
-            with (output_dir / ("model%d.bin" % epoch)).open("wb") as file_:
-                file_.write(model.tok2vec.to_bytes())
-            log = {
-                "nr_word": tracker.nr_word,
-                "loss": tracker.loss,
-                "epoch_loss": tracker.epoch_loss,
-                "epoch": epoch,
-            }
-            with (output_dir / "log.jsonl").open("a") as file_:
-                file_.write(srsly.json_dumps(log) + "\n")
+            if n_save_every and (batch_id % n_save_every == 0):
+                _save_model(epoch, is_temp=True)
+        _save_model(epoch)
         tracker.epoch_loss = 0.0
         if texts_loc != "-":
             # Reshuffle the texts if texts were loaded from a file
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 5cf0f5f6f..63c6242de 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -35,7 +35,12 @@ from .. import about
     pipeline=("Comma-separated names of pipeline components", "option", "p", str),
     vectors=("Model to load vectors from", "option", "v", str),
     n_iter=("Number of iterations", "option", "n", int),
-    early_stopping_iter=("Maximum number of training epochs without dev accuracy improvement", "option", "e", int),
+    n_early_stopping=(
+        "Maximum number of training epochs without dev accuracy improvement",
+        "option",
+        "ne",
+        int,
+    ),
     n_examples=("Number of examples", "option", "ns", int),
     use_gpu=("Use GPU", "option", "g", int),
     version=("Model version", "option", "V", str),
@@ -75,7 +80,7 @@ def train(
     pipeline="tagger,parser,ner",
     vectors=None,
     n_iter=30,
-    early_stopping_iter=None,
+    n_early_stopping=None,
     n_examples=0,
     use_gpu=-1,
     version="0.0.0",
@@ -226,7 +231,7 @@ def train(
     msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
     try:
         iter_since_best = 0
-        best_score = 0.
+        best_score = 0.0
         for i in range(n_iter):
             train_docs = corpus.train_docs(
                 nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0
@@ -335,17 +340,23 @@ def train(
                         gpu_wps=gpu_wps,
                     )
                     msg.row(progress, **row_settings)
-                # early stopping
-                if early_stopping_iter is not None:
+                # Early stopping
+                if n_early_stopping is not None:
                     current_score = _score_for_model(meta)
                     if current_score < best_score:
                         iter_since_best += 1
                     else:
                         iter_since_best = 0
                         best_score = current_score
-                    if iter_since_best >= early_stopping_iter:
-                        msg.text("Early stopping, best iteration is: {}".format(i-iter_since_best))
-                        msg.text("Best score = {}; Final iteration score = {}".format(best_score, current_score))
+                    if iter_since_best >= n_early_stopping:
+                        msg.text(
+                            "Early stopping, best iteration "
+                            "is: {}".format(i - iter_since_best)
+                        )
+                        msg.text(
+                            "Best score = {}; Final iteration "
+                            "score = {}".format(best_score, current_score)
+                        )
                         break
     finally:
         with nlp.use_params(optimizer.averages):
@@ -356,19 +367,21 @@ def train(
             best_model_path = _collate_best_model(meta, output_path, nlp.pipe_names)
         msg.good("Created best model", best_model_path)
 
+
 def _score_for_model(meta):
     """ Returns mean score between tasks in pipeline that can be used for early stopping. """
     mean_acc = list()
-    pipes = meta['pipeline']
-    acc = meta['accuracy']
-    if 'tagger' in pipes:
-        mean_acc.append(acc['tags_acc'])
-    if 'parser' in pipes:
-        mean_acc.append((acc['uas']+acc['las']) / 2)
-    if 'ner' in pipes:
-        mean_acc.append((acc['ents_p']+acc['ents_r']+acc['ents_f']) / 3)
+    pipes = meta["pipeline"]
+    acc = meta["accuracy"]
+    if "tagger" in pipes:
+        mean_acc.append(acc["tags_acc"])
+    if "parser" in pipes:
+        mean_acc.append((acc["uas"] + acc["las"]) / 2)
+    if "ner" in pipes:
+        mean_acc.append((acc["ents_p"] + acc["ents_r"] + acc["ents_f"]) / 3)
     return sum(mean_acc) / len(mean_acc)
 
+
 @contextlib.contextmanager
 def _create_progress_bar(total):
     if int(os.environ.get("LOG_FRIENDLY", 0)):
diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py
index fadbaaa7e..b651c0996 100644
--- a/spacy/displacy/__init__.py
+++ b/spacy/displacy/__init__.py
@@ -19,7 +19,7 @@ RENDER_WRAPPER = None
 
 
 def render(
-    docs, style="dep", page=False, minify=False, jupyter=False, options={}, manual=False
+    docs, style="dep", page=False, minify=False, jupyter=None, options={}, manual=False
 ):
     """Render displaCy visualisation.
 
@@ -27,7 +27,7 @@ def render(
     style (unicode): Visualisation style, 'dep' or 'ent'.
     page (bool): Render markup as full HTML page.
     minify (bool): Minify HTML markup.
-    jupyter (bool): Experimental, use Jupyter's `display()` to output markup.
+    jupyter (bool): Override Jupyter auto-detection.
     options (dict): Visualiser-specific options, e.g. colors.
     manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
     RETURNS (unicode): Rendered HTML markup.
@@ -53,7 +53,8 @@ def render(
     html = _html["parsed"]
     if RENDER_WRAPPER is not None:
         html = RENDER_WRAPPER(html)
-    if jupyter or is_in_jupyter():  # return HTML rendered by IPython display()
+    if jupyter or (jupyter is None and is_in_jupyter()):
+        # return HTML rendered by IPython display()
         from IPython.core.display import display, HTML
 
         return display(HTML(html))
diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py
index 0bd8333db..ba5b86d77 100644
--- a/spacy/lang/th/__init__.py
+++ b/spacy/lang/th/__init__.py
@@ -4,11 +4,13 @@ from __future__ import unicode_literals
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
+from .norm_exceptions import NORM_EXCEPTIONS
 
-from ...attrs import LANG
+from ..norm_exceptions import BASE_NORMS
+from ...attrs import LANG, NORM
 from ...language import Language
 from ...tokens import Doc
-from ...util import DummyTokenizer
+from ...util import DummyTokenizer, add_lookups
 
 
 class ThaiTokenizer(DummyTokenizer):
@@ -33,7 +35,9 @@ class ThaiTokenizer(DummyTokenizer):
 class ThaiDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda _text: "th"
-
+    lex_attr_getters[NORM] = add_lookups(
+        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
+    )
     tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS)
     tag_map = TAG_MAP
     stop_words = STOP_WORDS
diff --git a/spacy/lang/th/norm_exceptions.py b/spacy/lang/th/norm_exceptions.py
new file mode 100644
index 000000000..497779cf9
--- /dev/null
+++ b/spacy/lang/th/norm_exceptions.py
@@ -0,0 +1,114 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+_exc = {
+    # Tone mark that does not match the pronounced tone (ผันอักษรและเสียงไม่ตรงกับรูปวรรณยุกต์)
+    "สนุ๊กเกอร์": "สนุกเกอร์",
+    "โน๊ต": "โน้ต",
+    # Misspelled out of laziness or haste (สะกดผิดเพราะขี้เกียจพิมพ์ หรือเร่งรีบ)
+    "โทสับ": "โทรศัพท์",
+    "พุ่งนี้": "พรุ่งนี้",
+    # Deliberately altered to look unusual (ให้ดูแปลกตา)
+    "ชะมะ": "ใช่ไหม",
+    "ชิมิ": "ใช่ไหม",
+    "ชะ": "ใช่ไหม",
+    "ช่ายมะ": "ใช่ไหม",
+    "ป่าว": "เปล่า",
+    "ป่ะ": "เปล่า",
+    "ปล่าว": "เปล่า",
+    "คัย": "ใคร",
+    "ไค": "ใคร",
+    "คราย": "ใคร",
+    "เตง": "ตัวเอง",
+    "ตะเอง": "ตัวเอง",
+    "รึ": "หรือ",
+    "เหรอ": "หรือ",
+    "หรา": "หรือ",
+    "หรอ": "หรือ",
+    "ชั้น": "ฉัน",
+    "ชั้ล": "ฉัน",
+    "ช้าน": "ฉัน",
+    "เทอ": "เธอ",
+    "เทอร์": "เธอ",
+    "เทอว์": "เธอ",
+    "แกร": "แก",
+    "ป๋ม": "ผม",
+    "บ่องตง": "บอกตรงๆ",
+    "ถ่ามตง": "ถามตรงๆ",
+    "ต่อมตง": "ตอบตรงๆ",
+    "เพิ่ล": "เพื่อน",
+    "จอบอ": "จอบอ",
+    "ดั้ย": "ได้",
+    "ขอบคุง": "ขอบคุณ",
+    "ยังงัย": "ยังไง",
+    "Inw": "เทพ",
+    "uou": "นอน",
+    "Lกรีeu": "เกรียน",
+    # Misspelled to express emotions (คำที่สะกดผิดเพื่อแสดงอารมณ์)
+    "เปงราย": "เป็นอะไร",
+    "เปนรัย": "เป็นอะไร",
+    "เปงรัย": "เป็นอะไร",
+    "เป็นอัลไล": "เป็นอะไร",
+    "ทามมาย": "ทำไม",
+    "ทามมัย": "ทำไม",
+    "จังรุย": "จังเลย",
+    "จังเยย": "จังเลย",
+    "จุงเบย": "จังเลย",
+    "มะรุ": "ไม่รู้",
+    "เฮ่ย": "เฮ้ย",
+    "เห้ย": "เฮ้ย",
+    "น่าร็อค": "น่ารัก",
+    "น่าร๊าก": "น่ารัก",
+    "ตั้ลล๊าก": "น่ารัก",
+    "คือร๊ะ": "คืออะไร",
+    "โอป่ะ": "โอเคหรือเปล่า",
+    "น่ามคาน": "น่ารำคาญ",
+    "น่ามสาร": "น่าสงสาร",
+    "วงวาร": "สงสาร",
+    "บับว่า": "แบบว่า",
+    "อัลไล": "อะไร",
+    "อิจ": "อิจฉา",
+    # Misspelled to soften profanity or to evade software profanity filters (คำที่สะกดผิดเพื่อลดความหยาบของคำ หรืออาจใช้หลีกเลี่ยงการกรองคำหยาบของซอฟต์แวร์)
+    "กรู": "กู",
+    "กุ": "กู",
+    "กรุ": "กู",
+    "ตู": "กู",
+    "ตรู": "กู",
+    "มรึง": "มึง",
+    "เมิง": "มึง",
+    "มืง": "มึง",
+    "มุง": "มึง",
+    "สาด": "สัตว์",
+    "สัส": "สัตว์",
+    "สัก": "สัตว์",
+    "แสรด": "สัตว์",
+    "โคโตะ": "โคตร",
+    "โคด": "โคตร",
+    "โครต": "โคตร",
+    "โคตะระ": "โคตร",
+    "พ่อง": "พ่อมึง",
+    "แม่เมิง": "แม่มึง",
+    "เชี่ย": "เหี้ย",
+    # Onomatopoeia, usually with an added thanthakhat mark or repeated letters (คำเลียนเสียง โดยส่วนใหญ่จะเพิ่มทัณฑฆาต หรือซ้ำตัวอักษร)
+    "แอร๊ยย": "อ๊าย",
+    "อร๊ายยย": "อ๊าย",
+    "มันส์": "มัน",
+    "วู๊วววววววว์": "วู้",
+    # Abbreviations and clipped forms (แบบคำย่อ)
+    "หมาลัย": "มหาวิทยาลัย",
+    "วิดวะ": "วิศวะ",
+    "สินสาด": "ศิลปศาสตร์",
+    "สินกำ": "ศิลปกรรมศาสตร์",
+    "เสารีย์": "อนุสาวรีย์ชัยสมรภูมิ",
+    "เมกา": "อเมริกา",
+    "มอไซค์": "มอเตอร์ไซค์",
+}
+
+
+NORM_EXCEPTIONS = {}
+
+for string, norm in _exc.items():
+    NORM_EXCEPTIONS[string] = norm
+    NORM_EXCEPTIONS[string.title()] = norm
+
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 29e050964..a6a8bf247 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1316,6 +1316,28 @@
             "author_links": {
                 "github": "oterrier"
             }
+        },
+        {
+            "id": "pyInflect",
+            "slogan": "A python module for word inflections",
+            "description": "This package uses the [spaCy 2.0 extensions](https://spacy.io/usage/processing-pipelines#extensions) to add word inflections to the system.",
+            "github": "bjascob/pyInflect",
+            "pip": "pyinflect",
+            "code_example": [
+                "import spacy",
+                "import pyinflect",
+                "",
+                "nlp = spacy.load('en_core_web_sm')",
+                "doc = nlp('This is an example.')",
+                "doc[3].tag_                # NN",
+                "doc[3]._.inflect('NNS')    # examples"
+            ],
+            "author": "Brad Jascob",
+            "author_links": {
+                "github": "bjascob"
+            },
+            "category": ["pipeline"],
+            "tags": ["inflection"]
         }
     ],
     "categories": [