From 393a13d1af2a0c22a04643e61e7c4b95b653250b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 19:24:33 +1100 Subject: [PATCH] * Add unicode em dash to specials.json, so that we can control what POS tag it gets. This way we can prevent sentence boundary detection errors, to address Issue #130. --- lang_data/en/generate_specials.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lang_data/en/generate_specials.py b/lang_data/en/generate_specials.py index 6ad503aec..e50cd77d4 100644 --- a/lang_data/en/generate_specials.py +++ b/lang_data/en/generate_specials.py @@ -133,6 +133,9 @@ hardcoded_specials = { "Mt.": [{"F": "Mt.", "L": "Mount"}], "''": [{"F": "''"}], + + "—": [{"F": "—", "L": "--", "P": ":"}], + "Corp.": [{"F": "Corp."}], "Inc.": [{"F": "Inc."}], "Co.": [{"F": "Co."}],