* Add util.py

2026-03-05 04:11:26 +03:00 · 2014-09-25 18:26:22 +02:00 · 2014-09-25 18:26:22 +02:00 · 2e44fa7179
commit 2e44fa7179
parent c4cd3bc57a
1 changed files with 64 additions and 0 deletions
--- a/spacy/util.py
+++ b/spacy/util.py
@ -0,0 +1,64 @@
+import os
+from os import path
+import codecs
+import json
+import re
+
+DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
+
+
+def utf8open(loc, mode='r'):
+    return codecs.open(loc, mode, 'utf8')
+
+
+def read_lang_data(name):
+    data_dir = path.join(DATA_DIR, name)
+    tokenization = read_tokenization(data_dir)
+    prefix = read_prefix(data_dir)
+    suffix = read_suffix(data_dir)
+    words = load_resource(data_dir, 'words')
+    probs = load_resource(data_dir, 'probs')
+    clusters = load_resource(data_dir, 'clusters')
+    case_stats = load_resource(data_dir, 'case_stats')
+    tag_stats = load_resource(data_dir, 'tag_stats')
+    return tokenization, prefix, suffix, words, probs, clusters, case_stats, tag_stats
+
+
+def load_resource(data_dir, name):
+    loc = path.join(data_dir, name + '.json')
+    return json.load(loc) if path.exists(loc) else {}
+
+def read_prefix(data_dir):
+    with  utf8open(path.join(data_dir, 'prefix')) as file_:
+        entries = file_.read().split('\n')
+        expression = '|'.join(['^' + re.escape(piece) for piece in entries])
+    return expression
+
+def read_suffix(data_dir):
+    with  utf8open(path.join(data_dir, 'suffix')) as file_:
+        entries = file_.read().split('\n')
+        expression = '|'.join([re.escape(piece) + '$' for piece in entries])
+    return expression
+
+def read_tokenization(lang):
+    loc = path.join(DATA_DIR, lang, 'tokenization')
+    entries = []
+    seen = set()
+    with utf8open(loc) as file_:
+        for line in file_:
+            line = line.strip()
+            if line.startswith('#'):
+                continue
+            if not line:
+                continue
+            pieces = line.split()
+            chunk = pieces.pop(0)
+            assert chunk not in seen, chunk
+            seen.add(chunk)
+            entries.append((chunk, list(pieces)))
+            if chunk[0].isalpha() and chunk[0].islower():
+                chunk = chunk[0].title() + chunk[1:]
+                pieces[0] = pieces[0][0].title() + pieces[0][1:]
+                seen.add(chunk)
+                entries.append((chunk, pieces))
+    return entries