mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			117 lines
		
	
	
		
			3.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			117 lines
		
	
	
		
			3.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
from __future__ import unicode_literals
 | 
						|
import os
 | 
						|
from os import path
 | 
						|
import re
 | 
						|
 | 
						|
 | 
						|
def split(text):
 | 
						|
    """Split an annotation file by sentence. Each sentence's annotation should
 | 
						|
    be a single string."""
 | 
						|
    return text.strip().split('\n')[1:-1]
 | 
						|
    
 | 
						|
 | 
						|
def parse(string, strip_bad_periods=False):
 | 
						|
    """Given a sentence's annotation string, return a list of word strings,
 | 
						|
    and a list of named entities, where each entity is a (start, end, label)
 | 
						|
    triple."""
 | 
						|
    tokens = []
 | 
						|
    tags = []
 | 
						|
    open_tag = None
 | 
						|
    # Arbitrary corrections to promote alignment, and ensure that entities
 | 
						|
    # begin at a space. This allows us to treat entities as tokens, making it
 | 
						|
    # easier to return the list of entities.
 | 
						|
    string = string.replace('... .', '...')
 | 
						|
    string = string.replace('U.S.</ENAMEX> .', 'U.S.</ENAMEX>')
 | 
						|
    string = string.replace('Co.</ENAMEX> .', 'Co.</ENAMEX>')
 | 
						|
    string = string.replace('U.S. .', 'U.S.')
 | 
						|
    string = string.replace('<ENAMEX ', '<ENAMEX')
 | 
						|
    string = string.replace(' E_OFF="', 'E_OFF="')
 | 
						|
    string = string.replace(' S_OFF="', 'S_OFF="')
 | 
						|
    string = string.replace('units</ENAMEX>-<ENAMEX', 'units</ENAMEX> - <ENAMEX')
 | 
						|
    string = string.replace('<ENAMEXTYPE="PERSON"E_OFF="1">Paula</ENAMEX> Zahn', 'Paula Zahn')
 | 
						|
    string = string.replace('<ENAMEXTYPE="CARDINAL"><ENAMEXTYPE="CARDINAL">little</ENAMEX> drain</ENAMEX>', 'little drain')
 | 
						|
    for substr in string.strip().split():
 | 
						|
        substr = _fix_inner_entities(substr)
 | 
						|
        tokens.append(_get_text(substr))
 | 
						|
        try:
 | 
						|
            tag, open_tag = _get_tag(substr, open_tag)
 | 
						|
        except:
 | 
						|
            raise
 | 
						|
        tags.append(tag)
 | 
						|
    return tokens, tags
 | 
						|
 | 
						|
 | 
						|
tag_re = re.compile(r'<ENAMEXTYPE="[^"]+">')
 | 
						|
def _fix_inner_entities(substr):
 | 
						|
    tags = tag_re.findall(substr)
 | 
						|
    if '</ENAMEX' in substr and not substr.endswith('</ENAMEX'):
 | 
						|
            substr = substr.replace('</ENAMEX>', '') + '</ENAMEX>'
 | 
						|
    if tags:
 | 
						|
        substr = tag_re.sub('', substr)
 | 
						|
        return tags[0] + substr
 | 
						|
    else:
 | 
						|
        return substr
 | 
						|
 | 
						|
 | 
						|
def _get_tag(substr, tag):
 | 
						|
    if substr.startswith('<'):
 | 
						|
        tag = substr.split('"')[1]
 | 
						|
        if substr.endswith('>'):
 | 
						|
            return 'U-' + tag, None
 | 
						|
        else:
 | 
						|
            return 'B-%s' % tag, tag
 | 
						|
    elif substr.endswith('>'):
 | 
						|
        return 'L-' + tag, None
 | 
						|
    elif tag is not None:
 | 
						|
        return 'I-' + tag, tag
 | 
						|
    else:
 | 
						|
        return 'O', None
 | 
						|
 | 
						|
 | 
						|
def _get_text(substr):
 | 
						|
    if substr.startswith('<'):
 | 
						|
        substr = substr.split('>', 1)[1]
 | 
						|
    if substr.endswith('>'):
 | 
						|
        substr = substr.split('<')[0]
 | 
						|
    return reform_string(substr)
 | 
						|
 | 
						|
 | 
						|
def tags_to_entities(tags):
 | 
						|
    entities = []
 | 
						|
    start = None
 | 
						|
    for i, tag in enumerate(tags):
 | 
						|
        if tag.startswith('O'):
 | 
						|
            # TODO: We shouldn't be getting these malformed inputs. Fix this.
 | 
						|
            if start is not None:
 | 
						|
                start = None
 | 
						|
            continue
 | 
						|
        elif tag == '-':
 | 
						|
            continue
 | 
						|
        elif tag.startswith('I'):
 | 
						|
            assert start is not None, tags[:i]
 | 
						|
            continue
 | 
						|
        if tag.startswith('U'):
 | 
						|
            entities.append((tag[2:], i, i))
 | 
						|
        elif tag.startswith('B'):
 | 
						|
            start = i
 | 
						|
        elif tag.startswith('L'):
 | 
						|
            entities.append((tag[2:], start, i))
 | 
						|
            start = None
 | 
						|
        else:
 | 
						|
            raise Exception(tag)
 | 
						|
    return entities
 | 
						|
 | 
						|
 | 
						|
def reform_string(tok):
 | 
						|
    tok = tok.replace("``", '"')
 | 
						|
    tok = tok.replace("`", "'")
 | 
						|
    tok = tok.replace("''", '"')
 | 
						|
    tok = tok.replace('\\', '')
 | 
						|
    tok = tok.replace('-LCB-', '{')
 | 
						|
    tok = tok.replace('-RCB-', '}')
 | 
						|
    tok = tok.replace('-RRB-', ')')
 | 
						|
    tok = tok.replace('-LRB-', '(')
 | 
						|
    tok = tok.replace("'T-", "'T")
 | 
						|
    tok = tok.replace('-AMP-', '&')
 | 
						|
    return tok
 |