【脚本】python解析搜狗词库

项目简介

最近工作中需要做一批专有名词的实体识别标注。在缺少人工的情况下，决定先用字典进行一批预标注；查了不少资料后发现，还是输入法词库覆盖的专有名词比较全面。

技术背景

搜狗词库下载后的文件为 .scel 二进制格式，文本采用 UTF-16LE（小端 Unicode）编码，每两个字节表示一个字符；按各部分的固定偏移位置依次读取即可完成解析。
搜狗词库链接：https://pinyin.sogou.com/dict/

相关代码

import struct

class SouGo(object):
    """Parser for Sogou input-method lexicon files (``.scel``).

    The whole file is read into memory on construction, its header fields
    are printed, and the pinyin table and word table are decoded.  Decoded
    entries are stored in ``self.dictionary`` as ``(count, pinyin, word)``
    tuples.

    Example::

        sg = SouGo(file_path="./lexicon.scel")
        sg.save(path=new_path)
    """
    start_pinyin = 0x1540  # start offset of the pinyin table
    start_word = 0x2628  # start offset of the word table
    # Byte ranges of the textual header fields printed by ``info``.
    lexicon_name = slice(0x130, 0x338)
    lexicon_type = slice(0x338, 0x540)
    lexicon_describe = slice(0x540, 0xd40)
    lexicon_demo = slice(0xd40, start_pinyin)

    def __init__(self, file_path):
        """Read ``file_path`` as raw bytes and parse it immediately."""
        with open(file_path, 'rb') as f:
            self.data = f.read()
        self.pinyin = {}  # pinyin index -> pinyin syllable
        self.dictionary = []  # list of (count, pinyin, word)
        self.init()

    def init(self):
        """Print the header, then build the pinyin table and the word list."""
        self.info()
        self.build_pinyin(self.data[self.start_pinyin:self.start_word])
        self.build_tokens(self.data[self.start_word:])

    def info(self):
        """Print the lexicon name, type, description and sample words."""
        def show(slice_):
            return self.bytes2str(self.data[slice_])

        print("-" * 60 + "\n")
        print(f"词库名：{show(self.lexicon_name)}")
        print(f"词库类型：{show(self.lexicon_type)}")
        print(f"描述信息：{show(self.lexicon_describe)}")
        print(f"词库示例：{show(self.lexicon_demo)}")
        print("-" * 60 + "\n")
        print("\n")
        return None

    def bytes2str(self, data):
        """Decode little-endian 2-byte code units into a string.

        NUL code units are dropped.  A trailing odd byte, which cannot form
        a full code unit, is ignored.  Valid surrogate pairs are combined
        into a single character; unpaired surrogates are passed through
        unchanged rather than raising.
        """
        usable = len(data) - (len(data) % 2)
        text = bytes(data[:usable]).decode('utf-16-le', errors='surrogatepass')
        return text.replace('\x00', '')

    @staticmethod
    def unpack(data, position):
        """Return the unsigned 16-bit little-endian integer at ``position``.

        Raises ``IndexError`` if fewer than two bytes are available.
        """
        # '<H' pins little-endian; a bare 'H' would follow the host's native
        # byte order and misread the file on big-endian machines.
        return struct.unpack('<H', bytes((data[position], data[position + 1])))[0]

    def build_pinyin(self, data):
        """Build the pinyin table (index -> syllable) from ``data``.

        Each entry is a 2-byte index followed by a length-prefixed string.
        Returns ``self.pinyin``.
        """
        # Skip the 4-byte table header (presumably the entry count -- verify
        # against the format description).
        data = data[4:]
        pos = 0
        while pos < len(data):
            index = self.unpack(data, pos)
            pos, pinyin = self._match(data, pos + 2, func=self.bytes2str)
            self.pinyin[index] = pinyin
        return self.pinyin

    def get_pinyin(self, data):
        """Translate a run of 2-byte pinyin indexes into a pinyin string.

        Raises ``KeyError`` if an index is not present in the pinyin table.
        """
        return ''.join(
            self.pinyin[self.unpack(data, pos)]
            for pos in range(0, len(data), 2)
        )

    def _match(self, data, position, func):
        """Read one length-prefixed field starting at ``position``.

        The field is a 2-byte byte-length followed by that many bytes, which
        are passed to ``func``.  Returns ``(next_position, func_result)``.
        """
        length = self.unpack(data, position)
        position += 2
        ret = func(data[position:position + length])
        position += length
        return position, ret

    def build_tokens(self, data):
        """Build the word list from ``data`` and return ``self.dictionary``.

        Each group starts with a 2-byte homophone count and a
        length-prefixed list of pinyin indexes; it is followed by that many
        words, each a length-prefixed string plus a length-prefixed
        extension block whose first two bytes are stored as the count.
        """
        pos = 0
        while pos < len(data):
            same = self.unpack(data, pos)  # number of words sharing this pinyin
            pos, pinyin = self._match(data, pos + 2, func=self.get_pinyin)
            for _ in range(same):
                pos, word = self._match(data, pos, func=self.bytes2str)
                ext_len = self.unpack(data, pos)
                pos += 2  # now at the extension block (word frequency first)
                count = self.unpack(data, pos)
                self.dictionary.append((count, pinyin, word))
                pos += ext_len  # jump to the start of the next word
        return self.dictionary

    def __call__(self, *args, **kwargs):
        """Return the parsed ``(count, pinyin, word)`` list."""
        return self.dictionary

    def save(self, path, all_info=False):
        """Write the lexicon to ``path`` as UTF-8 text, one entry per line.

        With ``all_info`` true every line holds count, pinyin and word
        separated by three tabs; otherwise only the word is written.
        """
        if all_info:
            dictionary = ["\t\t\t".join(map(str, i)) + "\n" for i in self.dictionary]
        else:
            dictionary = [f"{i[-1]}\n" for i in self.dictionary]
        with open(path, "w", encoding="utf-8") as f:
            f.writelines(dictionary)

    def __len__(self):
        """Return the number of parsed entries."""
        return len(self.dictionary)

资料参考

寒江共雪博客