[Script] Parsing Sogou Lexicons with Python

Table of Contents
  1. Project Introduction
  2. Technical Background
  3. Code
  4. References

Project Introduction

I recently needed to annotate named entities for domain-specific terms at work. With little manual labeling capacity available, I used a dictionary to pre-annotate a batch of data. After digging through a lot of resources, I found that input-method lexicons still offer the most comprehensive coverage of such terms.

Technical Background

  • A downloaded Sogou lexicon is a binary file in the .scel format. The text inside is stored as UTF-16LE (two bytes per character), so the file can be parsed by walking the known offsets of each section (a minimal sketch of reading one field this way follows this list).
  • Sogou lexicon download page: https://pinyin.sogou.com/dict/
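
As a quick illustration of the "two bytes per character" point, the sketch below reads the lexicon-name field between offsets 0x130 and 0x338 and decodes it as UTF-16LE. The offsets are the same ones used by the full class further down; the file name "sou_gou.scel" is just a placeholder for any downloaded lexicon.

    # Minimal sketch: decode one header field of a .scel file as UTF-16LE.
    # "sou_gou.scel" is a placeholder path for a downloaded lexicon.
    with open("sou_gou.scel", "rb") as f:
        data = f.read()

    name_bytes = data[0x130:0x338]          # lexicon name field
    # Each character occupies two bytes; the unused tail of the field is NUL padding.
    name = name_bytes.decode("utf-16-le").rstrip("\x00")
    print(name)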

Code

import struct


class SouGo(object):
    """
    Parse a Sogou .scel lexicon file.

    Usage:
        sg = SouGo(file_path="./词表.scel")
        sg.save(path=new_path)
    """
    start_pinyin = 0x1540  # offset where the pinyin table starts
    start_word = 0x2628    # offset where the word entries start
    lexicon_name = slice(0x130, 0x338)         # lexicon name
    lexicon_type = slice(0x338, 0x540)         # lexicon category
    lexicon_describe = slice(0x540, 0xd40)     # description
    lexicon_demo = slice(0xd40, start_pinyin)  # sample words

    def __init__(self, file_path):
        with open(file_path, 'rb') as f:
            self.data = f.read()
        self.pinyin = {}
        self.dictionary = []
        self.init()

    def init(self):
        self.info()
        self.build_pinyin(self.data[self.start_pinyin:self.start_word])
        self.build_tokens(self.data[self.start_word:])

    def info(self):
        def show(slice_):
            return self.bytes2str(self.data[slice_])

        print("-" * 60 + "\n")
        print(f"Lexicon name: {show(self.lexicon_name)}")
        print(f"Lexicon type: {show(self.lexicon_type)}")
        print(f"Description: {show(self.lexicon_describe)}")
        print(f"Examples: {show(self.lexicon_demo)}")
        print("-" * 60 + "\n")
        print("\n")
        return None

    def bytes2str(self, data):
        """Decode a UTF-16LE byte sequence, skipping NUL padding."""
        pos = 0
        string = ''
        while pos < len(data):
            c = chr(self.unpack(data, pos))
            if c != chr(0):
                string += c
            pos += 2
        return string

    @staticmethod
    def unpack(data, position):
        """Read an unsigned 16-bit integer at `position`."""
        return struct.unpack('H', bytes([data[position], data[position + 1]]))[0]

    def build_pinyin(self, data):
        """Build the pinyin table: index -> pinyin string."""
        data = data[4:]  # skip the 4-byte table header
        pos = 0
        while pos < len(data):
            index = self.unpack(data, pos)
            pos, pinyin = self._match(data, pos + 2, func=self.bytes2str)
            self.pinyin[index] = pinyin
        return self.pinyin

    def get_pinyin(self, data):
        """Look up the pinyin string for a sequence of pinyin indexes."""
        pos = 0
        ret = ''
        while pos < len(data):
            index = self.unpack(data, pos)
            ret += self.pinyin[index]
            pos += 2
        return ret

    def _match(self, data, position, func):
        """Shared helper: read a 2-byte length, then apply `func` to that many bytes."""
        length = self.unpack(data, position)
        position += 2
        ret = func(data[position:position + length])
        position += length
        return position, ret

    def build_tokens(self, data):
        """Build the word list."""
        pos = 0
        while pos < len(data):
            same = self.unpack(data, pos)  # number of homophones sharing this pinyin
            pos, pinyin = self._match(data, pos + 2, func=self.get_pinyin)
            for i in range(same):
                pos, word = self._match(data, pos, func=self.bytes2str)
                ext_len = self.unpack(data, pos)  # length of the extension block
                pos += 2
                count = self.unpack(data, pos)  # word frequency
                self.dictionary.append((count, pinyin, word))  # save the entry
                pos += ext_len  # skip to the next word
        return self.dictionary

    def __call__(self, *args, **kwargs):
        return self.dictionary

    def save(self, path, all_info=False):
        if all_info:
            dictionary = ["\t\t\t".join(map(str, i)) + "\n" for i in self.dictionary]
        else:
            dictionary = [f"{i[-1]}\n" for i in self.dictionary]
        with open(path, "w", encoding="utf-8") as f:
            f.writelines(dictionary)

    def __len__(self):
        return len(self.dictionary)
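
A short usage sketch follows; the input file name is a placeholder for any downloaded .scel lexicon, and the output paths are arbitrary:

    # Usage sketch: "./计算机词汇大全.scel" stands in for any downloaded lexicon file.
    sg = SouGo(file_path="./计算机词汇大全.scel")    # prints the lexicon header info
    print(len(sg))                                   # number of parsed entries
    print(sg()[:5])                                  # first few (frequency, pinyin, word) tuples
    sg.save(path="./words.txt")                      # one word per line
    sg.save(path="./words_full.txt", all_info=True)  # frequency, pinyin and word, tab-separated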

References

寒江共雪 blog
