思路
准备写个文字版的“大家来找茬”，需要找出所有形状相似的中文字符对。已知 GBK 下 33088–65278（0x8140–0xFEFE）为中文编码区间。找个字库文件，比如 FZLanTingHei-R-GBK.TTF，把每个字符的黑白像素矩阵展平成一个向量，然后计算所有字符对的余弦（cosine）相似度就可以了，算法很简单。开搞！
代码
import functools
import string

import numpy as np
from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
# Characters the glyph scan skips: a handful of solid block / bar / dot
# symbols plus every ASCII letter (presumably because they are not wanted
# among the look-alike candidates).
fobidden_chars = set(string.ascii_letters) | {
    '■', '●', '︱', '|', '▉', '▼', '▊', '█', '▇', '▇',
}
@functools.lru_cache(maxsize=8)
def _load_font(path, fontsize):
    """Load a TrueType font once per (path, fontsize) and reuse it."""
    return ImageFont.truetype(path, fontsize)


def char_to_pixels(text, path='font.ttf', fontsize=80):
    """Render ``text`` with a TrueType font and return it as a 0/1 matrix.

    Args:
        text: The string (normally a single character) to draw.
        path: Path of the TrueType font file.
        fontsize: Font size passed to the font loader; it also determines
            the canvas size.

    Returns:
        A 2-D integer array with ``2 * (fontsize + 5)`` rows and
        ``fontsize + 5`` columns. Cells are 1 where the rendered image has
        pixel value 0 and 0 everywhere else; the canvas is initialised to
        1, so untouched background maps to 0.
    """
    # The font is cached: callers render tens of thousands of characters
    # with the same font, and re-parsing the file each time is wasted work.
    font = _load_font(path, fontsize)

    # A fixed canvas is used instead of measuring the text, because the
    # measured width/height differ from character to character and every
    # character must produce an array of the same shape.
    side = fontsize + 5
    width, height = side, 2 * side

    image = Image.new('L', (width, height), 1)
    ImageDraw.Draw(image).text((0, 0), text, font=font)

    pixels = np.asarray(image)
    return np.where(pixels, 0, 1)
def display(arr):
    """Print a matrix as text: '1' for truthy cells, '0' for falsy ones.

    Each row of ``arr`` becomes one output line.
    """
    cells = np.where(arr, '1', '0')
    lines = []
    for row in cells:
        lines.append(''.join(row))
    print('\n'.join(lines))
def get_distance(vecs, me):
    """Return the cosine similarity between ``me`` and every row of ``vecs``.

    Despite the name, the result is a similarity, not a distance: larger
    values mean the vectors point in more similar directions.

    Args:
        vecs: 2-D array-like, one vector per row.
        me: 1-D array-like with as many entries as ``vecs`` has columns.

    Returns:
        1-D float array with one score per row of ``vecs``. A score is 0.0
        when either vector involved is all zeros (e.g. a glyph that renders
        blank), instead of NaN from a division by zero.
    """
    # Convert once; a plain list of row vectors would otherwise be turned
    # into an array separately by every numpy call below.
    vecs = np.asarray(vecs)
    me = np.asarray(me)

    dots = np.dot(vecs, me)
    row_norms = np.linalg.norm(vecs, axis=1)
    me_norm = np.linalg.norm(me)

    scores = np.zeros(row_norms.shape, dtype=float)
    if me_norm == 0:
        return scores

    # Entries whose row norm is zero keep the 0.0 they were initialised to.
    np.divide(dots, row_norms, out=scores, where=row_norms != 0)
    return scores / me_norm
def get_closest_chars(li_arrs, vec, li_gbk_chars, number=6):
    """Return the ``number`` characters whose vectors score highest against ``vec``.

    ``li_arrs`` and ``li_gbk_chars`` are parallel sequences: the vector at
    index i belongs to the character at index i. Scores come from
    ``get_distance`` and the result is ordered best match first.
    """
    scores = get_distance(li_arrs, vec)
    # Negate so that ascending argsort yields the highest scores first.
    ranking = np.argsort(-scores)
    closest = []
    for idx in ranking[:number]:
        closest.append(li_gbk_chars[idx])
    return closest
if __name__ == '__main__':
    # Parallel lists: li_arrs[i] is the flattened bitmap of li_gbk_chars[i].
    li_gbk_chars = []
    li_arrs = []
    # Two-byte GBK code values 33088-65278 (0x8140-0xFEFE), inclusive.
    for ind in range(33088, 65278 + 1):
        try:
            gbk_char = ind.to_bytes(2, 'big').decode("GBK")
        except UnicodeDecodeError:
            # Not every value in the range is a valid GBK sequence; skip it.
            # Only the decode is guarded so that real failures (missing
            # font file, Ctrl-C) are no longer silently swallowed.
            continue
        if gbk_char in fobidden_chars:
            continue
        arr = char_to_pixels(gbk_char, './FZLanTingHei-R-GBK.TTF')
        li_gbk_chars.append(gbk_char)
        # Flatten the 2-D bitmap into a 1-D vector.
        li_arrs.append(arr.reshape(-1))