码迷,mamicode.com
首页 > 其他好文 > 详细

根据 GT 标注将带线表格生成无线表格

时间:2019-05-15 14:35:51      阅读:116      评论:0      收藏:0      [点我收藏+]

标签:encoding   app   处理   dir   os.path   ima   objects   ict   span   

JSON 解析练习与 Python 图像处理练习;表格包围框的毛刺问题尚待解决。

  1 # -*- coding: utf-8 -*-
  2 # coding: utf-8
  3 from PIL import Image, ImageDraw
  4 import cv2
  5 import os
  6 import csv
  7 import json
  8 color = {  # index -> three-channel colour triple (presumably RGB or BGR - confirm); not referenced elsewhere in this listing
  9     0: [255, 0, 0],
 10     1: [0, 255, 0],
 11     2: [0, 0, 255]
 12 }
 13 from PIL import Image
 14 import numpy as np
 15 
 16 # 取批量点取RGB众数,来推算背景色值
 17 def publicnum(num, d=0):
 18     dictnum = {}
 19     for i in range(len(num)):
 20         if str(num[i]) in dictnum.keys():
 21             dictnum[str(num[i])] += 1
 22         else:
 23             dictnum.setdefault(str(num[i]), 1)
 24     maxnum = 0
 25     maxkey = [255 255 255]
 26     for k, v in dictnum.items():
 27         if v >= maxnum:
 28             maxnum = v
 29             maxkey = k
 30     return maxkey
 31 
 32 page_img_dir = "JPG"
 33 output_dir = "年报_PDF_TABLE_JPG_eliminate_lines-5-15-final"
 34 
 35 isExists = os.path.exists(output_dir)
 36 if not isExists:
 37     os.makedirs(output_dir)
 38 csv_file = csv.reader(open(表格结构标注-带线年报_UTF-8.csv, r, encoding="gbk"))
 39 data_list = []
 40 for data in csv_file:
 41     data_list.append(data)
 42 
 43 print("page number: ", len(data_list) - 1)
 44 
 45 for data in data_list[1:]:  # 跳过第一行
 46     img_path = data[0]
 47     img_name = img_path.split(/)[-1]  # /分割后最后一个为名字
 48     pdf_name = img_name.split(_)[0]  # -分割后 第一个是名字
 49     local_img_path = os.path.join(page_img_dir, img_name)  # 拼接路径
 50     print(local_img_path)
 51     annotation = json.loads(data[2])  # json单元格读取
 52     objects = annotation[objects]  # object是一个列表,读取该列表
 53     cnt = 0
 54     tu = Image.open(local_img_path)
 55     page_img = np.array(tu)
 56     for page_object in objects:
 57         if cur in page_object.keys():
 58             cur = page_object[cur]
 59         else:
 60             cur = cnt
 61         polygon = page_object[polygon][ptList]
 62         x_list = [p[x] for p in polygon]
 63         y_list = [p[y] for p in polygon]
 64         x_min = min(x_list)
 65         x_max = max(x_list)
 66         y_min = min(y_list)
 67         y_max = max(y_list)
 68         if abs(x_max - x_min) < 20:  # 纵向线条
 69             xx = int((x_min + x_max) / 2)
 70             inline_y_list = [y_max+20, y_min]
 71             #寻找相交横线分割点
 72             for in_page_object in objects:
 73                 in_polygon = in_page_object[polygon][ptList]
 74                 in_x_list = [in_p[x] for in_p in in_polygon]
 75                 in_y_list = [in_p[y] for in_p in in_polygon]
 76                 in_x_min = min(in_x_list)
 77                 in_x_max = max(in_x_list)
 78                 in_y_min = min(in_y_list)
 79                 in_y_max = max(in_y_list)
 80                 if in_y_max - in_y_min < 20:  # 判断为横线
 81                     if in_x_max+5 >= xx and in_x_min-5 <= xx:  # 判断相交
 82                         point_y = in_y_min
 83                         inline_y_list.append(point_y)
 84                         if 0<abs(y_max-point_y)< 10:
 85                             try:
 86                                 inline_y_list.remove(max(y_max+20, point_y))
 87                                 inline_y_list.append(min(y_max+20, point_y))
 88                             except:
 89                                 pass
 90                         elif 0 < abs(y_min-point_y) < 10:
 91                             try:
 92                                 inline_y_list.remove(min(y_min, point_y))
 93                                 inline_y_list.append(max(y_min, point_y))
 94                             except:
 95                                 pass
 96             inline_y_list = list({}.fromkeys(inline_y_list).keys())
 97             inline_y_list.sort()
 98             inline_y_list[-1]+=5
 99             if inline_y_list[-1]>2339:
100                 inline_y_list[-1]=2339
101             # 线条分割结束
102             for i in range(0, inline_y_list.__len__()):
103                 if i < inline_y_list.__len__() - 1:
104                     # 开始取样
105                     back_colors = []
106                     for yy in range(inline_y_list[i], inline_y_list[i + 1]):
107                         if xx + 8 < 1654:
108                             back_colors.append(page_img[yy, xx + 8])
109                         else:
110                             back_colors.append(page_img[yy, xx - 8])
111                     back_color = publicnum(back_colors)
112                     back_color = back_color[1:-1]
113                     try:
114                         back_color = back_color.split( )
115                     except:
116                         back_color = back_color.split(   )
117                         print(type(back_color))
118                     if len(back_color) > 3:
119                         back_color = list(filter(None, back_color))
120                     # 取样结束
121                     # 纵向填色
122 
123                     for yy in range(inline_y_list[i]-4, inline_y_list[i + 1]-4):
124                         if y_min-20<inline_y_list[i]<y_max+20 :
125                             for ranging in range(-4, x_max-x_min+5):
126                                 if x_min+ranging < 1654 and x_min+ranging >= 0:
127                                         page_img[yy, x_min + ranging] = back_color
128                                 else:
129                                     pass
130                         else:
131                             pass
132         elif abs(y_max - y_min) < 20:  # 横向线条
133             yy = int((y_min + y_max) / 2)
134             inline_x_list = [x_max+20, x_min]
135             # 寻找相交横线分割点
136             for in_page_object2 in objects:
137                 polygon2 = in_page_object2[polygon][ptList]
138                 in_x_list2 = [p[x] for p in polygon2]
139                 in_y_list2 = [p[y] for p in polygon2]
140                 in_x_min = min(in_x_list2)
141                 in_x_max = max(in_x_list2)
142                 in_y_min = min(in_y_list2)
143                 in_y_max = max(in_y_list2)
144                 if abs(in_x_max - in_x_min) < 20:  # 判断为纵线
145                     if in_y_max+5 >= y_min and in_y_min-5 <= y_max:  # 判断相交
146                         point_x = in_x_min
147                         inline_x_list.append(point_x)
148                         if 0<abs(x_max-point_x)<10:
149                             try:
150                                 inline_x_list.remove(max(x_max+20, point_x))
151                                 inline_x_list.append(min(x_max+20, point_x))
152                             except:
153                                 pass
154                         elif 0<abs(x_min-point_x)<10:
155                             try:
156                                 inline_x_list.remove(min(x_min, point_x))
157                                 inline_x_list.append(max(x_min, point_x))
158                             except:
159                                 pass
160                         else:
161                             pass
162             inline_x_list = list({}.fromkeys(inline_x_list).keys())
163             inline_x_list.sort()
164             #inline_x_list[-1]+=5
165             # 线条分割结束
166             for i in range(0, inline_x_list.__len__()):
167                 if i < inline_x_list.__len__() - 1:
168                     # 开始取样
169                     back_colors = []
170                     for xx in range(inline_x_list[i], inline_x_list[i + 1]):
171                         if yy+8 < 2339:
172                             back_colors.append(page_img[yy + 8, xx])
173                         else:
174                             back_colors.append(page_img[yy - 8, xx])
175                     back_color = publicnum(back_colors)
176                     back_color = back_color[1:-1]
177                     try:
178                         back_color = back_color.split( )
179                     except:
180                         back_color = back_color.split(   )
181 
182                     if len(back_color) > 3:
183                         back_color = list(filter(None, back_color))
184                     # 取样结束
185                     # 横线填色
186                     for xx in range(inline_x_list[i]-4, inline_x_list[i + 1]-4):
187                         if x_min-20<inline_x_list[i]<x_max+20:
188                             for ranging in range(-4, y_max-y_min+5):
189                                 if y_min+ranging < 2339 and y_min+ranging >= 0:
190                                     page_img[y_min+ranging, xx] = back_color
191                                 elif y_min+ranging>=2339:
192                                     page_img[2338, xx] = back_color
193                                 else:
194                                     page_img[0, xx] = back_color
195                         else:
196                             pass
197         else:
198             print("no such line", x_min:, x_min,x_max:, x_max, y_max:, y_max, y_min:, y_min)
199     tu = Image.fromarray(page_img.astype(uint8))
200     output_path = os.path.join(output_dir, img_name.split(.)[0] + _ + str(cur) + ".jpg")
201     tu.save(output_path)
202     cv2.imwrite(output_path, page_img)
203     cnt += 1

 

带线表格据gt生成无线表格

标签:encoding   app   处理   dir   os.path   ima   objects   ict   span   

原文地址:https://www.cnblogs.com/wind-chaser/p/10868935.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!