前言
利用此方法针对大量的报名表进行信息提取~
安装工具包
pip install python-docx xlwt pywin32
表格信息
代码
注意：读取的Word文件只能是docx后缀的噢~若doc文件太多，可利用以下方法批量转换为docx
import os
import docx
import xlwt
import shutil
from win32com import client as wc
# Convert a legacy .doc document into a .docx document.
def convert_word(path, target_dir):
    """Save the Word document at ``path`` as a ``.docx`` file in ``target_dir``.

    The conversion is done by driving Microsoft Word through COM
    (``win32com``). The output file keeps the source file's base name and
    gets a ``.docx`` extension.

    Args:
        path: Path of the source document.
        target_dir: Directory that receives the converted file.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    # Word is handed absolute paths; it runs as a separate application and
    # cannot be assumed to share this script's working directory.
    source_path = os.path.abspath(path)
    target_path = os.path.abspath(os.path.join(target_dir, stem + '.docx'))

    word = wc.Dispatch('Word.Application')
    try:
        doc = word.Documents.Open(source_path)
        try:
            # 12 is Word's wdFormatXMLDocument save format, i.e. .docx.
            doc.SaveAs(target_path, 12)
        finally:
            # Close the document even when saving fails.
            doc.Close()
    finally:
        # Always shut Word down so a failed conversion does not leave an
        # orphaned Word process behind.
        word.Quit()
表格信息提取
# Pull the registration fields out of every table in the document.
doc = docx.Document('报名表.docx')
tables = doc.tables
# Labels to look for; the cell that follows a label in the same row is
# taken as that field's value.
fields = [
    '姓名',
    '性别',
    '专业',
    '班级',
    '联系电话',
    '电子邮箱',
]
for table in tables:
    # Every table starts with all fields present but empty.
    # NOTE(review): table_data is not consumed in this excerpt; presumably
    # the code that follows writes it out -- confirm.
    table_data = dict.fromkeys(fields, '')
    for row in table.rows:
        # Space-free text of the previous distinct cell in this row.
        tmp_last = ''
        for cell in row.cells:
            text = cell.text.strip()
            label = text.replace(' ', '')
            # row.cells can yield the same merged cell more than once, so
            # repeats are skipped. Both sides are compared in space-free form;
            # otherwise a repeated label containing spaces (e.g. "姓 名")
            # would be stored as its own value.
            if label == tmp_last:
                continue
            if tmp_last in fields:
                # The previous cell was a known label, so this is its value.
                table_data[tmp_last] = text
            tmp_last = label