parser.py 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173
  1. import email
  2. from email.message import Message
  3. from email.policy import default
  4. import json
  5. from datetime import datetime
  6. from lxml import etree
  7. def clean_xml(xml_string):
  8. """移除 XML 中的命名空间,包括 xmlns 声明"""
  9. parser = etree.XMLParser(remove_blank_text=True)
  10. root = etree.XML(xml_string, parser)
  11. # 遍历所有元素,移除命名空间
  12. for elem in root.iter():
  13. if '}' in elem.tag: # 如果有命名空间
  14. elem.tag = elem.tag.split('}', 1)[1] # 移除命名空间
  15. if 'version' in elem.attrib:
  16. del elem.attrib['version']
  17. etree.cleanup_namespaces(root)
  18. return etree.tostring(root, pretty_print=True, encoding="unicode")
  19. def parse_xml_to_dict(xml):
  20. """
  21. 将xml文件解析成字典形式,参考tensorflow的recursive_parse_xml_to_dict
  22. Args:
  23. xml: xml tree obtained by parsing XML file contents using lxml.etree
  24. Returns:
  25. Python dictionary holding XML contents.
  26. """
  27. if len(xml) == 0: # 遍历到底层,直接返回tag对应的信息
  28. return{xml.tag: xml.text}
  29. result= {}
  30. for child in xml:
  31. child_result= parse_xml_to_dict(child) # 递归遍历标签信息
  32. if child.tag not in ['pictureInfo', 'ANPR', "PictureURLInfo"]:
  33. result[child.tag] = child_result[child.tag]
  34. else:
  35. if child.tag not in result: # 因为object可能有多个,所以需要放入列表里
  36. result[child.tag] = []
  37. result[child.tag].append(child_result[child.tag])
  38. return {xml.tag: result}
  39. def parse_multipart(data, boundary):
  40. # 按 boundary 分割数据
  41. parts = data.split(boundary)
  42. parsed_data = {}
  43. for part in parts:
  44. # print("part : ", part)
  45. part = part.strip()
  46. # print("part strip: ", part)
  47. if not part: continue
  48. # 获取头部和主体部分
  49. headers, _, body = part.partition(b"\r\n\r\n")
  50. # print("headers :", headers)
  51. if not headers: continue
  52. header_dict = email.message_from_bytes(headers, policy=default)
  53. # print("header_dict : ", header_dict)
  54. # 获取 Content-Disposition
  55. content_disposition = header_dict.get("Content-Disposition", "")
  56. # print("content_disposition :", content_disposition)
  57. if "name=" in content_disposition:
  58. name = content_disposition.split('name="')[1].split('"')[0]
  59. filename = None
  60. if 'filename="' in content_disposition:
  61. filename = content_disposition.split('filename="')[1].split('"')[0]
  62. # 提取 Content-Type
  63. content_type = header_dict.get("Content-Type", "").strip()
  64. # 根据 Content-Type 处理内容
  65. if content_type == "application/json":
  66. body_content = json.loads(body.decode("utf-8"))
  67. elif content_type == "application/xml":
  68. body = body.decode('utf-8').replace("</ ", "</").encode()
  69. clean_xml_str = clean_xml(body)
  70. xml = etree.fromstring(clean_xml_str)
  71. body_content = parse_xml_to_dict(xml) # 直接存储 XML 原始内容
  72. elif content_type.startswith("image/"):
  73. body_content = body # 图像以二进制形式存储
  74. # with open(filename, "wb") as f:
  75. # f.write(body)
  76. else:
  77. body_content = body.decode() # 其他情况
  78. # 存储解析的数据
  79. parsed_data[name] = {
  80. "filename": filename,
  81. "content_type": content_type,
  82. "content": body_content,
  83. }
  84. return parsed_data
  85. def parser_json_data(data):
  86. infos = []
  87. images = []
  88. for _, value in data.items():
  89. content_type = value.get("content_type")
  90. if not content_type:
  91. continue
  92. if content_type == "application/xml":
  93. # 安全提取嵌套字段
  94. content = value.get("content")
  95. if content is None:
  96. continue # 跳过内容为空的记录
  97. event_alert = content.get("EventNotificationAlert")
  98. if event_alert is None:
  99. continue # 如果没有 EventNotificationAlert,跳过当前记录
  100. iso_time = event_alert.get("dateTime")
  101. dt = datetime.fromisoformat(iso_time)
  102. dateTime = dt.strftime("%Y-%m-%d %H:%M:%S")
  103. tfs = event_alert.get("TFS")
  104. if tfs is None:
  105. continue # 如果没有 TFS,跳过当前记录
  106. VehicleInfo = tfs.get("VehicleInfo")
  107. PlateInfo = tfs.get("PlateInfo")
  108. picture_info_list = event_alert.get("PictureURLInfoList", {}).get("PictureURLInfo", [])
  109. # 对每个字段进行逐一检查,确保有效后再添加到 infos
  110. if VehicleInfo is None or PlateInfo is None or not picture_info_list:
  111. continue # 如果任何重要字段为空,跳过当前记录
  112. infos.append({
  113. "dateTime" : dateTime,
  114. "pictureInfo": picture_info_list,
  115. "VehicleInfo": VehicleInfo,
  116. "PlateInfo": PlateInfo
  117. })
  118. elif content_type == "image/jpeg":
  119. # 安全提取图片内容
  120. filename = value.get("filename")
  121. image_content = value.get("content")
  122. # 确保图片相关字段都有效后才添加
  123. if filename and image_content:
  124. images.append({
  125. "filename": filename,
  126. "content": image_content
  127. })
  128. return infos, images
  129. if __name__ == "__main__":
  130. with open("tfs_0.txt", "r") as f:
  131. multipart_data = eval(f.read())
  132. # print("===============", multipart_data[:3000])
  133. # 示例调用
  134. # multipart_data = b'---------------------------7e13971310878\r\n\r\nContent-Disposition: form-data; name="Q3531473496730912851860"; filename="radarVideoDetection.json"\r\n\r\nContent-Type: application/json\r\n\r\nContent-Length: 400\r\n\r\n\r\n\r\n{"ipAddress":"172.19.152.181","protocol":"HTTP","macAddress":"a4:d5:c2:02:96:bf","channelID":1,"dateTime":"2024-12-03T15:30:41.516+08:00","activePostCount":1,"eventType":"radarVideoDetection","eventState":"active","eventDescription":"Radar Video Detection","freezingTimeInfo":{"freezingTimestamp":96730747,"freezingSystemDateTime":"2024-12-03T15:30:41.360"},"Datas":[],"algorithmDataFrames":"966772"}\r\n\r\n---------------------------7e13971310878--\r\n\r\n'
  135. # print(multipart_data[0:3000])
  136. boundary = b'---------------------------7e13971310878'
  137. result = parse_multipart(multipart_data, boundary)
  138. print(result)
  139. infos, images = parser_json_data(result)
  140. print(infos, "======", images)
  141. # with open("res.xml", "r") as f:
  142. # xml_str = f.read()
  143. # clean_xml_str = clean_xml(xml_str.encode())
  144. # xml = etree.fromstring(clean_xml_str)
  145. # print(parse_xml_to_dict(xml))