Python如何提取html中文本到txt
作者:彳亍261
这篇文章主要介绍了Python如何提取html中文本到txt问题,具有很好的参考价值,希望对大家有所帮助。如有错误或未考虑完全的地方,望不吝赐教。
Python提取html中文本到txt
正则去标签方式
# -*- coding: utf-8 -*-
import re

# One complete tag: '<', at least one non-'>' character, then '>'.
# The negated class already matches newlines, so no re.S flag is needed.
_TAG_RE = re.compile(r'<[^>]+>')


def html_tag_rm(content: str) -> str:
    """Return *content* with every ``<...>`` tag removed.

    This is a purely textual strip: the text between tags (including the
    bodies of ``<script>``/``<style>`` elements) is kept as-is and HTML
    entities are not decoded.

    Args:
        content: HTML markup as a string.

    Returns:
        The input with all substrings matching ``<[^>]+>`` deleted.
    """
    return _TAG_RE.sub('', content)
nltk
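比较笨重
需要安装依赖 nltk, numpy, pyyaml。注意:nltk.clean_html 仅在 NLTK 2.x 中可用,在 NLTK 3.x 中调用它会抛出 NotImplementedError(官方建议改用 BeautifulSoup 的 get_text())。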
# -*- coding: utf-8 -*-
import re


def html_tag_rm(content: str) -> str:
    """Strip HTML markup from *content*, preferring ``nltk.clean_html``.

    ``nltk.clean_html`` only does real work in NLTK 2.x; in NLTK 3.x it is
    a stub that raises ``NotImplementedError``.  When nltk is not
    installed, or its ``clean_html`` is missing or unimplemented, this
    function falls back to deleting every ``<...>`` tag with a regular
    expression so that callers still get text back.

    Args:
        content: HTML markup as a string.

    Returns:
        The text of *content* with markup removed.
    """
    try:
        # Imported lazily: nltk is heavy and optional for this helper.
        import nltk
        return nltk.clean_html(content)
    except (ImportError, AttributeError, NotImplementedError):
        pass
    return re.sub(r'<[^>]+>', '', content)
HTMLParser(Python 3 中位于 html.parser 模块)
import re
from html.parser import HTMLParser
from sys import stderr
from traceback import print_exc

# Runs of spaces, tabs and line breaks inside a text node.
_WHITESPACE_RE = re.compile('[ \t\r\n]+')


class _DeHTMLParser(HTMLParser):
    """HTML parser that accumulates the document's text as plain text.

    Text nodes are whitespace-normalised and joined with single spaces;
    ``<p>`` and ``<br>`` tags are turned into line breaks.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.__text = []

    def handle_data(self, data):
        """Append a text node, collapsing internal whitespace runs."""
        text = data.strip()
        if text:
            text = _WHITESPACE_RE.sub(' ', text)
            self.__text.append(text + ' ')

    def handle_starttag(self, tag, attrs):
        """Emit a blank line for ``<p>`` and a line break for ``<br>``."""
        if tag == 'p':
            self.__text.append('\n\n')
        elif tag == 'br':
            self.__text.append('\n')

    def handle_startendtag(self, tag, attrs):
        """Emit a blank line for a self-closing ``<br/>``."""
        if tag == 'br':
            self.__text.append('\n\n')

    def text(self):
        """Return the collected text with outer whitespace stripped."""
        return ''.join(self.__text).strip()


def dehtml(text):
    """Convert HTML markup in *text* to plain text.

    Args:
        text: HTML markup as a string.

    Returns:
        The extracted plain text.  If parsing fails, the traceback is
        printed to stderr and the original *text* is returned unchanged.
    """
    try:
        parser = _DeHTMLParser()
        parser.feed(text)
        parser.close()
        return parser.text()
    except Exception:
        # Best effort: report the failure but hand the input back.
        print_exc(file=stderr)
        return text


def main():
    """Run ``dehtml`` on a small sample document and print the result."""
    text = r'''
        <html>
            <body>
                <b>Project:</b> DeHTML<br>
                <b>Description</b>:<br>
                This small script is intended to allow conversion from HTML markup to
                plain text.
            </body>
        </html>
    '''
    print(dehtml(text))


if __name__ == '__main__':
    main()
Python提取txt正则内容
其中:
pattern = re.compile(r'^.*\[\"\"subject\"\"\] \[([^\[]*)\].*')
为修改的正则匹配部分
import re

import pandas as pd

# Captures the contents of the bracket pair that follows `[""subject""] `.
# Example: for the line
#   ""i;octet"" [""subject""] [""小木虫""] ,accounts
# the captured group is  ""小木虫""
_SUBJECT_RE = re.compile(r'^.*\[\"\"subject\"\"\] \[([^\[]*)\].*')


def extract_subjects(lines):
    """Collect the subject field from every line that contains one.

    Args:
        lines: An iterable of text lines (for example an open text file).

    Returns:
        A list with one entry per matching line; each entry is the list
        returned by ``findall`` for that line.  Lines without a match are
        skipped.
    """
    return [found for found in map(_SUBJECT_RE.findall, lines) if found]


def main(src='C:/data1.txt', dst='C:/tol2.csv'):
    """Read *src*, print the extracted subjects and write them to *dst*.

    Args:
        src: Path of the UTF-8 text file to scan.
        dst: Path of the CSV file to write via pandas.
    """
    with open(src, 'r', encoding='UTF-8') as f:
        tol = extract_subjects(f)
    print(tol)
    pd.DataFrame(tol).to_csv(dst)


if __name__ == '__main__':
    main()
总结
以上为个人经验,希望能给大家一个参考,也希望大家多多支持脚本之家。