# Convert HTML files to text files.

import glob
import io
import os
import urllib2

from bs4 import BeautifulSoup

os.chdir(“./margeBlockRastParam”)

#confirm = input(“[c]Confirm or [v]Void: “)
#    if confirm != ‘c’ and confirm != ‘v’:
cwd = os.getcwd()
print “*******************************************************”
print ” You are converting html to txt file under directory : \n”, cwd
print “*******************************************************”
confirm = raw_input(“[y]es or [n]o: “)
if confirm == ‘y’:
for file in sorted(glob.glob(“*.html”)):
filename, fileExtension = os.path.splitext(file)
print filename + fileExtension,
f = open(filename + fileExtension,’r’)
soup = BeautifulSoup(f, “lxml”)
#print soup.get_text()
f.close()

fw = open(“temp.txt”,’w’)
fw.write(soup.get_text().encode(‘utf-8′))
fw.close()

with open(“temp.txt”,’r’) as f:
fw = open(filename + “.txt”,’w+’)
# remove first line to print while it is html title
next(f)
for i in f:
if i !=”\n”:
#print i
fw.write(i)

fw.close()
f.close()
os.remove(“temp.txt”)
print  ‘->’, filename+’.txt’
print “complete !!!”
else:
print “Abroad……”

Please follow and like us: