Mëa Cúlpa's HideOut: Simple HTML cleaner

# -*- coding: utf-8 -*-
#!/usr/bin/env python
'''
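Simple HTML cleaner: re-indent the markup of the given HTML files, one tag
or text run per line, and print the result to standard output.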
'''

import sys
from optparse import OptionParser
import re
import os

# One or more consecutive CR/LF characters; substituting '\n' for a match
# normalises line endings and collapses blank lines in one pass.
newline = re.compile(r'[\r\n]+')
# A line consisting of exactly one tag.  Groups:
#   1: '/' when the tag is a closing tag (</p>), else ''
#   2: the tag name
#   3: '/' when the tag is self-closing (<br/>), else ''
# Comments and doctypes (<!-- ... -->, <!DOCTYPE ...>) never match because
# '!' is not a word character, so they are treated like plain text.
tag = re.compile(r'^\s*<(/?)(\w+).*?(/?)>$', re.DOTALL)
# Number of spaces used for each indentation level.
sw = 2
# Lower-case names of tags that must not open a new indentation level.
# These are the HTML void elements: they never have a closing tag, so
# indenting after them would push the rest of the document to the right
# for good.
noindent = (
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
)

def _split_markup(text):
    '''Return the tags and text runs of *text* as a list of stripped strings.

    Every tag ends up as an item of its own; empty items are dropped.
    '''
    text = newline.sub('\n', text)
    # Break the text around every angle bracket so that each tag sits alone
    # on its line.
    text = text.replace('>', '>\n').replace('<', '\n<')
    # Only '\n' is left as a line separator after the normalisation above;
    # splitting on it explicitly keeps other Unicode separators inside text.
    pieces = (piece.strip() for piece in text.split('\n'))
    return [piece for piece in pieces if piece]


def _indent_lines(lines):
    '''Return *lines* prefixed with padding that reflects tag nesting.

    An opening tag is emitted at the current depth and deepens the lines
    after it; a closing tag steps back one level (never below zero) and is
    emitted at that shallower depth.  Self-closing tags, tags named in
    ``noindent`` and lines that are not a single tag leave the depth alone.
    '''
    indented = []
    depth = 0
    for line in lines:
        line_depth = depth
        match = tag.search(line)
        if match is not None:
            closing, tagname, self_closing = match.groups()
            if not (self_closing or tagname.lower() in noindent):
                if closing:
                    # Unbalanced closing tags must not drive depth negative.
                    depth = max(depth - 1, 0)
                    line_depth = depth
                else:
                    depth += 1
        indented.append(' ' * (sw * line_depth) + line)
    return indented


def clean(filename):
    '''Re-indent the HTML file *filename* and write it to standard output.

    The file is read as bytes.  Where bytes and ``str`` are different types
    (Python 3) the content is decoded as UTF-8, with undecodable bytes
    replaced, because the module's regular expressions are text patterns.
    The output holds one tag or text run per line, indented ``sw`` spaces
    per nesting level, and always ends with a newline.

    Raises whatever ``open`` raises when the file cannot be read.
    '''
    with open(filename, 'rb') as fo:
        data = fo.read()
    if not isinstance(data, str):
        data = data.decode('utf-8', 'replace')
    lines = _indent_lines(_split_markup(data))
    sys.stdout.write('\n'.join(lines) + '\n')


def main():
    '''Clean every file named on the command line.

    No options are defined; each positional argument is taken as a file
    name and passed to ``clean`` in the order given.

    Returns 0, which the caller uses as the process exit status.
    '''
    parser = OptionParser()
    _, filenames = parser.parse_args()
    for path in filenames:
        clean(path)
    return 0


# Run the cleaner only when executed as a script, and hand main()'s return
# value to the interpreter as the exit status.
if __name__ == '__main__':
    raise SystemExit(main())

Mëa Cúlpa's HideOut

Simple HTML cleaner

0 comments:

Post a Comment

Twitter Updates

New Blog

Search This Blog

Flickr Stream

Label Cloud

Blog Archive

Followers

My Blog List

Other links

In Twitter he follows...