def remove_js_css(content):
    """Strip non-visible markup from an HTML string.

    Removes, in this order: ``<script>...</script>`` blocks,
    ``<style>...</style>`` blocks, ``<!-- ... -->`` comments, ``<meta ...>``
    tags and ``<ins>...</ins>`` blocks. Matching is case-insensitive and
    non-greedy, so each match ends at the first closing delimiter.

    Args:
        content: HTML text to clean.

    Returns:
        The text with every match deleted.
    """
    patterns = (
        r'<script.*?</script>',
        r'<style.*?</style>',
        r'<!--.*?-->',
        r'<meta.*?>',
        r'<ins.*?</ins>',
    )
    # re.S lets "." cross line breaks so multi-line blocks are removed whole.
    # (No pattern uses ^ or $, so re.MULTILINE would have no effect here.)
    for pattern in patterns:
        content = re.sub(pattern, '', content, flags=re.I | re.S)
    return content
def remove_empty_line(content):
    """Delete whitespace-only lines and collapse runs of newlines.

    First the content of every line made up solely of whitespace is removed,
    then each run of consecutive newlines is replaced by a single newline.
    Leading and trailing newlines are collapsed but not stripped.

    Args:
        content: Text to compact.

    Returns:
        The compacted text.
    """
    # re.M anchors ^ and $ at every line, not only at the string boundaries.
    content = re.sub(r'^\s+$', '', content, flags=re.M)
    return re.sub(r'\n+', '\n', content)
def remove_any_tag(s):
    """Delete every ``<...>`` tag from *s* and strip surrounding whitespace.

    Args:
        s: HTML text.

    Returns:
        The text with all tags removed and leading/trailing whitespace
        stripped.
    """
    return re.sub(r'<[^>]+>', '', s).strip()


def remove_any_tag_but_a(s):
    """Measure link text against total text in an HTML fragment.

    Despite the name, nothing is removed: the function only returns two
    lengths.

    Args:
        s: HTML text.

    Returns:
        A ``(link_len, text_len)`` tuple. ``link_len`` is the combined length
        of the inner content of all ``<a>...</a>`` elements; nested markup
        inside a link is counted as-is. ``text_len`` is the length of *s*
        after ``remove_any_tag``.
    """
    # The opening tag must be exactly "<a>" or "<a" followed by whitespace and
    # attributes. The previous pattern "<a[^r][^>]*>" consumed the ">" of a
    # bare "<a>" (missing that link) and also matched tags such as <abbr>,
    # <address> or <audio>.
    link_bodies = re.findall(r'<a(?:\s[^>]*)?>(.*?)</a>', s, re.I | re.S)
    plain_text = remove_any_tag(s)
    return len(''.join(link_bodies)), len(plain_text)


def remove_image(s, n=50):
    """Replace every ``<img ...>`` tag with a placeholder of *n* ``'a'`` chars.

    Args:
        s: HTML text.
        n: Length of the placeholder substituted for each image tag.

    Returns:
        The text with each image tag replaced by ``'a' * n``.
    """
    placeholder = 'a' * n
    return re.sub(r'<img.*?>', placeholder, s, flags=re.I | re.S)


def remove_video(s, n=1000):
    """Replace every ``<embed ...>`` tag with a placeholder of *n* ``'a'`` chars.

    Args:
        s: HTML text.
        n: Length of the placeholder substituted for each embed tag.

    Returns:
        The text with each embed tag replaced by ``'a' * n``.
    """
    placeholder = 'a' * n
    return re.sub(r'<embed.*?>', placeholder, s, flags=re.I | re.S)
# Original article: http://www.cnblogs.com/lizunicon/p/3516561.html