package com.mzsx.xss;

import java.util.HashMap;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A whitelist filter that removes unwanted elements and attributes from an
 * HTML fragment.
 *
 * <p>The input is scanned one character at a time by a small state machine.
 * Elements that are not configured are dropped (their text content is kept),
 * attributes that are not configured for their element are dropped, and the
 * values of URL-carrying attributes ({@code href}, {@code src}) are blanked
 * unless their protocol is on the allowed list. Every emitted attribute value
 * is written between double quotes and escaped so that it cannot terminate
 * the attribute.
 *
 * <p>Known limitations of the scanner:
 * <ul>
 * <li>an unquoted attribute value ends at the first {@code /};</li>
 * <li>only numeric character references are decoded in URL attributes, so a
 * named reference such as {@code &amp;amp;} is escaped a second time;</li>
 * <li>a {@code >} always ends the tag, even inside a quoted value.</li>
 * </ul>
 *
 * <p>An instance keeps its parsing state in fields, so a single instance must
 * not run {@link #filter(String)} from several threads at the same time.
 *
 * @version 0.2.20120103
 * @author Zhiji Gu &lt;gu_zhiji@163.com&gt;
 * @copyright (c) 2010-2012 InterBox Core 1.2 for Java, GuZhiji Studio
 */
public final class HTMLFilter {

    /**
     * Numeric character references: {@code &#58;}, {@code &#0000058},
     * {@code &#x3a;}, {@code &#X3A}. The trailing semicolon is optional
     * because browsers accept the references without it.
     * Group 1 holds hexadecimal digits, group 2 decimal digits.
     */
    private static final Pattern NUMERIC_ENTITY =
            Pattern.compile("&#(?:[xX]([0-9a-fA-F]+)|([0-9]+));?");

    /**
     * Temporary storage for the fragment being read (tag name, attribute
     * name or attribute value); cleared once it has been consumed.
     */
    private StringBuilder buffer;

    /** Accumulates the filtered output. */
    private StringBuilder result;

    /** True while an opening tag is being processed. */
    private boolean onOpenTag;

    /**
     * True while a closing tag, or the closing slash of an empty element,
     * is being processed; no attributes are accepted in this state.
     */
    private boolean onCloseTag;

    /** True while the attributes of an element are being processed. */
    private boolean onAtt;

    /** True while the value of an attribute is being processed. */
    private boolean onAttValue;

    /**
     * True when the previous text character was whitespace, so that runs of
     * whitespace collapse to a single space.
     */
    private boolean wasSpace;

    /**
     * Quotation style detected for the attribute value being read.
     * <ul>
     * <li>0 - {@code xxx=xxxx}, or unknown while nothing is buffered</li>
     * <li>1 - {@code xxx='xxxx'}</li>
     * <li>2 - {@code xxx="xxxx"}</li>
     * </ul>
     */
    private int quotType;

    /** Name of the current attribute; empty when it is not allowed. */
    private String attName;

    /** Name of the current element; empty when it is not allowed. */
    private String tagName;

    /** Allowed elements, each mapped to its allowed attributes. */
    private final HashMap<String, HashMap<String, Boolean>> html_config;

    /** Attributes whose values carry a URL and therefore a protocol. */
    private final HashMap<String, Boolean> protocol_att;

    /** Allowed URL protocols. */
    private final HashMap<String, Boolean> protocol_list;

    /**
     * Creates a filter with the default whitelist of elements, attributes
     * and protocols.
     */
    public HTMLFilter() {
        html_config = new HashMap<String, HashMap<String, Boolean>>();

        html_config.put("a", allow("href", "target", "title"));
        html_config.put("img", allow("src", "width", "height", "border", "alt", "title"));
        html_config.put("table", allow("border", "width", "height"));
        html_config.put("font", allow("face", "size", "color"));

        // elements that are allowed but carry no attributes
        String[] plainElements = {
            "th", "tr", "td", "br", "p", "b", "i", "strong", "em",
            "h1", "h2", "h3", "h4", "h5", "h6"
        };
        for (String element : plainElements) {
            html_config.put(element, allow());
        }

        protocol_att = allow("href", "src");
        protocol_list = allow("http", "https", "ftp", "mailto");
    }

    /**
     * Builds a lookup map whose keys are the given names.
     *
     * @param names keys to register
     * @return a new map with every name mapped to {@code true}
     */
    private static HashMap<String, Boolean> allow(String... names) {
        HashMap<String, Boolean> allowed = new HashMap<String, Boolean>();
        for (String name : names) {
            allowed.put(name, Boolean.TRUE);
        }
        return allowed;
    }

    /**
     * Replaces numeric character references with the characters they
     * represent.
     *
     * @param html text that may contain numeric character references
     * @return the decoded text, or an empty string when a reference cannot
     *         be converted (number out of range); the caller then emits an
     *         empty attribute value instead of a suspicious one
     */
    private String decodeHTMLEntities(String html) {
        try {
            StringBuffer decoded = new StringBuffer();
            Matcher m = NUMERIC_ENTITY.matcher(html);
            while (m.find()) {
                String hexDigits = m.group(1);
                int codePoint = (hexDigits != null)
                        ? Integer.parseInt(hexDigits, 16)
                        : Integer.parseInt(m.group(2), 10);
                String character = new String(Character.toChars(codePoint));
                // quote the replacement: a decoded '$' or '\' would otherwise
                // be interpreted as a group reference or an escape
                m.appendReplacement(decoded, Matcher.quoteReplacement(character));
            }
            m.appendTail(decoded);
            return decoded.toString();
        } catch (IllegalArgumentException malformed) {
            // covers NumberFormatException (overflow) and invalid code points
            return "";
        }
    }

    /**
     * Strips surrounding whitespace and then at most one occurrence of the
     * given character from each end.
     *
     * @param value string to be trimmed
     * @param ch character to remove if it exists on either side of value
     * @return the trimmed string
     */
    private String trim(String value, char ch) {
        String trimmed = value.trim();
        int start = 0;
        int end = trimmed.length();
        if (end > 0 && trimmed.charAt(0) == ch) {
            start = 1;
        }
        // "end > start" keeps a value consisting of a single ch from
        // producing an inverted range
        if (end > start && trimmed.charAt(end - 1) == ch) {
            end--;
        }
        return trimmed.substring(start, end);
    }

    /**
     * Escapes the characters that could end a double-quoted attribute value
     * or open a tag.
     *
     * @param value raw attribute value
     * @param escapeAmpersand whether {@code &} is escaped as well; used for
     *        values whose numeric references have already been decoded
     * @return the escaped value
     */
    private static String escapeAttValue(String value, boolean escapeAmpersand) {
        String escaped = value;
        if (escapeAmpersand) {
            escaped = escaped.replace("&", "&amp;");
        }
        return escaped.replace("\"", "&quot;")
                .replace("<", "&lt;")
                .replace(">", "&gt;");
    }

    /**
     * Appends the buffered attribute value to the result.
     *
     * When onOpenTag is false (the end of the tag has been reached) the tag
     * is closed with ">"; when both onOpenTag and onCloseTag are true (empty
     * element) " /" is appended.
     */
    private void appendAttValue() {
        if (!tagName.isEmpty()) { // the element is allowed
            if (!attName.isEmpty()) { // the attribute is allowed
                String value = buffer.toString().trim();
                if (quotType == 2) {
                    value = trim(value, '"');
                } else if (quotType == 1) {
                    value = trim(value, '\'');
                }

                if (protocol_att.containsKey(attName)) {
                    // decode first so an encoded ':' cannot hide the protocol
                    value = decodeHTMLEntities(value);
                    int pos = value.indexOf(':');
                    if (pos > 0) {
                        String protocol = value.substring(0, pos).toLowerCase(Locale.ROOT);
                        if (!protocol_list.containsKey(protocol)) {
                            value = "";
                        }
                    }
                    value = escapeAttValue(value, true);
                } else {
                    // '&' is left alone so references already present in the
                    // value are not escaped twice
                    value = escapeAttValue(value, false);
                }

                result.append('"').append(value).append('"');
            }

            // the element is allowed, so close the tag if necessary
            if (!onOpenTag) {
                result.append('>');
                onAtt = false;
            } else if (onCloseTag) {
                // closed by half; the final ">" is appended when it is read
                result.append(" /");
                onAtt = false;
            }
        } else if (!onOpenTag || onCloseTag) {
            // disallowed element: nothing is written, only the state changes
            onAtt = false;
        }

        buffer.delete(0, buffer.length());
        onAttValue = false;
        quotType = 0;
    }

    /**
     * Appends the buffered attribute name to the result.
     *
     * When the tag ends, or is closed by half, before a value has been read,
     * the attribute name is used as its own value, e.g.
     * {@code <img alt />} becomes {@code <img alt="alt" />}.
     * Otherwise onAttValue is set so that the value is read next.
     */
    private void appendAttName() {
        if (!tagName.isEmpty()) { // the element is allowed
            String aname = buffer.toString().trim().toLowerCase(Locale.ROOT);

            if (html_config.get(tagName).containsKey(aname)) {
                attName = aname;
                result.append(' ').append(aname).append('=');
                if (!onOpenTag || onCloseTag) {
                    result.append('"').append(aname).append('"');
                }
            } else {
                attName = "";
            }

            if (!onOpenTag) { // close the opening tag
                result.append('>');
                onAttValue = false;
                onAtt = false;
            } else if (onCloseTag) { // close the opening tag by half
                result.append(" /");
                onAttValue = false;
                onAtt = false;
            } else { // start reading the attribute value
                onAttValue = true;
            }
        } else if (!onOpenTag || onCloseTag) {
            // disallowed element whose tag has ended: leave attribute mode,
            // otherwise the next tag would be read as attributes
            onAttValue = false;
            onAtt = false;
        }
        buffer.delete(0, buffer.length());
    }

    /**
     * Appends the buffered tag name to the result.
     *
     * When onOpenTag is false the tag is closed with ">"; when both
     * onOpenTag and onCloseTag are true " /" is appended; otherwise onAtt is
     * set so that attributes are read next.
     */
    private void appendTagName() {
        String tname = buffer.toString().trim().toLowerCase(Locale.ROOT);

        if (html_config.containsKey(tname)) {
            tagName = tname;
            result.append('<').append(tname);

            if (!onOpenTag) { // close the opening tag
                result.append('>');
            } else if (onCloseTag) { // close the opening tag by half
                result.append(" /");
            } else { // start reading attributes
                onAtt = true;
            }
        } else {
            // the element is not allowed
            tagName = "";
            if (onOpenTag && !onCloseTag) {
                // attributes are still read, but only to keep the state
                // machine in step; nothing is appended for them
                onAtt = true;
            }
        }
        buffer.delete(0, buffer.length());
    }

    /**
     * Checks whether the buffer holds nothing but whitespace.
     *
     * @return true when the buffer is empty or blank
     */
    private boolean isBufferEmpty() {
        return buffer.toString().trim().isEmpty();
    }

    /**
     * Handles one character while an attribute value is being read.
     *
     * @param current character being processed
     */
    private void processOnAttValue(char current) {
        switch (current) {
            case ' ':
            case '\t':
            case '\r':
            case '\n':
                if (quotType == 0 && !isBufferEmpty()) {
                    // whitespace ends an unquoted value: xxx=xxxx xxx=xxxx
                    appendAttValue();
                } else {
                    // inside quotes, or before the value has started
                    buffer.append(current);
                }
                break;
            case '>':
                // strict: ends the value and the tag, even inside quotes
                onOpenTag = false;
                appendAttValue();
                break;
            case '\'':
                if (quotType == 1) {
                    // closing quote of 'xxxx'
                    appendAttValue();
                } else if (quotType == 0 && isBufferEmpty()) {
                    // opening quote of 'xxxx'
                    quotType = 1;
                } else {
                    // "xx'xx"
                    buffer.append(current);
                }
                break;
            case '"':
                if (quotType == 2) {
                    // closing quote of "xxxx"
                    appendAttValue();
                } else if (quotType == 0 && isBufferEmpty()) {
                    // opening quote of "xxxx"
                    quotType = 2;
                } else {
                    // 'xx"xx'
                    buffer.append(current);
                }
                break;
            case '/':
                if (quotType > 0) {
                    // xxx="xx/x"
                    buffer.append(current);
                } else {
                    // xxx=xxxx/>
                    onCloseTag = true;
                    appendAttValue();
                }
                break;
            default:
                buffer.append(current);
        }
    }

    /**
     * Handles one character while a closing tag, or the tail of an empty
     * element, is being read.
     *
     * @param current character being processed
     */
    private void processOnCloseTag(char current) {
        if (current == '>') {
            if (onOpenTag) {
                // <xxx />
                if (!tagName.isEmpty()) {
                    result.append('>');
                }
                onOpenTag = false;
            } else {
                // </xxx>
                String closingName = buffer.toString().trim().toLowerCase(Locale.ROOT);
                if (html_config.containsKey(closingName)) {
                    result.append("</").append(closingName).append(">");
                }
                buffer.delete(0, buffer.length());
            }
            onCloseTag = false;
        } else if (!onOpenTag) {
            // reading the name of </xxx>; skip characters that cannot be
            // part of a tag name
            switch (current) {
                case '/':
                case '"':
                case '<':
                case '\r':
                case '\n':
                case '\t':
                case ' ':
                    break;
                default:
                    buffer.append(current);
            }
        }
        // else: between "/" and ">" of an empty element, nothing to do
    }

    /**
     * Handles one character while attribute names are being read.
     *
     * @param current character being processed
     */
    private void processRestAtt(char current) {
        switch (current) {
            case ' ':
            case '\r':
            case '\n':
            case '\t':
                if (!isBufferEmpty()) {
                    // whitespace after an attribute name
                    appendAttName();
                }
                break;
            case '>':
                onOpenTag = false;
                if (!isBufferEmpty()) {
                    // <xxx xxx> gets a default value: <xxx xxx="xxx">
                    appendAttName();
                } else {
                    // <xxx >
                    onAtt = false;
                    if (!tagName.isEmpty()) {
                        result.append('>');
                    }
                }
                break;
            case '=':
                // the attribute value follows
                onAttValue = true;
                appendAttName();
                break;
            case '/':
                onCloseTag = true;
                if (!isBufferEmpty()) {
                    // <xxx xxx/>
                    appendAttName();
                } else {
                    // <xxx />
                    onAtt = false;
                }
                break;
            default:
                buffer.append(current);
        }
    }

    /**
     * Handles one character while the name of an opening tag is being read.
     *
     * @param current character being processed
     */
    private void processRestOpenTag(char current) {
        switch (current) {
            case ' ':
            case '\r':
            case '\n':
            case '\t':
                if (!isBufferEmpty()) {
                    // whitespace after the tag name
                    appendTagName();
                }
                // else: whitespace between "<" and the name is ignored
                break;
            case '/':
                onCloseTag = true;
                if (isBufferEmpty()) {
                    // </xxx>
                    onOpenTag = false;
                } else {
                    // <xxx/>
                    appendTagName();
                }
                break;
            case '>':
                onOpenTag = false;
                if (!isBufferEmpty()) {
                    // <xxx>
                    appendTagName();
                }
                // else: "<>" produces nothing
                break;
            default:
                buffer.append(current);
        }
    }

    /**
     * Handles one character of text outside any tag.
     *
     * @param current character being processed
     */
    private void processRest(char current) {
        switch (current) {
            case ' ':
            case '\r':
            case '\n':
            case '\t':
                if (!wasSpace) {
                    result.append(' ');
                    wasSpace = true;
                }
                break;
            case '<':
                onOpenTag = true;
                wasSpace = false;
                break;
            default:
                result.append(current);
                wasSpace = false;
        }
    }

    /**
     * Routes one character to the handler for the current state.
     *
     * @param ch character being processed
     */
    private void dispatch(char ch) {
        if (onCloseTag) {
            // </xxx> or .../>
            processOnCloseTag(ch);
        } else if (onOpenTag) {
            if (onAttValue) {
                processOnAttValue(ch);
            } else if (onAtt) {
                processRestAtt(ch);
            } else {
                processRestOpenTag(ch);
            }
        } else {
            processRest(ch);
        }
    }

    /**
     * Filters the input HTML.
     *
     * @param html HTML to be processed; {@code null} is treated as empty
     * @return the filtered HTML, never {@code null}
     */
    public String filter(String html) {
        if (html == null) {
            return "";
        }

        buffer = new StringBuilder();
        result = new StringBuilder();
        onOpenTag = false;
        onCloseTag = false;
        onAtt = false;
        onAttValue = false;
        wasSpace = false;
        quotType = 0;
        attName = "";
        tagName = "";

        int length = html.length();
        for (int i = 0; i < length; i++) {
            dispatch(html.charAt(i));
        }

        if (onOpenTag || onCloseTag) {
            // the input ended inside a tag: terminate it so the output does
            // not end in dangling markup such as <a title=
            dispatch('>');
        }
        return result.toString();
    }
}
// Original article: http://qiangmzsx.blog.51cto.com/2052549/1359630