首页 > 开发 > Java > 正文

将HTML转化为TEXT的Java类

2024-07-13 09:55:32
字体:
来源:转载
供稿:网友

  为了支持全文检索,有必要将html格式的文章转化为纯文本格式,因此我设计了一个基本的webformatter类,提供一个简单的public static String html2text(String html),将html格式转化为text:

/*
 * file: webformatter.java
 * created on 2005-6-24
 * author: liao xuefeng, [email protected]
 * copyright (c) 2005, liao xuefeng.
 */
package com.mboker.blog.web.util;

import java.text.SimpleDateFormat;
import java.util.*;

/**
 * do some format on web display.
 *
 * @author xuefeng
 */
public class webformatter {

    public static string html2text(string html) {
        stringbuffer sb = new stringbuffer(html.length());
        char[] data = html.tochararray();
        int start = 0;
        boolean previousispre = false;
        token token = null;
        for(;;) {
            token = parse(data, start, previousispre);
            if(token==null)
                break;
            previousispre = token.ispretag();
            sb = sb.append(token.gettext());
            start += token.getlength();
        }
        return sb.tostring();
    }

    private static token parse(char[] data, int start, boolean previousispre) {
        if(start>=data.length)
            return null;
        // try to read next char:
        char c = data[start];
        if(c=='<') {
            // this is a tag or comment or script:
            int end_index = indexof(data, start+1, '>');
            if(end_index==(-1)) {
                // the left is all text!
                return new token(token.token_text, data, start, data.length, previousispre);
            }
            string s = new string(data, start, end_index-start+1);
            // now we got s="<...>":
            if(s.startswith("<!--")) { // this is a comment!
                int end_comment_index = indexof(data, start+1, "-->");
                if(end_comment_index==(-1)) {
                    // illegal end, but treat as comment:
                    return new token(token.token_comment, data, start, data.length, previousispre);
                }
                else
                    return new token(token.token_comment, data, start, end_comment_index+3, previousispre);
            }
            string s_lowercase = s.tolowercase();
            if(s_lowercase.startswith("<script")) { // this is a script:
                int end_script_index = indexof(data, start+1, "</script>");
                if(end_script_index==(-1))
                    // illegal end, but treat as script:
                    return new token(token.token_script, data, start, data.length, previousispre);
                else
                    return new token(token.token_script, data, start, end_script_index+9, previousispre);
            }
            else { // this is a tag:
                return new token(token.token_tag, data, start, start+s.length(), previousispre);
            }
        }
        // this is a text:
        int next_tag_index = indexof(data, start+1, '<');
        if(next_tag_index==(-1))
            return new token(token.token_text, data, start, data.length, previousispre);
        return new token(token.token_text, data, start, next_tag_index, previousispre);
    }

    private static int indexof(char[] data, int start, string s) {
        char[] ss = s.tochararray();
        // todo: performance can improve!
        for(int i=start; i<(data.length-ss.length); i++) {
            // compare from data[i] with ss[0]:
            boolean match = true;
            for(int j=0; j<ss.length; j++) {
                if(data[i+j]!=ss[j]) {
                    match = false;
                    break;
                }
            }
            if(match)
                return i;
        }
        return (-1);
    }

    private static int indexof(char[] data, int start, char c) {
        for(int i=start; i<data.length; i++) {
            if(data[i]==c)
                return i;
        }
        return (-1);
    }

}

/**
 * One lexical piece of an HTML document (text, tag, comment or script)
 * together with its plain-text rendering.
 */
class token {

    public static final int token_text    = 0; // html text.
    public static final int token_comment = 1; // comment like <!-- comments... -->
    public static final int token_tag     = 2; // tag like <pre>, <font>, etc.
    public static final int token_script  = 3; // script element including its body

    // Tag prefixes, lower case, compared with comparetag().
    private static final char[] tag_br  = "<br".toCharArray();
    private static final char[] tag_p   = "<p".toCharArray();
    private static final char[] tag_li  = "<li".toCharArray();
    private static final char[] tag_pre = "<pre".toCharArray();
    private static final char[] tag_hr  = "<hr".toCharArray();

    // Complete end tags, lower case, compared with comparestring().
    private static final char[] end_tag_td = "</td>".toCharArray();
    private static final char[] end_tag_tr = "</tr>".toCharArray();
    private static final char[] end_tag_li = "</li>".toCharArray();

    // Named character references (including '&' and ';') -> replacement text.
    private static final Map special_chars = new HashMap();

    private final int type;
    private final String html;     // original html
    private String text = null;    // plain-text rendering, null means empty
    private final int length;      // html length
    private boolean ispre = false; // is this a <pre> tag?

    static {
        special_chars.put("&quot;", "\"");
        special_chars.put("&lt;",   "<");
        special_chars.put("&gt;",   ">");
        special_chars.put("&amp;",  "&");
        special_chars.put("&reg;",  "(r)");
        special_chars.put("&copy;", "(c)");
        special_chars.put("&nbsp;", " ");
        // The pound sign, written as an escape so that the source file
        // encoding cannot damage it.
        special_chars.put("&pound;", "\u00a3");
    }

    /**
     * Creates a token from {@code data[start]} (inclusive) to
     * {@code data[end]} (exclusive) and computes its plain text.
     *
     * @param type          one of the {@code token_*} constants
     * @param data          the complete input
     * @param start         index of the first character of the token
     * @param end           index just past the last character of the token
     * @param previousispre whether the preceding token was a pre tag; only
     *                      used for text tokens
     */
    public token(int type, char[] data, int start, int end, boolean previousispre) {
        this.type = type;
        this.length = end - start;
        this.html = new String(data, start, length);
        parsetext(previousispre);
    }

    /** Returns the number of characters of the original html this token covers. */
    public int getlength() {
        return length;
    }

    /** Returns whether this token is a pre start tag. */
    public boolean ispretag() {
        return ispre;
    }

    /**
     * Fills in {@code text} and {@code ispre} according to the token type.
     * Comment and script tokens keep a null text and so render as empty.
     */
    private void parsetext(boolean previousispre) {
        if (type == token_tag) {
            char[] cs = html.toCharArray();
            if (comparetag(tag_br, cs) || comparetag(tag_p, cs)) {
                text = "\n";
            } else if (comparetag(tag_li, cs)) {
                text = "\n* ";
            } else if (comparetag(tag_pre, cs)) {
                ispre = true;
            } else if (comparetag(tag_hr, cs)) {
                text = "\n--------\n";
            } else if (comparestring(end_tag_td, cs)) {
                text = "\t";
            } else if (comparestring(end_tag_tr, cs) || comparestring(end_tag_li, cs)) {
                text = "\n";
            }
        } else if (type == token_text) {
            text = totext(html, previousispre);
        }
    }

    /** Returns the plain-text rendering of this token, never {@code null}. */
    public String gettext() {
        return text == null ? "" : text;
    }

    /**
     * Renders a text token as plain text.
     *
     * Outside pre mode, runs of spaces collapse to one space and line
     * breaks are dropped. In pre mode, spaces are kept and every line break
     * (CR LF, CR or LF) becomes a single LF. Named references listed in
     * {@code special_chars} and numeric references (decimal, or hexadecimal
     * with an x prefix) in the range 1..65535 are decoded. Anything else
     * that starts with an ampersand is copied literally.
     *
     * @param html  the raw text of the token
     * @param ispre whether pre mode applies
     * @return the rendered text
     */
    private String totext(String html, final boolean ispre) {
        char[] cs = html.toCharArray();
        StringBuffer buffer = new StringBuffer(cs.length);
        int start = 0;
        boolean continuespace = false;
        while (start < cs.length) {
            char current = cs[start];
            char next = (start + 1 < cs.length) ? cs[start + 1] : '\0';
            if (current == ' ') {
                if (ispre || !continuespace) {
                    buffer.append(' ');
                }
                continuespace = true;
                start++;
                continue;
            }
            if (current == '\r' && next == '\n') {
                if (ispre) {
                    buffer.append('\n');
                }
                start += 2;
                continue;
            }
            if (current == '\n' || current == '\r') {
                if (ispre) {
                    buffer.append('\n');
                }
                start++;
                continue;
            }
            // Any other character ends a run of spaces.
            continuespace = false;
            if (current != '&') {
                buffer.append(current);
                start++;
                continue;
            }
            // Possibly a character reference: look for ';' within 10 chars.
            int length = readutil(cs, start, ';', 10);
            if (length == -1) {
                // Just a bare '&'.
                buffer.append('&');
                start++;
                continue;
            }
            String spec = new String(cs, start, length);
            String specchar = (String) special_chars.get(spec);
            if (specchar != null) {
                buffer.append(specchar);
                start += length;
                continue;
            }
            if (next == '#') {
                // Digits between "&#" and ";".
                String num = new String(cs, start + 2, length - 3);
                try {
                    int code;
                    if (num.startsWith("x") || num.startsWith("X")) {
                        code = Integer.parseInt(num.substring(1), 16);
                    } else {
                        code = Integer.parseInt(num);
                    }
                    if (code > 0 && code < 65536) {
                        buffer.append((char) code);
                        // Skip the whole reference, not just the '&'.
                        start += length;
                        continue;
                    }
                } catch (NumberFormatException ignored) {
                    // Not a number: fall through and copy "&#" literally.
                }
                buffer.append("&#");
                start += 2;
                continue;
            }
            // Unknown named reference: keep the '&' and carry on after it.
            buffer.append('&');
            start++;
        }
        return buffer.toString();
    }

    /**
     * Scans from {@code cs[start]} for the character {@code util}, looking at
     * no more than {@code maxlength} characters and never past the end of
     * {@code cs}.
     *
     * @return the number of characters from {@code start} up to and
     *         including the match, or -1 if it was not found
     */
    private int readutil(final char[] cs, final int start, final char util, final int maxlength) {
        int end = start + maxlength;
        if (end > cs.length) {
            end = cs.length;
        }
        for (int i = start; i < end; i++) {
            if (cs[i] == util) {
                return i - start + 1;
            }
        }
        return -1;
    }

    /**
     * Tests whether {@code tag} (for example "<input value=aa>") is the tag
     * named by the lower-case prefix {@code ori_tag} (for example "<input").
     * The comparison ignores case, and the character following the prefix
     * must not be a letter a-z, so that "<p" does not match "<pre>".
     */
    private boolean comparetag(final char[] ori_tag, char[] tag) {
        if (ori_tag.length >= tag.length) {
            return false;
        }
        for (int i = 0; i < ori_tag.length; i++) {
            if (Character.toLowerCase(tag[i]) != ori_tag[i]) {
                return false;
            }
        }
        // The length check above guarantees this index exists.
        char c = Character.toLowerCase(tag[ori_tag.length]);
        return c < 'a' || c > 'z';
    }

    /**
     * Tests whether {@code comp} starts with the lower-case text
     * {@code ori}, ignoring the case of {@code comp}.
     */
    private boolean comparestring(final char[] ori, char[] comp) {
        if (ori.length > comp.length) {
            return false;
        }
        for (int i = 0; i < ori.length; i++) {
            if (Character.toLowerCase(comp[i]) != ori[i]) {
                return false;
            }
        }
        return true;
    }

    /** Returns the original html of this token. */
    public String toString() {
        return html;
    }
}

  注意,请先将html中的<body>...</body>部分提取出来,再交给webformatter处理,因为html->text转换实质是删除所有标签(某些标签如<br>被转化为'\n')、script和注释,对于javascript生成的动态内容(例如document.write)无能为力。

,欢迎访问网页设计爱好者web开发。
发表评论 共有条评论
用户名: 密码:
验证码: 匿名发表