为了支持全文检索,有必要将 HTML 格式的文章转化为纯文本格式。因此我设计了一个基本的 webformatter 类,提供一个简单的 public static String html2text(String html) 方法,将 HTML 格式转化为纯文本:
/*
* file: webformatter.java
* created on 2005-6-24
* author: liao xuefeng, [email protected]
* copyright (c) 2005, liao xuefeng.
*/
package com.mboker.blog.web.util;
import java.util.*;
import java.text.simpledateformat;
/**
* do some format on web display.
*
* @author xuefeng
*/
/**
 * Converts HTML markup to plain text by splitting the input into tokens
 * (text, tag, comment, script) and concatenating the text each token yields.
 * <p>
 * Comments and script blocks contribute no text; the text produced for tags
 * and text runs is decided by the {@code token} class.
 */
public class webformatter {

    /** Terminator of an HTML comment. */
    private static final String COMMENT_END = "-->";

    /** Closing tag of a script block; must stay lowercase (matched ignoring case). */
    private static final String SCRIPT_END = "</script>";

    /**
     * Converts an HTML fragment to plain text.
     *
     * @param html the HTML to convert; {@code null} is treated as empty input
     * @return the concatenated text of all tokens, never {@code null}
     */
    public static String html2text(String html) {
        if (html == null) {
            return "";
        }
        StringBuffer out = new StringBuffer(html.length());
        char[] data = html.toCharArray();
        int pos = 0;
        boolean previousIsPre = false;
        for (;;) {
            token current = parse(data, pos, previousIsPre);
            if (current == null) {
                break;
            }
            // Only the token that directly follows a <pre> tag is treated as preformatted.
            previousIsPre = current.ispretag();
            out.append(current.gettext());
            pos += current.getlength();
        }
        return out.toString();
    }

    /**
     * Reads the token that begins at {@code start}.
     *
     * @param data          the complete HTML as characters
     * @param start         index of the first character of the token
     * @param previousispre whether the preceding token was a {@code <pre>} tag
     * @return the next token, or {@code null} when {@code start} is at or past the end
     */
    private static token parse(char[] data, int start, boolean previousispre) {
        if (start >= data.length) {
            return null;
        }
        if (data[start] != '<') {
            // Plain text: runs up to the next '<' or to the end of input.
            int nextTag = indexof(data, start + 1, '<');
            int textEnd = (nextTag == -1) ? data.length : nextTag;
            return new token(token.token_text, data, start, textEnd, previousispre);
        }

        int close = indexof(data, start + 1, '>');
        if (close == -1) {
            // A '<' that is never closed: everything left is text.
            return new token(token.token_text, data, start, data.length, previousispre);
        }
        String tag = new String(data, start, close - start + 1); // "<...>"

        if (tag.startsWith("<!--")) {
            int commentEnd = indexof(data, start + 1, COMMENT_END, false);
            // An unterminated comment swallows the rest of the input.
            int end = (commentEnd == -1) ? data.length : commentEnd + COMMENT_END.length();
            return new token(token.token_comment, data, start, end, previousispre);
        }

        // regionMatches(ignoreCase) compares per character and does not depend on
        // the default locale, unlike toLowerCase().
        if (tag.regionMatches(true, 0, "<script", 0, 7)) {
            // The closing tag is matched ignoring case so that </SCRIPT> ends the block too.
            int scriptEnd = indexof(data, start + 1, SCRIPT_END, true);
            // An unterminated script swallows the rest of the input.
            int end = (scriptEnd == -1) ? data.length : scriptEnd + SCRIPT_END.length();
            return new token(token.token_script, data, start, end, previousispre);
        }

        return new token(token.token_tag, data, start, close + 1, previousispre);
    }

    /**
     * Finds the first occurrence of {@code s} in {@code data} at or after {@code start}.
     *
     * @param data       characters to search
     * @param start      first index to consider
     * @param s          text to look for; must be lowercase when {@code ignorecase} is set
     * @param ignorecase whether characters of {@code data} are lowercased before comparing
     * @return index of the first match, or -1 when there is none
     */
    private static int indexof(char[] data, int start, String s, boolean ignorecase) {
        char[] pattern = s.toCharArray();
        // Inclusive bound: a match may end exactly at the last character of data.
        int lastStart = data.length - pattern.length;
        for (int i = start; i <= lastStart; i++) {
            if (matchesat(data, i, pattern, ignorecase)) {
                return i;
            }
        }
        return -1;
    }

    /** Tells whether {@code pattern} occurs in {@code data} starting at {@code offset}. */
    private static boolean matchesat(char[] data, int offset, char[] pattern, boolean ignorecase) {
        for (int j = 0; j < pattern.length; j++) {
            char actual = ignorecase ? Character.toLowerCase(data[offset + j]) : data[offset + j];
            if (actual != pattern[j]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Finds the first occurrence of {@code c} in {@code data} at or after {@code start}.
     *
     * @return index of the character, or -1 when it does not occur
     */
    private static int indexof(char[] data, int start, char c) {
        for (int i = start; i < data.length; i++) {
            if (data[i] == c) {
                return i;
            }
        }
        return -1;
    }
}
/**
 * One lexical unit of an HTML document: a run of text, a tag, a comment or a
 * script block. The plain-text rendering is computed once in the constructor.
 * <p>
 * Comments and scripts render as the empty string. A few tags render as
 * layout characters (line break, bullet, tab, rule); all other tags render as
 * the empty string. Text runs have entities decoded and, unless they follow a
 * {@code <pre>} tag, repeated spaces collapsed and line breaks dropped.
 */
class token {
    public static final int token_text = 0; // html text.
    public static final int token_comment = 1; // comment like <!-- comments... -->
    public static final int token_tag = 2; // tag like <pre>, <font>, etc.
    public static final int token_script = 3; // script block, rendered as empty text

    private static final char[] tag_br = "<br".toCharArray();
    private static final char[] tag_p = "<p".toCharArray();
    private static final char[] tag_li = "<li".toCharArray();
    private static final char[] tag_pre = "<pre".toCharArray();
    private static final char[] tag_hr = "<hr".toCharArray();
    private static final char[] end_tag_td = "</td>".toCharArray();
    private static final char[] end_tag_tr = "</tr>".toCharArray();
    private static final char[] end_tag_li = "</li>".toCharArray();

    /** Named entity (including '&' and ';') mapped to its replacement text. */
    private static final Map special_chars = new HashMap();

    private int type;
    private String html; // original html
    private String text = null; // rendered text, null means "nothing"
    private int length = 0; // html length
    private boolean ispre = false; // is this token a <pre> tag?

    static {
        special_chars.put("&quot;", "\"");
        special_chars.put("&lt;", "<");
        special_chars.put("&gt;", ">");
        special_chars.put("&amp;", "&");
        special_chars.put("&reg;", "(r)");
        special_chars.put("&copy;", "(c)");
        special_chars.put("&nbsp;", " ");
        // Pound sign. NOTE(review): the listing showed "?" here, presumably an
        // encoding loss of the real character - confirm against the original file.
        special_chars.put("&pound;", "\u00A3");
    }

    /**
     * Creates a token covering {@code data[start..end)} and renders its text.
     *
     * @param type          one of the {@code token_*} constants
     * @param data          the complete HTML as characters
     * @param start         index of the first character of the token
     * @param end           index one past the last character of the token
     * @param previousispre whether the preceding token was a {@code <pre>} tag;
     *                      only affects text tokens
     */
    public token(int type, char[] data, int start, int end, boolean previousispre) {
        this.type = type;
        this.length = end - start;
        this.html = new String(data, start, length);
        parsetext(previousispre);
    }

    /** Returns the number of characters of HTML this token covers. */
    public int getlength() {
        return length;
    }

    /** Returns whether this token is a {@code <pre>} tag. */
    public boolean ispretag() {
        return ispre;
    }

    /** Computes {@code text} (and {@code ispre}) from the token type and raw HTML. */
    private void parsetext(boolean previousispre) {
        if (type == token_tag) {
            char[] cs = html.toCharArray();
            if (comparetag(tag_br, cs) || comparetag(tag_p, cs)) {
                text = "\n";
            } else if (comparetag(tag_li, cs)) {
                text = "\n* ";
            } else if (comparetag(tag_pre, cs)) {
                ispre = true;
            } else if (comparetag(tag_hr, cs)) {
                text = "\n--------\n";
            } else if (comparestring(end_tag_td, cs)) {
                text = "\t";
            } else if (comparestring(end_tag_tr, cs) || comparestring(end_tag_li, cs)) {
                text = "\n";
            }
        } else if (type == token_text) {
            text = totext(html, previousispre);
        }
        // Comments and scripts keep text == null and therefore render as "".
    }

    /** Returns the plain-text rendering of this token, never {@code null}. */
    public String gettext() {
        return text == null ? "" : text;
    }

    /**
     * Renders a run of HTML text as plain text.
     *
     * @param html         the raw text between tags
     * @param preformatted when true, spaces and line breaks are kept; otherwise
     *                     runs of spaces collapse to one and line breaks are dropped
     * @return the decoded text
     */
    private String totext(String html, final boolean preformatted) {
        char[] cs = html.toCharArray();
        StringBuffer buffer = new StringBuffer(cs.length);
        int pos = 0;
        boolean inSpaceRun = false;
        while (pos < cs.length) {
            char current = cs[pos];
            char next = (pos + 1 < cs.length) ? cs[pos + 1] : '\0';

            if (current == ' ') {
                if (preformatted || !inSpaceRun) {
                    buffer.append(' ');
                }
                inSpaceRun = true;
                pos++;
                continue;
            }
            if (current == '\r' || current == '\n') {
                if (preformatted) {
                    buffer.append('\n');
                }
                // "\r\n" counts as a single line break.
                pos += (current == '\r' && next == '\n') ? 2 : 1;
                continue;
            }

            inSpaceRun = false;
            if (current != '&') {
                buffer.append(current);
                pos++;
                continue;
            }

            // Possibly an entity: look for the closing ';' within 10 characters.
            int entityLength = readutil(cs, pos, ';', 10);
            if (entityLength == -1) {
                buffer.append('&');
                pos++;
                continue;
            }
            String entity = new String(cs, pos, entityLength);
            String replacement = (String) special_chars.get(entity);
            if (replacement != null) {
                buffer.append(replacement);
                pos += entityLength;
                continue;
            }
            if (next == '#') {
                // Numeric reference: the digits sit between "&#" and ";".
                int code = parsecharcode(new String(cs, pos + 2, entityLength - 3));
                if (code > 0 && code < 65536) {
                    buffer.append((char) code);
                    // Skip the whole reference, not just the '&'.
                    pos += entityLength;
                    continue;
                }
                // Not a valid number: emit "&#" literally and carry on after it.
                buffer.append("&#");
                pos += 2;
                continue;
            }
            // Unknown named entity: keep the '&' and process the rest as text.
            buffer.append('&');
            pos++;
        }
        return buffer.toString();
    }

    /**
     * Parses the body of a numeric character reference, decimal ("65") or
     * hexadecimal ("x41" / "X41").
     *
     * @return the code, or -1 when {@code digits} is not a valid number
     */
    private static int parsecharcode(String digits) {
        try {
            if (digits.length() > 1 && (digits.charAt(0) == 'x' || digits.charAt(0) == 'X')) {
                return Integer.parseInt(digits.substring(1), 16);
            }
            return Integer.parseInt(digits);
        } catch (NumberFormatException ignored) {
            // Malformed reference; the caller falls back to literal output.
            return -1;
        }
    }

    /**
     * Scans forward from {@code cs[start]} for the character {@code util},
     * looking at no more than {@code maxlength} characters and never past the
     * end of the array.
     *
     * @return the number of characters from {@code start} up to and including
     *         {@code util}, or -1 when it is not found in range
     */
    private int readutil(final char[] cs, final int start, final char util, final int maxlength) {
        int end = start + maxlength;
        if (end > cs.length) {
            end = cs.length;
        }
        for (int i = start; i < end; i++) {
            if (cs[i] == util) {
                return i - start + 1;
            }
        }
        return -1;
    }

    /**
     * Tells whether {@code tag} (e.g. "&lt;input value=aa&gt;") opens with the
     * lowercase tag prefix {@code ori_tag} (e.g. "&lt;input"), ignoring case,
     * and the prefix is not merely the start of a longer tag name.
     */
    private boolean comparetag(final char[] ori_tag, char[] tag) {
        if (ori_tag.length >= tag.length) {
            return false;
        }
        for (int i = 0; i < ori_tag.length; i++) {
            if (Character.toLowerCase(tag[i]) != ori_tag[i]) {
                return false;
            }
        }
        // A letter right after the prefix means a different tag ("<p" vs "<pre").
        char following = Character.toLowerCase(tag[ori_tag.length]);
        return following < 'a' || following > 'z';
    }

    /** Tells whether {@code comp} starts with the lowercase text {@code ori}, ignoring case. */
    private boolean comparestring(final char[] ori, char[] comp) {
        if (ori.length > comp.length) {
            return false;
        }
        for (int i = 0; i < ori.length; i++) {
            if (Character.toLowerCase(comp[i]) != ori[i]) {
                return false;
            }
        }
        return true;
    }

    /** Returns the original HTML this token was built from. */
    public String toString() {
        return html;
    }
}
注意:请先将 HTML 中的 <body>...</body> 部分提取出来,再交给 webformatter 处理。因为 HTML -> text 转换实质上是删除所有标签(某些标签如 <br> 被转化为 '\n')、script 和注释,对于 JavaScript 生成的动态内容(例如 document.write)无能为力。
,欢迎访问网页设计爱好者web开发。新闻热点
疑难解答
图片精选