import java.io.*;
import java.lang.String;

/**
 * A {@link StreamTokenizer} that splits an HTML stream on {@code <} and
 * {@code >} and classifies what lies between them.
 *
 * <p>Only the tag kinds needed by this example are recognised; add further
 * {@code HTML_*} constants and checks as required.
 */
class HtmlTokenizer extends StreamTokenizer {
    // Result codes returned by nextHtml().
    static final int HTML_TEXT = -1;        // non-blank text found outside a tag
    static final int HTML_UNKNOWN = -2;     // tag content that matches none of the kinds below
    static final int HTML_EOF = -3;         // end of input (or an I/O error)
    static final int HTML_IMAGE = -4;       // tag content containing "IMG"
    static final int HTML_FRAME = -5;       // tag content containing "FRAME"
    static final int HTML_BACKGROUND = -6;  // tag content containing "BACKGROUND"
    static final int HTML_APPLET = -7;      // tag content containing "APPLET"

    /** True while the tokenizer is positioned outside of a {@code <...>} pair. */
    boolean outsideTag = true;

    /**
     * Creates a tokenizer over {@code r} and installs its syntax table:
     * the table is reset, characters 0..255 become word characters, and
     * {@code <} and {@code >} become ordinary (single-character) tokens.
     *
     * @param r the reader supplying the HTML text
     */
    public HtmlTokenizer(BufferedReader r) {
        super(r);
        this.resetSyntax();
        this.wordChars(0, 255);
        this.ordinaryChar('<');
        this.ordinaryChar('>');
    }

    /**
     * Reads tokens until one can be classified and returns its code.
     *
     * <p>{@code <} and {@code >} only toggle {@link #outsideTag}, and words
     * consisting solely of whitespace are skipped. After a word has been
     * classified, {@code sval} holds its text.
     *
     * @return one of the {@code HTML_*} constants; {@code HTML_EOF} is returned
     *         at end of input and also when reading fails, so that a caller
     *         looping until {@code HTML_EOF} always terminates
     */
    public int nextHtml() {
        try {
            while (true) {
                int token = this.nextToken();
                switch (token) {
                    case StreamTokenizer.TT_EOF:
                        return HTML_EOF;
                    case '<':
                        outsideTag = false; // entering a tag
                        break;
                    case '>':
                        outsideTag = true;  // leaving a tag
                        break;
                    case StreamTokenizer.TT_WORD:
                        if (allWhite(sval)) {
                            break;          // ignore blank runs between tokens
                        }
                        return classifyWord(sval);
                    default:
                        System.out.println("Unknown tag: " + token);
                        return HTML_UNKNOWN;
                }
            }
        } catch (IOException e) {
            System.out.println("Error:" + e.getMessage());
            // Reporting EOF rather than HTML_UNKNOWN stops callers from
            // looping forever on a reader that keeps failing.
            return HTML_EOF;
        }
    }

    /** Maps a non-blank word to its {@code HTML_*} code using the current tag state. */
    private int classifyWord(String word) {
        if (outsideTag) {
            return HTML_TEXT;
        }
        // Locale.ROOT keeps the comparison independent of the default locale.
        String upper = word.toUpperCase(java.util.Locale.ROOT);
        if (upper.indexOf("FRAME") != -1) {
            return HTML_FRAME;
        }
        if (upper.indexOf("IMG") != -1) {
            return HTML_IMAGE;
        }
        if (upper.indexOf("BACKGROUND") != -1) {
            return HTML_BACKGROUND;
        }
        if (upper.indexOf("APPLET") != -1) {
            return HTML_APPLET;
        }
        System.out.println("Unknown tag: " + word);
        return HTML_UNKNOWN;
    }

    /**
     * Tells whether {@code s} contains nothing but whitespace.
     *
     * @param s the text to inspect; may be {@code null}
     * @return {@code true} if {@code s} is {@code null}, empty, or made up
     *         entirely of whitespace characters
     */
    protected boolean allWhite(String s) {
        if (s == null) {
            return true;
        }
        for (int i = 0; i < s.length(); i++) {
            if (!Character.isWhitespace(s.charAt(i))) {
                return false;
            }
        }
        return true;
    }
}
//变量theUrl表示该HTML页面的位置 if(theUrl.toUpperCase().endsWith("HTML") theUrl.toUpperCase().endsWith("HTM")){ try{ fin=new FileInputStream(theCurrentPath+filename); //读入该HTML文件 in=new InputStreamReader(fin); reader=new BufferedReader(in); }catch(FileNotFoundException e){ System.out.println(e.getMessage());} HtmlStream=new HtmlTokenizer(reader); //生成类HtmlTokenizer的实例 //读该令牌流,并做分析 while((tagType=HtmlStream.nextHtml()) !=HtmlStream.HTML_EOF){ if(tagType==HtmlStream.HTML_IMAGE){ // 标记< IMG > st=new StringTokenizer(HtmlStream.sval," "); //sval即"< IMG"之后直到下一标记之间的字符 while(st.hasMoreTokens()){ tmp=st.nextToken(); if(tmp.toUpperCase().trim().startsWith("SRC")){ tmpPath=tmp.substring(tmp.indexOf(´"´)+1, tmp.lastIndexOf(´/´)); //取该图形文件存放的相对目录 tmpFileName=tmpPath.substring (tmpPath.indexOf(´/´)+1); //取文件名 catchHtmlFile(theUrl,tmpPath,tmpFileName); //下载该文件 } //end of if } //end of while } //end of if else if(tagType==HtmlStream.HTML_FRAME){ // 处理同上,取出各框架中的HTML文件名,分别下载。 } //end of else if else if(tagType==HtmlStream.HTML_BACKGROUND){ //略 } //end of the first while }//end of first if }//end of getIt