2017-11-08

Java中處理HTML Entity (HTML Special Char)

幫同事處理資料庫中含有HTML Entity的字元,將其轉為對應的Unicode字元。
(猜測是舊系統的網頁編碼為UTF-8,但送出表單時的charset為BIG5;瀏覽器遇到BIG5無法表示的字元時,會改以 &#NNNNN; 形式的數值字元參照(numeric character reference)送出,因此資料庫中存入的是HTML Entity而非原本的字元)

// Demo: decode decimal numeric character references (&#NNNNN;) into the
// characters they denote, using two approaches, and print how long each took.
// Each approach is run once without warm-up, so the timings are only indicative.

// Sample value in the form it is stored in: the affected characters appear as
// decimal numeric character references rather than as the characters themselves.
String name = "王&#34320;老&#34321;先&#34322;生&#34323;王&#34324;老&#34325;先&#34326;生&#34327;王&#34328;老&#34320;先&#34329;生&#34330;";
// Group 1 captures the decimal code point. One to seven digits is enough to
// express every Unicode code point (the maximum, U+10FFFF, is 1114111).
Pattern regex = Pattern.compile("&#(\\d{1,7});");

// nanoTime is monotonic, which makes it the right clock for elapsed time.
long time1 = System.nanoTime();

// Approach 1: let the Swing HTML parser (HTMLDocument + HTMLEditorKit) decode the text.
HTMLDocument doc = new HTMLDocument();
new HTMLEditorKit().read(new StringReader(name), doc, 0);
System.out.println(doc.getText(0, doc.getLength()));

long time2 = System.nanoTime();
System.out.println((time2 - time1) / 1000000L); // elapsed ms; about 60 ms in the author's run

// Approach 2: regular expression.
// StringBuffer (not StringBuilder) keeps this usable before Java 9, where
// Matcher.appendReplacement only accepts a StringBuffer.
StringBuffer sb = new StringBuffer();
Matcher match = regex.matcher(name);
while (match.find()) {
    int codePoint = Integer.parseInt(match.group(1));
    // Leave the reference untouched when the number is not a Unicode code point;
    // Character.toChars would otherwise throw IllegalArgumentException.
    String decoded = Character.isValidCodePoint(codePoint)
            ? new String(Character.toChars(codePoint))
            : match.group();
    // quoteReplacement is required: appendReplacement interprets '$' and '\' in
    // the replacement, so a reference that decodes to either one (e.g. &#36;)
    // would throw or be mangled.
    match.appendReplacement(sb, Matcher.quoteReplacement(decoded));
}
match.appendTail(sb);
System.out.println(sb);

long time3 = System.nanoTime();
System.out.println((time3 - time2) / 1000000L); // elapsed ms; about 1 ms in the author's run