2010年3月25日 星期四

JAVA 版的EncodeURI (RFC 1738)

為什麼Java的URLEncoder.encode、URLDecoder.decode和JavaScript的不一樣!!!!



最近因為一個工作需求,需要作encode, decode uri的動作,之前這個工作是由Server來處理,所以我其實不需要動到,原本我這邊只會用到Java而已


原本都是用Java版的URLEncoder與URLDecoder


結果發現兩邊的結果完全不同


 


查了一下,才發現在JavaScript 等網路程式語言的Encode有分好幾種


最基本的Encode、encodeURI、encodeURIComponent等~


 


java中的encoder只有「最完整」的那一種而已。


查了很久後,也只有發現有人有寫encode程式碼而已(感謝apache)


於是就乾脆自己把decode path寫出來


(反正都有encode了,要寫decode也不是很難)


 


不過測試了一下之後,對於中文是無法成功轉回來的。於是只好強制轉回來顯示為utf-8, 這樣子中文問題雖然解決了


但是還是有一些特殊字元無法被轉回來(不屬於utf-8的字元)


 


也就是說:同樣的一個url,encode再decode過後的結果不一定會相同Orz...怎麼會這樣子~


字元語系真是麻煩啊~


 


所以最後我又多寫了一個處理url的method


當它為合法的url path時,就不要再進行任何轉換了


 


於是就寫成了三個method(有一個是從別人程式碼上複製下來的)


 


/**
 * Percent-encodes and decodes URL paths using the safe character set of RFC 1738.
 *
 * <p>Unlike {@code java.net.URLEncoder}, which implements the
 * {@code x-www-form-urlencoded} format, the encoders here leave {@code '/'} and the other
 * RFC 1738 path characters untouched and encode a space as {@code %20}. Characters that
 * must be escaped are converted to their UTF-8 bytes and each byte is written as
 * {@code %XX} with upper-case hexadecimal digits.
 *
 * <p>The safe character table and the original encoding loop were adapted from
 * {@code org.apache.catalina.servlets.DefaultServlet}; see also
 * http://www.java2s.com/Code/Java/Network-Protocol/ProvidesamethodtoencodeanystringintoaURLsafeform.htm
 */
public class UrlEncoderDecoder {

    /** Charset used both for producing escape bytes and for interpreting decoded bytes. */
    private static final java.nio.charset.Charset UTF_8 = java.nio.charset.StandardCharsets.UTF_8;

    /** Upper-case hexadecimal digits, indexed by nibble value. */
    private static final char[] hexadecimal =
            {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'};

    /** Characters that are copied to the output unescaped, as defined by RFC 1738. */
    private static final BitSet safeCharacters = buildSafeCharacters();

    /** Builds the table of characters that never need escaping. */
    private static BitSet buildSafeCharacters() {
        BitSet safe = new BitSet(256);

        // 'lowalpha', 'hialpha' and 'digit' rules (upper bound of set(from, to) is exclusive)
        safe.set('a', 'z' + 1);
        safe.set('A', 'Z' + 1);
        safe.set('0', '9' + 1);

        // 'safe' rule
        for (char c : "$-_.+".toCharArray()) {
            safe.set(c);
        }
        // 'extra' rule
        for (char c : "!*'(),".toCharArray()) {
            safe.set(c);
        }
        // special characters common to http: file: and ftp: URLs
        // ('fsegment' and 'hsegment' rules)
        for (char c : "/:@&=".toCharArray()) {
            safe.set(c);
        }
        return safe;
    }

    /**
     * Decodes the {@code %XX} escapes in a path.
     *
     * <p>Each run of consecutive escapes is interpreted as UTF-8; byte sequences that are
     * not valid UTF-8 are replaced by U+FFFD, so only UTF-8 encoded input round-trips.
     * Characters that are not part of an escape (including non-ASCII characters and
     * {@code '+'}) are copied to the result unchanged.
     *
     * @param path the encoded path; must not be {@code null}
     * @return the decoded path
     * @throws IllegalArgumentException if a {@code '%'} is not followed by two hexadecimal digits
     * @throws IOException never thrown by this implementation; declared so that existing
     *         callers that catch it keep compiling
     */
    public static String decodePath(String path) throws IOException {
        if (path == null) {
            throw new NullPointerException("path");
        }
        int length = path.length();
        StringBuilder decoded = new StringBuilder(length);
        // Bytes from consecutive escapes; they are decoded together because one character
        // may span several escapes.
        ByteArrayOutputStream pending = new ByteArrayOutputStream();

        int i = 0;
        while (i < length) {
            char c = path.charAt(i);
            if (c != '%') {
                flushPending(pending, decoded);
                decoded.append(c);
                i++;
                continue;
            }
            if (i + 2 >= length) {
                throw new IllegalArgumentException(
                        "Incomplete escape sequence at index " + i + " in \"" + path + "\"");
            }
            int high = hexValue(path.charAt(i + 1));
            int low = hexValue(path.charAt(i + 2));
            if (high < 0 || low < 0) {
                throw new IllegalArgumentException(
                        "Invalid escape sequence \"" + path.substring(i, i + 3)
                                + "\" at index " + i + " in \"" + path + "\"");
            }
            pending.write((high << 4) | low);
            i += 3;
        }
        flushPending(pending, decoded);
        return decoded.toString();
    }

    /**
     * Encodes a path as required by RFC 1738.
     *
     * <p>A space is encoded as {@code %20}, {@code '/'} is left as is, and a literal
     * {@code '%'} is encoded as {@code %25}.
     *
     * @param path the path to encode; must not be {@code null}
     * @return the encoded path
     */
    public static String encodePath(String path) {
        return encode(path, false);
    }

    /**
     * Encodes a path like {@link #encodePath(String)} but copies every {@code '%'} through
     * unchanged, so that escapes already present in the input are not encoded a second time.
     *
     * <p>Note that a {@code '%'} is kept even when it is not followed by two hexadecimal
     * digits; the result is only a legal path if the input's {@code '%'} characters all
     * start valid escapes.
     *
     * @param path the (possibly partly encoded) path; must not be {@code null}
     * @return the encoded path
     */
    public static String encodeToLegalPath(String path) {
        return encode(path, true);
    }

    /** Shared encoder; {@code keepPercent} selects whether '%' is copied through or escaped. */
    private static String encode(String path, boolean keepPercent) {
        if (path == null) {
            throw new NullPointerException("path");
        }
        int length = path.length();
        StringBuilder encoded = new StringBuilder(length);

        int i = 0;
        while (i < length) {
            char c = path.charAt(i);
            if (safeCharacters.get(c) || (keepPercent && c == '%')) {
                encoded.append(c);
                i++;
                continue;
            }
            // Encode a surrogate pair as one code point; an unpaired surrogate is replaced
            // by '?' (%3F) by the UTF-8 encoder instead of being dropped.
            int end = i + 1;
            if (Character.isHighSurrogate(c) && end < length
                    && Character.isLowSurrogate(path.charAt(end))) {
                end++;
            }
            byte[] bytes = path.substring(i, end).getBytes(UTF_8);
            for (byte b : bytes) {
                encoded.append('%');
                encoded.append(hexadecimal[(b & 0xf0) >> 4]);
                encoded.append(hexadecimal[b & 0x0f]);
            }
            i = end;
        }
        return encoded.toString();
    }

    /** Appends the buffered escape bytes, interpreted as UTF-8, and empties the buffer. */
    private static void flushPending(ByteArrayOutputStream pending, StringBuilder target) {
        if (pending.size() > 0) {
            target.append(new String(pending.toByteArray(), UTF_8));
            pending.reset();
        }
    }

    /** Returns the value of an ASCII hexadecimal digit, or -1 if {@code c} is not one. */
    private static int hexValue(char c) {
        if (c >= '0' && c <= '9') {
            return c - '0';
        }
        if (c >= 'a' && c <= 'f') {
            return c - 'a' + 10;
        }
        if (c >= 'A' && c <= 'F') {
            return c - 'A' + 10;
        }
        return -1;
    }
}


4 則留言: