How to Handle Non-UTF-8 HTML Pages

Total Views : 50
Zoom In Zoom Out Read Later Print

This code shows how to fetch the content of a web/HTML page that is not encoded in UTF-8. Try both encodings (GB2312 and UTF-8) to see which one works for your page.

NonUTF8HtmlPage.java

package com.prodevsblog.how_to.network.url;

import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.zip.GZIPInputStream;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;

/**
 * Downloads a single HTML page and prints its text to standard output,
 * decoding the bytes with the charset the server declares instead of assuming
 * UTF-8.
 *
 * <p>The request advertises {@code gzip} and {@code deflate} support, so the
 * response body is transparently decompressed according to the
 * {@code Content-Encoding} header before it is decoded.
 */
public class NonUTF8HtmlPage {

    /** Charset name used when the response declares no usable charset. */
    private static final String FALLBACK_CHARSET_NAME = "GB2312";

    /** Size of the char buffer used while draining the response. */
    private static final int CHUNK_SIZE = 8192;

    /**
     * Fetches the demo URL and prints the decoded page. Any failure is reported
     * with a stack trace; whatever was read up to that point is still printed.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        StringBuilder page = new StringBuilder();
        HttpURLConnection conn = null;

        try {
            URL url = new URL("https://www.chinadialogue.net/");

            conn = (HttpURLConnection) url.openConnection();
            // Per-connection setting; the static setFollowRedirects only changes the
            // default for connections created afterwards.
            conn.setInstanceFollowRedirects(true);
            conn.setRequestProperty("Accept-Encoding", "gzip, deflate");

            // The most important part: pick the charset the server announced and only
            // fall back to the hard-coded one when nothing usable was declared.
            Charset charset = charsetFromContentType(
                    conn.getContentType(), Charset.forName(FALLBACK_CHARSET_NAME));

            // Resources are closed in reverse order, even when decoding fails.
            try (InputStream raw = conn.getInputStream();
                 InputStream body = decompress(raw, conn.getContentEncoding())) {
                readInto(body, charset, page);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Disconnect last, after the streams have been closed.
            if (conn != null) {
                conn.disconnect();
            }
        }

        // Print Html Page Content
        System.out.println(page.toString());
    }

    /**
     * Extracts the {@code charset} parameter from a {@code Content-Type} header value.
     * Package-private so it can be exercised without a network connection.
     *
     * @param contentType header value such as {@code "text/html; charset=UTF-8"}; may be null
     * @param fallback    charset returned when no supported charset is declared
     * @return the declared charset if the JVM supports it, otherwise {@code fallback}
     */
    static Charset charsetFromContentType(String contentType, Charset fallback) {
        if (contentType == null) {
            return fallback;
        }
        String[] segments = contentType.split(";");
        // Segment 0 is the media type itself; parameters start at index 1.
        for (int i = 1; i < segments.length; i++) {
            String param = segments[i].trim();
            if (!param.regionMatches(true, 0, "charset=", 0, 8)) {
                continue;
            }
            String name = param.substring(8).trim();
            // The value may be a quoted string.
            if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
                name = name.substring(1, name.length() - 1).trim();
            }
            if (name.isEmpty()) {
                continue;
            }
            try {
                if (Charset.isSupported(name)) {
                    return Charset.forName(name);
                }
            } catch (IllegalArgumentException ignored) {
                // Syntactically illegal charset name: treat it as "not declared".
            }
        }
        return fallback;
    }

    /**
     * Wraps a response stream so that reading it yields the decompressed body.
     * Package-private so it can be exercised without a network connection.
     *
     * @param raw             the stream as received from the connection
     * @param contentEncoding value of the {@code Content-Encoding} header; may be null
     * @return a gzip or raw-deflate decoding stream, or {@code raw} itself for any other value
     * @throws java.io.IOException if the gzip header cannot be read
     */
    static InputStream decompress(InputStream raw, String contentEncoding)
            throws java.io.IOException {
        String encoding = contentEncoding == null ? "" : contentEncoding.trim();
        if (encoding.equalsIgnoreCase("gzip")) {
            return new GZIPInputStream(raw);
        }
        if (encoding.equalsIgnoreCase("deflate")) {
            // Inflater(true) expects raw deflate data without the zlib header.
            return new InflaterInputStream(raw, new Inflater(true));
        }
        return raw;
    }

    /**
     * Decodes the whole stream with the given charset and appends the text to {@code out}.
     * The stream is closed when this method returns. Text read before a failure
     * remains in {@code out}.
     *
     * @param in      byte stream to drain
     * @param charset charset used to decode the bytes
     * @param out     receives the decoded characters
     * @throws java.io.IOException if reading fails
     */
    static void readInto(InputStream in, Charset charset, StringBuilder out)
            throws java.io.IOException {
        try (InputStreamReader reader = new InputStreamReader(in, charset)) {
            char[] chunk = new char[CHUNK_SIZE];
            int count;
            while ((count = reader.read(chunk)) != -1) {
                out.append(chunk, 0, count);
            }
        }
    }
}

Output of the code.

Output:

<!DOCTYPE html>
<html class='no-js' lang='en'>
<!--
node2

-->
<head>
<meta content='text/html; charset=UTF-8' http-equiv='Content-Type'>
<meta charset='utf-8'>
<meta content='width=device-width, initial-scale=1.0' name='viewport'>
<meta content='bbd965df93267a91b48dccdbe9c6be3713ebeb188bc518c6ca5f33f295dea7e4' name='ahrefs-site-verification'>
<meta content='chinadialogue is a bilingual website, with news, features and reports on environmental issues in China and the rest of the world' name='description'>
<meta content='false' name='cn-user'>
<title>
中外对话 chinadialogue | china and the environment
</title>

. . . 
. . . .
. . . . .
. . . . . .
. . . . . . .

<p>
特此申明:中外对话不对任何外部链接的内容承担责任。chinadialogue is not responsible for the content of linked external sites.
</p>
<p>
<a href="https://www.chinadialogue.net/pages/ch/privacy-notice">隐私声明</a>
<a href="https://www.chinadialogue.net/pages/en/privacy-notice">Privacy</a>
<a href="https://www.chinadialogue.net/pages/ch/cookies-notice">选项</a>
<a href="https://www.chinadialogue.net/pages/en/cookies-notice">Cookies</a>
</p>
</div>
</div>
</footer>
</div>

<script>window.twttr = (function(d, s, id) {
  var js, fjs = d.getElementsByTagName(s)[0],
    t = window.twttr || {};
  if (d.getElementById(id)) return t;
  js = d.createElement(s);
  js.id = id;
  js.src = "https://platform.twitter.com/widgets.js";
  fjs.parentNode.insertBefore(js, fjs);

  t._e = [];
  t.ready = function(f) {
    t._e.push(f);
  };

  return t;
}(document, "script", "twitter-wjs"));</script>

</body>
</html>


Process finished with exit code 0

See More

Latest Photos