ورود

View Full Version : خواندن فایل با فرمت UTF-8



zehs_sha
پنج شنبه 17 شهریور 1384, 12:42 عصر
خواندن فایل با فرمت UTF-8



import java.io.*;
/**
 * http://h21007.www2.hp.com/dspp/tech/tech_TechDocumentDetailPage_IDX/1,1701,5754,00.html
 *
 * A {@link Reader} that buffers bytes from an underlying {@link InputStream}
 * and decodes them from UTF-8 into Java chars. Code points above U+FFFF
 * (4-byte sequences) are delivered as a surrogate pair, one char per call.
 * Method readLine() returns a single line of input as a Java String.
 *
 * The decoder validates continuation bytes and the range of 4-byte
 * sequences. It does not reject overlong 2- or 3-byte forms, nor surrogates
 * encoded as 3-byte sequences.
 *
 * Because this class buffers internally, use
 *
 *     new UTF8InputStreamReader(in, 2048)
 *
 * instead of
 *
 *     new BufferedReader(
 *         new InputStreamReader(
 *             new BufferedInputStream(in, 2048), "UTF8"
 *         ), 1
 *     )
 *
 * All public operations synchronize on the inherited lock object.
 */
public class UTF8InputStreamReader extends Reader
{
    /** Message used for every malformed-input failure. */
    private static final String MALFORMED = "Malformed UTF-8 input";

    /** Underlying byte source; null once the reader has been closed. */
    private InputStream ins;
    /** Raw bytes read from the stream but not yet decoded. */
    private byte bytebuf[];
    /** Index of the next undecoded byte in bytebuf. */
    private int byteptr = 0;
    /** Number of valid bytes currently held in bytebuf. */
    private int numbytes = 0;
    /** Non-zero while the low surrogate of a 4-byte sequence is pending. */
    private int hasExtra = 0;
    /** The pending low surrogate (meaningful only when hasExtra > 0). */
    private char extraCh = 0;
    /**
     * A char that readLine() read ahead and handed back, or -1 if none.
     * Kept separate from extraCh so that pushing back a high surrogate
     * does not overwrite its pending low surrogate.
     */
    private int pushedBack = -1;

    /**
     * Creates a reader over the given stream.
     * A bufsize of 8192 bytes would be good; there is no need to wrap the
     * result in a buffered reader.
     *
     * @param i       the byte stream to decode
     * @param bufsize size of the internal byte buffer; must be positive
     * @exception IllegalArgumentException if bufsize is not positive
     */
    public UTF8InputStreamReader(InputStream i, int bufsize)
    {
        // A zero-length buffer would make every fill look like end of file.
        if (bufsize <= 0)
            throw new IllegalArgumentException("Buffer size <= 0");
        ins = i;
        bytebuf = new byte[bufsize];
    }

    /**
     * @return the name of the encoding handled by this reader, "UTF8"
     */
    public String getEncoding()
    {
        return "UTF8";
    }

    /**
     * Verifies that the reader has not been closed.
     *
     * @exception IOException if the underlying stream reference is null
     */
    void checkOpen() throws IOException
    {
        if (ins == null)
            throw new IOException("Stream closed");
    }

    /**
     * Returns the next raw byte as an unsigned value (0..255), refilling
     * the buffer from the underlying stream when it is exhausted.
     *
     * @return the next byte, or -1 at end of stream
     */
    private int morebyte() throws IOException
    {
        if (byteptr < numbytes)
        {
            return 0xff & bytebuf[byteptr++];
        }
        byteptr = 0;
        numbytes = ins.read(bytebuf); // fill buffer from underlying stream
        if (numbytes > 0)
        {
            return 0xff & bytebuf[byteptr++];
        }
        numbytes = 0; // keep the buffer state consistent after EOF
        return -1; // hit EOF
    }

    /**
     * Signals malformed UTF-8 input.
     *
     * @exception IOException always; a java.io.CharConversionException
     */
    void something_wrong() throws IOException
    {
        throw new CharConversionException(MALFORMED);
    }

    /**
     * Reads one continuation byte (10xxxxxx) and returns its 6 payload bits.
     * End of stream or a byte of any other shape is reported as malformed.
     */
    private int continuation() throws IOException
    {
        int b = morebyte();
        if ((b < 0) || ((b & 0xc0) != 0x80))
            something_wrong();
        return b & 0x3f;
    }

    /**
     * Decodes and returns the next UTF-16 char.
     *
     * @return the char as an int, or -1 at end of stream
     * @exception IOException on malformed input or an I/O error
     */
    private int translate() throws IOException
    {
        // A pushed-back char always precedes any pending low surrogate.
        if (pushedBack >= 0)
        {
            int ch = pushedBack;
            pushedBack = -1;
            return ch;
        }
        if (hasExtra > 0)
        {
            hasExtra = 0;
            return extraCh;
        }
        int char1 = morebyte();
        if (char1 < 0)
            return char1; // EOF
        if (0 == (char1 & 0x80))
            return char1; // 1 byte UTF
        switch (char1 >> 4)
        {
        case 0xc:
        case 0xd:
            // 2 byte UTF
            return ((char1 & 0x1f) << 6) | continuation();
        case 0xe:
        {
            // 3 byte UTF
            int b2 = continuation();
            int b3 = continuation();
            return ((char1 & 0xf) << 12) | (b2 << 6) | b3;
        }
        case 0xf:
        {
            // 4 byte UTF; lead bytes 0xf8..0xff are not valid UTF-8
            if (char1 > 0xf7)
                something_wrong();
            int b2 = continuation();
            int b3 = continuation();
            int b4 = continuation();
            int a4 = ((char1 & 0x7) << 18) | (b2 << 12) | (b3 << 6) | b4;
            // Only U+10000..U+10FFFF can be expressed as a surrogate pair.
            if ((a4 < 0x10000) || (a4 > 0x10ffff))
                something_wrong();
            int offset = a4 - 0x10000;
            hasExtra = 1;
            extraCh = (char) (0xdc00 + (offset & 0x3ff));
            return 0xd800 + (offset >> 10);
        }
        default:
            // 0x8..0xb: a continuation byte where a lead byte was expected
            throw new CharConversionException(MALFORMED);
        }
    }

    /**
     * Reads a single char.
     *
     * @return the char read, or -1 if the end of the stream has been reached
     * @exception IOException if the reader is closed, the input is
     *            malformed, or an I/O error occurs
     */
    public int read() throws IOException
    {
        synchronized (lock)
        {
            checkOpen(); // fail with IOException, not NPE, after close()
            return translate();
        }
    }

    /**
     * Reads chars into a portion of an array, stopping when len chars have
     * been stored or the end of the stream is reached.
     *
     * @param cbuf destination buffer
     * @param off  offset at which to start storing chars
     * @param len  maximum number of chars to read
     * @return the number of chars read, 0 if len is 0, or -1 if the end of
     *         the stream was reached before any char was read
     * @exception IndexOutOfBoundsException if off/len do not describe a
     *            valid range of cbuf
     * @exception IOException if the reader is closed, the input is
     *            malformed, or an I/O error occurs
     */
    public int read(char cbuf[], int off, int len) throws IOException
    {
        int end = off + len;
        // end < 0 catches int overflow of off + len
        if ((len < 0) || (off < 0) || (cbuf.length < off) || (end < 0) ||
            (cbuf.length < end))
        {
            throw new IndexOutOfBoundsException();
        }
        int result = 0;
        synchronized (lock)
        {
            checkOpen();
            if (len == 0)
                return 0;
            for (; len > 0; --len)
            {
                int ch = translate();
                if (ch < 0)
                    break;
                cbuf[off++] = (char) ch;
                ++result;
            }
        }
        return (result == 0) ? -1 : result;
    }

    /**
     * Read from UTF8 stream until end of line, and return the content
     * as a Java String. A line is considered to be terminated by any one
     * of a line feed ('\n'), a carriage return ('\r'), or a carriage
     * return followed by a linefeed.
     *
     * @return A String containing the contents of the line, not including
     *         any line-termination characters, or null if the end of the
     *         stream has been reached
     *
     * @exception IOException If the reader is closed, the input is
     *            malformed, or an I/O error occurs
     */
    public String readLine() throws IOException
    {
        StringBuffer s = null;
        synchronized (lock)
        {
            checkOpen();
            for (;;)
            {
                int ch = translate();
                if (ch < 0) // eof
                    return (s == null) ? null : s.toString();
                if (ch == (int) '\n')
                    break;
                if (ch == (int) '\r')
                {
                    // Swallow the '\n' of a CR LF pair; anything else
                    // belongs to the next line and is handed back.
                    int ch2 = translate();
                    if ((ch2 != (int) '\n') && (ch2 >= 0))
                    {
                        pushedBack = ch2; // put back
                    }
                    break;
                }
                if (s == null)
                    s = new StringBuffer(80);
                s.append((char) ch);
            }
        }
        return (s == null) ? "" : s.toString();
    }

    /**
     * Tells whether a read is likely to succeed without blocking: a char is
     * already pending, undecoded bytes are buffered, or the underlying
     * stream reports available bytes.
     *
     * @return true if input appears to be available; false otherwise,
     *         including when the underlying stream fails to report
     * @exception IOException if the reader is closed
     */
    public boolean ready() throws IOException
    {
        synchronized (lock)
        {
            checkOpen();
            try
            {
                return (pushedBack >= 0) || (hasExtra > 0) ||
                    (numbytes > byteptr) || (ins.available() > 0);
            }
            catch (IOException e1)
            {
                // Best effort: a stream that cannot report is "not ready".
                return false;
            }
        }
    }

    /**
     * Closes the underlying stream and releases the buffer. Closing an
     * already closed reader has no effect.
     *
     * @exception IOException if closing the underlying stream fails; the
     *            reader is marked closed regardless
     */
    public void close() throws IOException
    {
        synchronized (lock)
        {
            if (ins != null)
            {
                try
                {
                    ins.close();
                }
                finally
                {
                    ins = null;
                    bytebuf = null;
                }
            }
        }
    }
}