当前位置: 代码迷 >> 综合 >> UTF8/ANSI/UNICODE文件读取
  详细解决方案

UTF8/ANSI/UNICODE文件读取

热度:48   发布时间:2024-01-13 14:44:16.0
// Determines the encoding of a script file from its leading bytes.
//
// Reads up to sizeof( HXScriptFileCheck ) bytes from the current position of
// hFile and classifies the file:
//   HXSF_CODETYPE_BIN                - a full header was read and IsBinFile() accepts it
//   HXSF_CODETYPE_UTF8               - leading bytes EF BB BF
//   HXSF_CODETYPE_UNICODE            - leading bytes FF FE
//   HXSF_CODETYPE_UNICODE_BIGENDIAN  - leading bytes FE FF
//   HXSF_CODETYPE_ANSI               - anything else (including an empty or unreadable file)
//
// For the text types the file pointer is repositioned just past the BOM
// (offset 0 for ANSI). For HXSF_CODETYPE_BIN the file pointer is not
// repositioned; it stays wherever ReadFile left it.
//
// NOTE(review): assumes hFile is positioned at the start of the file and that
// sfc.byBom overlays the first bytes of HXScriptFileCheck - confirm against the
// struct declaration.
DWORD CHXScriptReal::CheckFileType( HANDLE hFile )
{
    HXScriptFileCheck sfc;
    DWORD dwcbSize = 0;

    // Treat a failed read like an empty file so the checks below never look
    // at bytes that were not actually read.
    if( ! ReadFile( hFile, &sfc, sizeof( sfc ), &dwcbSize, NULL ))
        dwcbSize = 0;

    // ReadFile never returns more bytes than requested, so the header is
    // complete exactly when dwcbSize reaches sizeof( sfc ). (The previous
    // strict '>' comparison could never be true, which disabled this check.)
    if( dwcbSize >= sizeof( sfc ) && IsBinFile( &sfc ))
        return HXSF_CODETYPE_BIN;

    if( dwcbSize >= 3
        && sfc.byBom[ 0 ] == 0xEF
        && sfc.byBom[ 1 ] == 0xBB
        && sfc.byBom[ 2 ] == 0xBF )
    {
        SetFilePointer( hFile, 3, NULL, FILE_BEGIN );
        return HXSF_CODETYPE_UTF8;
    }

    if( dwcbSize >= 2 && sfc.byBom[ 0 ] == 0xFF && sfc.byBom[ 1 ] == 0xFE )
    {
        SetFilePointer( hFile, 2, NULL, FILE_BEGIN );
        return HXSF_CODETYPE_UNICODE;
    }

    if( dwcbSize >= 2 && sfc.byBom[ 0 ] == 0xFE && sfc.byBom[ 1 ] == 0xFF )
    {
        SetFilePointer( hFile, 2, NULL, FILE_BEGIN );
        return HXSF_CODETYPE_UNICODE_BIGENDIAN;
    }

    // No recognised signature: rewind so the caller reads the file from byte 0.
    SetFilePointer( hFile, 0, NULL, FILE_BEGIN );
    return HXSF_CODETYPE_ANSI;
}
//读取一个字符
WCHAR CHXLexer::ReadNextCharFromFile()
{char  btChar;WCHAR ch;DWORD dwReaded;assert( m_hFile != NULL && m_hFile != INVALID_HANDLE_VALUE );switch( m_dwCodeType ){case HXSF_CODETYPE_UNICODE:if(( ! ReadFile( m_hFile, &ch, sizeof( WCHAR ), &dwReaded, NULL )) || ( dwReaded != sizeof( WCHAR )))ch = 0;break;case HXSF_CODETYPE_UNICODE_BIGENDIAN:if( ReadFile( m_hFile, &ch, sizeof( WCHAR ), &dwReaded, NULL ) && ( dwReaded == sizeof( WCHAR ))){WCHAR chTmp = ch;ch = chTmp << 8;ch |= ( chTmp >> 8 );}elsech = 0;break;case HXSF_CODETYPE_UTF8:if( ReadFile( m_hFile, &btChar, 1, &dwReaded, NULL ) && ( dwReaded == 1 )){char szch[ 8 ];WCHAR szwch[ 2 ];int n, i;if(( btChar & 0x80 ) == 0x00 )n = 1;else if(( btChar & 0xE0 ) == 0xC0 )n = 2;else if(( btChar & 0xF0 ) == 0xE0 )n = 3;else if(( btChar & 0xF8 ) == 0xF0 )n = 4;else if(( btChar & 0xFC ) == 0xF8 )n = 5;else if(( btChar & 0xFE ) == 0xFC )n = 6;elsen = 0;szch[ 0 ] = btChar;for( i = 1; i < n; ++ i ){if( ReadFile( m_hFile, &btChar, 1, &dwReaded, NULL ) && ( dwReaded == 1 ))szch[ i ] = btChar;elsebreak;}szch[ i ] = 0;if( ::MultiByteToWideChar( CP_UTF8, 0, szch, i, szwch, 2 ) != 0 )ch = *szwch;elsech = 0;}elsech = 0;break;case HXSF_CODETYPE_ANSI:if( ReadFile( m_hFile, &btChar, 1, &dwReaded, NULL ) && ( dwReaded == 1 )){char szch[ 4 ];WCHAR szwch[ 2 ];int n = 1;szch[ 0 ] = btChar;if(( btChar & 0x80 ) == 0x80 ){if( ReadFile( m_hFile, &btChar, 1, &dwReaded, NULL ) && ( dwReaded == 1 )){szch[ 1 ] = btChar;szch[ 2 ] = 0;++n;}elseszch[ 1 ] = 0;}elseszch[ 1 ] = 0;if( ::MultiByteToWideChar( CP_ACP, 0, szch, n, szwch, 2 ) == 1 )ch =  *szwch;elsech = 0;}elsech = 0;break;}return ch;
}
// Reports whether a buffer is structurally well-formed UTF-8.
//
// pszBuffer - bytes to examine
// ncb       - number of bytes in pszBuffer
//
// Returns TRUE when every byte belongs to a complete 1-, 2-, 3- or 4-byte
// sequence whose trailing bytes all have the form 10xxxxxx. Returns FALSE on
// an invalid lead byte, a bad continuation byte, a sequence cut off by the end
// of the buffer, a NULL buffer with ncb > 0, or a negative ncb. An empty
// buffer (ncb == 0) yields TRUE. Pure ASCII input also yields TRUE.
//
// This is a structural check only: overlong encodings and encoded surrogates
// are not rejected.
BOOL CHXScriptReal::IsTextUTF8( BYTE * pszBuffer, int ncb )
{
    if( pszBuffer == NULL && ncb > 0 )
        return FALSE;

    int i = 0;
    while( i < ncb )
    {
        const BYTE lead = pszBuffer[ i ];
        int step;

        if(( lead & 0x80 ) == 0x00 )
            step = 1;                               // 0xxxxxxx
        else if(( lead & 0xE0 ) == 0xC0 )
            step = 2;                               // 110xxxxx
        else if(( lead & 0xF0 ) == 0xE0 )
            step = 3;                               // 1110xxxx
        else if(( lead & 0xF8 ) == 0xF0 && lead <= 0xF4 )
            step = 4;                               // 11110xxx, capped at U+10FFFF
        else
            return FALSE;                           // stray continuation or invalid lead byte

        // The whole sequence must fit in what is left of the buffer.
        if( step > ncb - i )
            return FALSE;

        for( int k = 1; k < step; ++ k )
        {
            if(( pszBuffer[ i + k ] & 0xC0 ) != 0x80 )
                return FALSE;
        }

        i += step;
    }

    // i == ncb after a clean scan; a negative ncb never enters the loop and
    // is reported as FALSE.
    return ( i == ncb ) ? TRUE : FALSE;
}


  相关解决方案