Valhalla Legends Forums Archive | MSN Client/Bot Development | UTF-8 conversion routines (C++, Win32)

Author | Message | Time
Skywing: I've spent a good chunk of time figuring out how to properly encode and decode UTF-8 characters, so I figure that I might as well save somebody else some grief and post what I came up with.

If you are using this with Windows 95, you'll have to use the Microsoft Layer for Unicode (blech!).  These conversions are natively supported on Windows 98 and Windows NT 4.0 or later.

Code:
// Convert 8-bit characters from the active locale to UTF-8.
// Text and Result MAY be the same buffer.
string& UTF8Encode(IN const string& Text, OUT string& Result)
{
        int InputChars = strlen(Text.c_str())+0;        // MSVCP strings tend to append extra nulls, so don't
                                                                                                // process them.

        // We need to first convert the ASCII input to Unicode before we can convert it to UTF-8...
        int UnicodeChars = MultiByteToWideChar(CP_ACP, 0, Text.c_str(), InputChars, 0, 0);
        LPWSTR UnicodeBuffer = (LPWSTR)alloca(UnicodeChars*sizeof(WCHAR));
        MultiByteToWideChar(CP_ACP, 0, Text.c_str(), InputChars, UnicodeBuffer, UnicodeChars);

        // Now that we've got everything translated to Unicode, we can (finally) convert it to UTF-8.
        int UTF8Chars = WideCharToMultiByte(CP_UTF8, 0, UnicodeBuffer, UnicodeChars, 0, 0, 0, 0);
        LPSTR UTF8Buffer = (LPSTR)alloca(UTF8Chars);
        WideCharToMultiByte(CP_UTF8, 0, UnicodeBuffer, UnicodeChars, UTF8Buffer, UTF8Chars, 0, 0);

        // Store it in the std::string passed in.  Note that this string will be null terminated just like
        // any other C-style string, but it'll have the special encodings for multibyte characters.
        Result.assign(UTF8Buffer, UTF8Chars);

        return Result;
}

// Convert UTF-8 characters to 8-bit characters from the active locale.
// UTF8 and Result MAY be the same buffer.
string& UTF8Decode(IN const string& UTF8, OUT string& Result)
{
        int InputBytes = strlen(UTF8.c_str())+0;        // MSVCP strings tend to append extra nulls, so don't
                                                                                                // process them.

        // Again, we need to convert the UTF-8 string to Unicode before we can convert it to 8-bit.
        int UnicodeChars = MultiByteToWideChar(CP_UTF8, 0, UTF8.c_str(), InputBytes, 0, 0);
        LPWSTR UnicodeBuffer = (LPWSTR)alloca(UnicodeChars*sizeof(WCHAR));
        MultiByteToWideChar(CP_UTF8, 0, UTF8.c_str(), InputBytes, UnicodeBuffer, UnicodeChars);

        // Now that we've got everything translated to Unicode, we can convert it to 8-bit characters.
        int SingleByteChars = WideCharToMultiByte(CP_ACP, 0, UnicodeBuffer, UnicodeChars, 0, 0, 0, 0);
        LPSTR SingleByteBuffer = (LPSTR)alloca(SingleByteChars);
        WideCharToMultiByte(CP_ACP, 0, UnicodeBuffer, UnicodeChars, SingleByteBuffer, SingleByteChars, 0,
                0);

        Result.assign(SingleByteBuffer, SingleByteChars);

        return Result;
}
April 12, 2003, 06:49 PM
Camel: or in VB:
Code:
'Props to Skywing for UTF8 encoding in C++!
'https://davnit.net/bnet/vL/phpbbs/index.php?board=18;action=display;threadid=1027&start=0

'kernel32 imports used by UTF8Encode/UTF8Decode below.
'NOTE(review): every buffer, including the wide-character ones, is declared
'ByVal ... As String. Confirm how VB marshals these arguments and that the
'wide-character data survives the round trip on the target locale.

'Converts a multibyte string in the given code page to wide characters.
'Returns a Long; the callers below use the result of the zero-length-buffer call as the required size.
Public Declare Function MultiByteToWideChar Lib "kernel32" (ByVal CodePage As Long, ByVal dwFlags As Long, ByVal lpMultiByteStr As String, ByVal cchMultiByte As Long, ByVal lpWideCharStr As String, ByVal cchWideChar As Long) As Long
'Converts wide characters to a multibyte string in the given code page.
'Returns a Long; the callers below use the result of the zero-length-buffer call as the required size.
Public Declare Function WideCharToMultiByte Lib "kernel32" (ByVal CodePage As Long, ByVal dwFlags As Long, ByVal lpWideCharStr As String, ByVal cchWideChar As Long, ByVal lpMultiByteStr As String, ByVal cchMultiByte As Long, ByVal lpDefaultChar As String, ByVal lpUsedDefaultChar As Long) As Long
'Code page identifier passed for the 8-bit side of each conversion.
Public Const CP_ACP = 0
'Code page identifier passed for the UTF-8 side of each conversion.
Public Const CP_UTF8 = 65001

'Convert a string from the CP_ACP code page to UTF-8.
'  str     - text to convert.
'  Returns - the converted text, or an empty string when str is empty.
Public Function UTF8Encode(str As String) As String
    Dim InputChars As Long
    InputChars = Len(str)

    'Nothing to convert: leave the function result at its default empty string.
    If InputChars = 0 Then Exit Function

    'We need to first convert the 8-bit input to Unicode before we can convert it to UTF-8...
    'The first call passes no output buffer and only asks for the required size.
    Dim UnicodeChars As Long, UnicodeBuffer As String
    UnicodeChars = MultiByteToWideChar(CP_ACP, 0, str, InputChars, vbNullString, 0)
    UnicodeBuffer = Space(UnicodeChars * 2)
    MultiByteToWideChar CP_ACP, 0, str, InputChars, UnicodeBuffer, UnicodeChars

    'Now that we've got everything translated to Unicode, we can (finally) convert it to UTF-8.
    'The sizing call passes vbNullString for the output buffer, matching UTF8Decode.
    Dim UTF8Chars As Long, UTF8Buffer As String
    UTF8Chars = WideCharToMultiByte(CP_UTF8, 0, UnicodeBuffer, UnicodeChars, vbNullString, 0, vbNullString, 0)
    UTF8Buffer = Space(UTF8Chars)
    WideCharToMultiByte CP_UTF8, 0, UnicodeBuffer, UnicodeChars, UTF8Buffer, UTF8Chars, vbNullString, 0

    UTF8Encode = UTF8Buffer
End Function

'Convert UTF-8 text to the CP_ACP code page.
'  str     - UTF-8 text to convert.
'  Returns - the converted text.
Public Function UTF8Decode(str As String) As String
    Dim cbSource As Long
    Dim cchWide As Long
    Dim cbNarrow As Long
    Dim wideText As String
    Dim narrowText As String

    cbSource = Len(str)

    'Stage 1: UTF-8 -> Unicode. Ask for the size first, then fill a buffer of that size.
    cchWide = MultiByteToWideChar(CP_UTF8, 0, str, cbSource, vbNullString, 0)
    wideText = Space$(cchWide * 2)
    Call MultiByteToWideChar(CP_UTF8, 0, str, cbSource, wideText, cchWide)

    'Stage 2: Unicode -> 8-bit characters, using the same size-then-fill pattern.
    cbNarrow = WideCharToMultiByte(CP_ACP, 0, wideText, cchWide, vbNullString, 0, vbNullString, 0)
    narrowText = Space$(cbNarrow)
    Call WideCharToMultiByte(CP_ACP, 0, wideText, cchWide, narrowText, cbNarrow, vbNullString, 0)

    UTF8Decode = narrowText
End Function
May 05, 2003, 05:56 PM