How to convert minizip wrapper to unicode?

I am trying to use the minizip wrapper from [http://www.zlib.net/] to copy a folder. It works fine as long as the filenames are in English. Has anyone tried to modify minizip to support Unicode?

The modified code is posted below. The problem is that the second argument of this function takes a const char * as input. When I convert the wide-character filename to that type, it loses data and the filenames no longer match.

E.g.: Chinese-統一碼.txt becomes Chinese-t + Ζ’S + Γ‡tÑü.txt inside the zip.

/* Call under discussion: the second argument (the file name stored in the
   archive) is passed as a const char*, here taken from outstr. */
err = zipOpenNewFileInZip3_64(  zf,outstr.c_str(),&zi,
                                        NULL,0,NULL,0,NULL /* comment*/,
                                        (opt_compress_level != 0) ? Z_DEFLATED : 0,
                                        opt_compress_level,0,
                                        /* -MAX_WBITS, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY, */
                                        -MAX_WBITS, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY,
                                        password,crcFile, zip64);








/*
minizip.c
Version 1.1, February 14th, 2010
sample part of the MiniZip project - ( http://www.winimage.com/zLibDll/minizip.html )

Copyright (C) 1998-2010 Gilles Vollant (minizip) ( http://www.winimage.com/zLibDll/minizip.html )

Modifications of Unzip for Zip64
Copyright (C) 2007-2008 Even Rouault

Modifications for Zip64 support on both zip and unzip
Copyright (C) 2009-2010 Mathias Svensson ( http://result42.com )
*/


/* Request 64-bit file offsets on platforms that are neither Windows nor macOS.
   These definitions must stay ahead of the first system #include to have any
   effect on the declarations those headers provide. */
#if (!defined(_WIN32)) && (!defined(WIN32)) && (!defined(__APPLE__))
#ifndef __USE_FILE_OFFSET64
#define __USE_FILE_OFFSET64
#endif
#ifndef __USE_LARGEFILE64
#define __USE_LARGEFILE64
#endif
#ifndef _LARGEFILE64_SOURCE
#define _LARGEFILE64_SOURCE
#endif
/* Historical misspelling, kept so anything that tests this name still sees it. */
#ifndef _FILE_OFFSET_BIT
#define _FILE_OFFSET_BIT 64
#endif
/* The feature-test macro is spelled with a trailing 'S'; without it the
   request for a 64-bit off_t is silently ignored. */
#ifndef _FILE_OFFSET_BITS
#define _FILE_OFFSET_BITS 64
#endif
#endif

/* Choose the stdio routines used to open files and to tell/seek within them. */
#if defined(__APPLE__)
/* On Darwin (and perhaps other BSD variants) off_t is already a 64 bit value,
   so the plain functions are sufficient. */
#define FOPEN_FUNC(path, how) fopen(path, how)
#define FTELLO_FUNC(fp) ftello(fp)
#define FSEEKO_FUNC(fp, distance, whence) fseeko(fp, distance, whence)
#else
/* Everywhere else the explicit 64 bit variants are used. */
#define FOPEN_FUNC(path, how) fopen64(path, how)
#define FTELLO_FUNC(fp) ftello64(fp)
#define FSEEKO_FUNC(fp, distance, whence) fseeko64(fp, distance, whence)
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <errno.h>
#include <fcntl.h>
#ifdef _WIN32
# include <direct.h>
# include <io.h>
#define GetCurrentDir _getcwd
#else
# include <unistd.h>
# include <utime.h>
# include <sys/types.h>
# include <sys/stat.h>
#endif

#include "zip.h"
#include "Shlwapi.h"

#ifdef _WIN32
#define USEWIN32IOAPI
#include "iowin32.h"
#endif

#include <windows.h>
#include <string>
#include <iostream>
#include <list>
#include <fstream>
#include <sstream>
#include <set>
using namespace std;

#define WRITEBUFFERSIZE (16384)
#define MAXFILENAME (256)

#ifdef _WIN32
/*
 * Looks up the last-write time of a file (Windows variant).
 *
 *   f      name of file to get info on
 *   tmzip  not written by this variant
 *   dt     receives the DOS date (high 16 bits) and DOS time (low 16 bits)
 *
 * Returns 1 when *dt was filled in, 0 otherwise (*dt is then left untouched).
 */
uLong filetime(
    wchar_t *f,                /* name of file to get info on */
    tm_zip *tmzip,             /* return value: access, modific. and creation times */
    uLong *dt)           /* dostime */
{
    uLong ret = 0;
    _WIN32_FIND_DATAW ff32;
    HANDLE hFind = FindFirstFileW(f,&ff32);

    (void)tmzip;

    if (hFind != INVALID_HANDLE_VALUE)
    {
        FILETIME ftLocal;
        WORD dosDate = 0;
        WORD dosTime = 0;

        /* Convert into local WORDs and combine them explicitly instead of
           writing through a uLong* reinterpreted as WORD*. */
        if (FileTimeToLocalFileTime(&(ff32.ftLastWriteTime),&ftLocal) &&
            FileTimeToDosDateTime(&ftLocal,&dosDate,&dosTime))
        {
            *dt = (((uLong)dosDate) << 16) | (uLong)dosTime;
            ret = 1;
        }
        FindClose(hFind);
    }
    return ret;
}
#else
#if defined(unix) || defined(__unix__) || defined(__APPLE__)
/*
 * Looks up the modification time of a file (POSIX variant).
 *
 *   f      name of file to get info on; "-" is not stat'ed
 *   tmzip  receives the broken-down local time (of time 0 when stat fails)
 *   dt     not written by this variant
 *
 * Returns 1 when stat() succeeded, 0 otherwise.
 */
uLong filetime(
    char *f,               /* name of file to get info on */
    tm_zip *tmzip,         /* return value: access, modific. and creation times */
    uLong *dt)             /* dostime */
{
    uLong ret = 0;
    struct stat s;        /* results of stat() */
    struct tm* filedate;
    time_t tm_t = 0;

    (void)dt;

    if (strcmp(f,"-")!=0)
    {
        char name[MAXFILENAME+1];
        size_t len = strlen(f);
        if (len > MAXFILENAME)
            len = MAXFILENAME;

        /* Bounded copy with an explicit terminator: every byte up to the
           terminator is initialised, even for over-long names. */
        memcpy(name, f, len);
        name[len] = '\0';

        /* not all systems allow stat'ing a file with / appended */
        if (len > 0 && name[len - 1] == '/')
            name[len - 1] = '\0';

        if (stat(name,&s)==0)
        {
            tm_t = s.st_mtime;
            ret = 1;
        }
    }

    filedate = localtime(&tm_t);
    if (filedate == NULL)
        return 0;          /* time not representable: leave *tmzip untouched */

    tmzip->tm_sec  = filedate->tm_sec;
    tmzip->tm_min  = filedate->tm_min;
    tmzip->tm_hour = filedate->tm_hour;
    tmzip->tm_mday = filedate->tm_mday;
    tmzip->tm_mon  = filedate->tm_mon ;
    tmzip->tm_year = filedate->tm_year;

    return ret;
}
#else
/* Fallback for platforms without a known way to query file times:
   nothing is written and 0 is returned. */
uLong filetime(
    char *f,                /* name of file to get info on */
    tm_zip *tmzip,          /* return value: access, modific. and creation times */
    uLong *dt)              /* dostime */
{
    (void)f;
    (void)tmzip;
    (void)dt;
    return 0;
}
#endif
#endif

/*
 * Appends directory + fileName to fileList unless the file's extension is
 * excluded.
 *
 *   fileList          list that receives the full path
 *   directory         prefix prepended to fileName as-is
 *   excludeFilterSet  extensions (without the dot) that must be skipped
 *   fileName          bare file name
 *
 * The extension is the text after the last '.'. A name without any dot, or
 * one ending in a dot, has no extension and is always added.
 */
void addFileToList(std::list<std::wstring>& fileList, const std::wstring& directory, const std::set<std::wstring>& excludeFilterSet, const std::wstring& fileName )
{
    const std::wstring::size_type dotPos = fileName.find_last_of(L'.');

    /* Without this check npos + 1 wraps to 0 and the whole file name would
       be mistaken for the extension. */
    if (dotPos != std::wstring::npos)
    {
        const std::wstring fileExtension = fileName.substr(dotPos + 1);
        if (!fileExtension.empty() &&
            excludeFilterSet.find(fileExtension) != excludeFilterSet.end())
        {
            return;
        }
    }

    fileList.push_back(directory + fileName);
}

void GetFileListing(list<wstring>& fileList, wstring directory,const set<wstring>& excludeFilterSet,bool recursively=true)
{
    directory = directory + L"\\";
    wstring filter = directory + L"*";

    _WIN32_FIND_DATAW FindFileData;
    HANDLE hFind = FindFirstFileW(filter.c_str(), &FindFileData);

    if (hFind == INVALID_HANDLE_VALUE)
    {
        DWORD dwError = GetLastError();
        if (dwError != ERROR_FILE_NOT_FOUND)
        {
            //cout << "Invalid file handle for filter " << filter << ". Error is " << GetLastError() << endl;
        }
        return;
    }

    do
    {
        if (FindFileData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) 
        {
            if ((recursively) && (wcscmp(FindFileData.cFileName, L".") != 0) && (wcscmp(FindFileData.cFileName, L"..") != 0))
            {
                GetFileListing(fileList, directory + FindFileData.cFileName, excludeFilterSet);
            }
        } 
        else
        {
            addFileToList(fileList, directory, excludeFilterSet,FindFileData.cFileName);
        }
    } while (FindNextFileW(hFind, &FindFileData) != 0);

    DWORD dwError = GetLastError();
    FindClose(hFind);

    if (dwError != ERROR_NO_MORE_FILES)
    {
        //cout << "FindNextFile error. Error is "<< dwError << endl;
    }
}


/* Returns 1 when `filename` can be opened for binary reading, 0 otherwise. */
int check_exist_file(wchar_t* filename)
{
    FILE* probe = _wfopen(filename,L"rb");
    if (probe == NULL)
        return 0;

    fclose(probe);
    return 1;
}

/*
 * Calculates the CRC32 of a file. To encrypt a file, its CRC32 has to be
 * known beforehand.
 *
 *   filenameinzip  path of the file to read
 *   buf            scratch buffer used for reading
 *   size_buf       size of buf in bytes
 *   result_crc     receives the CRC (0 when nothing could be read)
 *
 * Returns ZIP_OK on success, or ZIP_ERRNO when the file cannot be opened or
 * a read fails. Progress and error messages are written to stdout.
 */
int getFileCrc(const wchar_t * filenameinzip,void*buf,unsigned long size_buf,unsigned long* result_crc)
{
    unsigned long calculate_crc=0;
    int err=ZIP_OK;
    FILE * fin = _wfopen(filenameinzip,L"rb");

    if (fin==NULL)
    {
        err = ZIP_ERRNO;
    }
    else
    {
        unsigned long size_read = 0;
        do
        {
            size_read = (unsigned long)fread(buf,1,size_buf,fin);

            /* A short read that is not end-of-file is a read error. */
            if ((size_read < size_buf) && (feof(fin)==0))
            {
                /* %ls: the name is a wide string, %s would misread it */
                printf("error in reading %ls\n",filenameinzip);
                err = ZIP_ERRNO;
            }

            if (size_read>0)
                calculate_crc = crc32(calculate_crc,(const Bytef *)buf,size_read);

        } while ((err == ZIP_OK) && (size_read>0));

        fclose(fin);
    }

    *result_crc=calculate_crc;
    printf("file %ls crc %lx\n", filenameinzip, calculate_crc);
    return err;
}

int isLargeFile(const wchar_t * filename)
{
    int largeFile = 0;
    ZPOS64_T pos = 0;
    //FILE* pFile = FOPEN_FUNC(filename, "rb");
    FILE* pFile = _wfopen(filename, L"rb");

    if(pFile != NULL)
    {
        int n = FSEEKO_FUNC(pFile, 0, SEEK_END);
        pos = FTELLO_FUNC(pFile);

        printf("File : %s is %lld bytes\n", filename, pos);

        if(pos >= 0xffffffff)
            largeFile = 1;

        fclose(pFile);
    }

    return largeFile;
}

/*
 * Splits `text` at every `delimiter` and inserts each piece into `result`
 * after dropping the piece's first two characters.
 *
 * The piece following the last delimiter (or the whole text when there is no
 * delimiter) is inserted too, so at least one element is always inserted,
 * possibly the empty string.
 */
void split( const wstring& text, wchar_t delimiter,set<wstring>& result )
{
    wstring::size_type pieceBegin = 0;

    for (;;)
    {
        const wstring::size_type pieceEnd = text.find( delimiter, pieceBegin );
        const bool isLastPiece = ( pieceEnd == wstring::npos );

        wstring piece = isLastPiece
            ? text.substr( pieceBegin )
            : text.substr( pieceBegin, pieceEnd - pieceBegin );

        /* Drop the two leading characters; shorter pieces become empty. */
        piece.erase( 0, 2 );
        result.insert( piece );

        if ( isLastPiece )
            break;

        pieceBegin = pieceEnd + 1;
    }
}

/*
 * Reads one Unicode code point from `cursor` and advances `cursor` past it.
 *
 * Handles both wide-character layouts: a UTF-16 surrogate pair is combined
 * into a single code point, and a value above U+FFFF is taken as is. Unpaired
 * surrogates and values outside the Unicode range yield U+FFFD (replacement
 * character). The caller must ensure *cursor is not the terminator.
 */
static unsigned long wcharNextCodePoint(const wchar_t *&cursor)
{
    const unsigned long replacement = 0xFFFDul;
    const unsigned long unit = static_cast<unsigned long>(*cursor++);

    if (unit >= 0xD800ul && unit <= 0xDBFFul)
    {
        /* High surrogate: only valid when a low surrogate follows. */
        const unsigned long low = static_cast<unsigned long>(*cursor);
        if (low >= 0xDC00ul && low <= 0xDFFFul)
        {
            ++cursor;
            return 0x10000ul + ((unit - 0xD800ul) << 10) + (low - 0xDC00ul);
        }
        return replacement;
    }

    if ((unit >= 0xDC00ul && unit <= 0xDFFFul) || unit > 0x10FFFFul)
        return replacement;

    return unit;
}

/*
 * Internal helper for WChar_to_UTF8: returns the number of bytes the UTF-8
 * encoding of `string` occupies, not counting the terminator.
 * Returns 0 for a null pointer.
 */
long getUTF8size(const wchar_t *string){
    if (!string)
        return 0;
    long res=0;
    while (*string){
        const unsigned long cp=wcharNextCodePoint(string);
        if (cp<0x80)
            res++;
        else if (cp<0x800)
            res+=2;
        else if (cp<0x10000ul)
            res+=3;
        else
            res+=4;
    }
    return res;
}

/*
 * Converts a null-terminated wide string to UTF-8.
 *
 * Returns a buffer allocated with new[] that holds the null-terminated UTF-8
 * text; the caller owns it and must release it with delete[]. A null input
 * yields an empty string.
 */
char *WChar_to_UTF8(const wchar_t *string){
    const long fSize=getUTF8size(string);
    char *res=new char[fSize+1];
    long b=0;
    if (string){
        while (*string){
            const unsigned long cp=wcharNextCodePoint(string);
            if (cp<0x80){
                res[b++]=static_cast<char>(cp);
            }else if (cp<0x800){
                res[b++]=static_cast<char>((cp>>6)|0xC0);
                res[b++]=static_cast<char>((cp&0x3F)|0x80);
            }else if (cp<0x10000ul){
                res[b++]=static_cast<char>((cp>>12)|0xE0);
                res[b++]=static_cast<char>(((cp>>6)&0x3F)|0x80);
                res[b++]=static_cast<char>((cp&0x3F)|0x80);
            }else{
                /* Code points beyond the BMP need four bytes. */
                res[b++]=static_cast<char>((cp>>18)|0xF0);
                res[b++]=static_cast<char>(((cp>>12)&0x3F)|0x80);
                res[b++]=static_cast<char>(((cp>>6)&0x3F)|0x80);
                res[b++]=static_cast<char>((cp&0x3F)|0x80);
            }
        }
    }
    res[b]=0;
    return res;
}


/*
 * Converts a wide string to UTF-8 using WideCharToMultiByte.
 *
 * Returns the UTF-8 bytes, or an empty string when the input is empty or
 * the conversion fails.
 */
std::string utf8_encode(const std::wstring &wstr)
{
    /* WideCharToMultiByte treats a zero input length as an error, so the
       empty case is answered directly. */
    if (wstr.empty())
        return std::string();

    const int wide_len = (int)wstr.size();

    /* First call: query the number of bytes required. */
    const int size_needed = WideCharToMultiByte(CP_UTF8, 0, wstr.data(), wide_len, NULL, 0, NULL, NULL);
    if (size_needed <= 0)
        return std::string();

    /* Second call: convert into a buffer of exactly that size. */
    std::string strTo( size_needed, 0 );
    const int written = WideCharToMultiByte(CP_UTF8, 0, wstr.data(), wide_len, &strTo[0], size_needed, NULL, NULL);
    if (written <= 0)
        return std::string();      /* do not hand back a zero-filled buffer */

    strTo.resize(written);
    return strTo;
}


wstring zipper( const wstring& directoryPath, const wstring& strExcludeFilter, wstring & zipFileName )
{
    int opt_overwrite=0,opt_compress_level=Z_BEST_COMPRESSION,opt_exclude_path=0,err=0,size_buf=0;
    void* buf=NULL;
    const char* password=NULL;
    list<wstring> fileList;
    DWORD dwRet;

    wchar_t cCurrentPath[MAX_PATH];
    dwRet = GetCurrentDirectoryW(MAX_PATH, cCurrentPath);
    if( dwRet == 0 )
    {
        return wstring();
    }

    // Change the directory to the current folder
    _wchdir(directoryPath.c_str());
    set<wstring> excludeFilterSet;
    split(strExcludeFilter,'|',excludeFilterSet);

    GetFileListing(fileList, directoryPath,excludeFilterSet);
    opt_overwrite = 1;

    size_buf = WRITEBUFFERSIZE;
    buf = (void*)malloc(size_buf);
    if (buf==NULL) return wstring();

    wchar_t tempDirPath[MAX_PATH];
    dwRet = GetTempPathW (MAX_PATH, tempDirPath);
    if( dwRet == 0 ) return wstring();

    wstring directoryName,zipFilePath;
    _WIN32_FIND_DATAW FindFileData;
    HANDLE hFind = FindFirstFileW(directoryPath.c_str(), &FindFileData);
    if (FindFileData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) 
    {
        directoryName = FindFileData.cFileName;
    }

    zipFilePath = wstring(tempDirPath)+L"\\"+directoryName+L".zip";
    zipFile zf;
    int errclose;

    #ifdef USEWIN32IOAPI
        zlib_filefunc64_def ffunc;
        fill_win32_filefunc64W (&ffunc);
        zf = zipOpen2_64(zipFilePath.c_str(),(opt_overwrite==2) ? 2 : 0,NULL,&ffunc);
    #   else
        zf = zipOpen64(zipFilePath.c_str(),(opt_overwrite==2) ? 2 : 0);
    #   endif

    if (zf == NULL)
    {
        //printf("error opening %s\n",filename_try);
        err= ZIP_ERRNO;
    }
    else
    {
        //printf("creating %s\n",filename_try);
    }

    for(list<wstring>::iterator it = fileList.begin() ; it!=fileList.end();++it) 
    {
        FILE * fin;
        int size_read;
        //const char* filenameinzip = (*it).c_str();
        wstring filenameinzip = (*it).c_str();
        wchar_t szOut[MAX_PATH];

        PathRelativePathToW(szOut,
                            directoryPath.c_str(),
                            FILE_ATTRIBUTE_DIRECTORY,
                            filenameinzip.c_str(),
                            FILE_ATTRIBUTE_NORMAL);

        wchar_t *savefilenameinzip;
        zip_fileinfo zi;
        unsigned long crcFile=0;
        int zip64 = 0;

        zi.tmz_date.tm_sec = zi.tmz_date.tm_min = zi.tmz_date.tm_hour =
            zi.tmz_date.tm_mday = zi.tmz_date.tm_mon = zi.tmz_date.tm_year = 0;
        zi.dosDate = 0;
        zi.internal_fa = 0;
    zi.external_fa = 0;
        filetime(szOut,&zi.tmz_date,&zi.dosDate);

        if ((password != NULL) && (err==ZIP_OK))
            err = getFileCrc(szOut,buf,size_buf,&crcFile);

        zip64 = isLargeFile(szOut);

        /* The path name saved, should not include a leading slash. */
        /*if it did, windows/xp and dynazip couldn't read the zip file. */
        savefilenameinzip = szOut;
        while( savefilenameinzip[0] == '\\' || savefilenameinzip[0] == '/' )
        {
            savefilenameinzip++;
        }

        string outstr = utf8_encode(savefilenameinzip);
        //char * op = (char*)outstr.c_str();


        err = zipOpenNewFileInZip3_64(  zf,outstr.c_str(),&zi,
                                        NULL,0,NULL,0,NULL /* comment*/,
                                        (opt_compress_level != 0) ? Z_DEFLATED : 0,
                                        opt_compress_level,0,
                                        /* -MAX_WBITS, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY, */
                                        -MAX_WBITS, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY,
                                        password,crcFile, zip64);

        if (err != ZIP_OK)
        {
            //printf("error in opening %s in zipfile\n",szOut);
        }
        else
        {
            //fin = FOPEN_FUNC(szOut,"rb");
            fin = _wfopen(szOut,L"rb");

            if (fin==NULL)
            {
                err=ZIP_ERRNO;
                //printf("error in opening %s for reading\n",szOut);
            }
        }

        if (err == ZIP_OK)
            do
            {
                err = ZIP_OK;
                size_read = (int)fread(buf,1,size_buf,fin);
                if (size_read < size_buf)
                    if (feof(fin)==0)
                    {
                        //printf("error in reading %s\n",szOut);
                        err = ZIP_ERRNO;
                    }

                    if (size_read>0)
                    {
                        err = zipWriteInFileInZip (zf,buf,size_read);
                        if (err<0)
                        {
                            //printf("error in writing %s in the zipfile\n",szOut);
                        }

                    }
            } while ((err == ZIP_OK) && (size_read>0));

            if (fin)
                fclose(fin);

            if (err<0)
                err=ZIP_ERRNO;
            else
            {
                err = zipCloseFileInZip(zf);
                if (err!=ZIP_OK)
                {
                    //printf("error in closing %s in the zipfile\n",szOut);
                }
            }
    }

    errclose = zipClose(zf,NULL);
    if (errclose != ZIP_OK)
    {
        //printf("error in closing %s\n",filename_try);
    }

    free(buf);
    // Change back the executabe context
    _wchdir(cCurrentPath);
    return zipFilePath;
}

      

+3


source to share


2 answers


The official way of storing UTF-8 filenames in a ZIP file, as per the standard, is to set bit 11 of the "general purpose bit flag". Looking at the minizip sources, it seems to me that minizip will never set this bit for you, and that zipOpenNewFileInZip3_64 offers no way to pass this bit. However, there is zipOpenNewFileInZip4_64, which takes two more arguments, versionMadeBy and flagBase. This way you can store UTF-8 filenames according to the standard by changing your call to

err = zipOpenNewFileInZip4_64(zf, outstr.c_str(), […], crcFile, 36, 1<<11, zip64);

      

This assumes that outstr does indeed contain a valid UTF-8 encoding of your filename, which the source code suggests but which I haven't tested. I suggest you print the hexadecimal byte values of outstr to verify this. Unless I garbled the string in the process, your "統一碼.txt" should become e7 b5 b1 e4 b8 80 e7 a2 bc 2e 74 78 74 in UTF-8 hexadecimal.



For more on the versionMadeBy field (which I set to 36 in my call), see section 4.4.2 of the standard. It depends on which platform you are using, on the format of the file attributes passed in the zipfi argument (&zi in your case), and on which version of the standard you conform to. Since you are using Unicode filenames, I would say you are conforming to version 6.3.* of the standard, so the least significant byte should be 36. And since the minizip.c wrapper doesn't store any file attributes at all, you don't need to specify the platform there. These lines show the absence of attributes:

    zi.internal_fa = 0;
    zi.external_fa = 0;

      

Note that even though the standard provides a way to denote filenames in Unicode, this part was only added in 2006, and there may be many ZIP applications that do not support it. Therefore, even if your archive is correct, your unzip utility may still unzip that file incorrectly, interpreting the UTF-8 bytes as code page 437 or Latin 1 or similar.

+7


source


Changing the program to handle UTF-8 correctly is not trivial; take a look at the Unicode FAQ. A program that handles possibly malicious data (as compression/decompression tools do) should be extra careful. Worth doing, but definitely not trivial.



0


source







All Articles