diff --git a/backport-zstd-1.5.0-patch-4-limit-train-samples.patch b/backport-zstd-1.5.0-patch-4-limit-train-samples.patch new file mode 100644 index 0000000000000000000000000000000000000000..4d0cc51b2a7833212032c510949463a27bbb05b5 --- /dev/null +++ b/backport-zstd-1.5.0-patch-4-limit-train-samples.patch @@ -0,0 +1,399 @@ +diff -Nur zstd-1.5.0/lib/dictBuilder/cover.c new-zstd/lib/dictBuilder/cover.c +--- zstd-1.5.0/lib/dictBuilder/cover.c 2021-05-14 22:59:34.000000000 +0800 ++++ new-zstd/lib/dictBuilder/cover.c 2021-11-16 09:49:50.933861667 +0800 +@@ -40,6 +40,14 @@ + /*-************************************* + * Constants + ***************************************/ ++ ++/** ++ * There are 32bit indexes used to ref samples, so limit samples size to 4GB ++ * on 64bit builds ++ * For 32bit builds we choose 1 GB ++ * Most 32bit platforms have 2 GB user-mode addressable space and we allocate a large ++ * contigous buffer, so 1GB is already a high limit ++ */ + #define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((unsigned)-1) : ((unsigned)1 GB)) + #define COVER_DEFAULT_SPLITPOINT 1.0 + +diff -Nur zstd-1.5.0/lib/dictBuilder/fastcover.c new-zstd/lib/dictBuilder/fastcover.c +--- zstd-1.5.0/lib/dictBuilder/fastcover.c 2021-05-14 22:59:34.000000000 +0800 ++++ new-zstd/lib/dictBuilder/fastcover.c 2021-11-16 09:52:36.621087274 +0800 +@@ -32,6 +32,14 @@ + /*-************************************* + * Constants + ***************************************/ ++/** ++ * There are 32bit indexes used to ref samples, so limit samples size to 4GB ++ * on 64bit builds ++ * For 32bit builds we choose 1 GB ++ * Most 32bit platforms have 2 GB user-mode addressable space and we allocate a large ++ * contigous buffer, so 1GB is already a high limit ++ */ ++ + #define FASTCOVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? 
((unsigned)-1) : ((unsigned)1 GB)) + #define FASTCOVER_MAX_F 31 + #define FASTCOVER_MAX_ACCEL 10 +diff -Nur zstd-1.5.0/programs/dibio.c new-zstd/programs/dibio.c +--- zstd-1.5.0/programs/dibio.c 2021-05-14 22:59:34.000000000 +0800 ++++ new-zstd/programs/dibio.c 2021-11-16 15:04:46.351257422 +0800 +@@ -49,7 +49,7 @@ + static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t)); + + #define NOISELENGTH 32 +- ++#define MAX_SAMPLES_SIZE (2 GB) /*training dadaset limited to 2GB*/ + + /*-************************************* + * Console display +@@ -88,6 +88,15 @@ + #undef MIN + #define MIN(a,b) ((a) < (b) ? (a) : (b)) + ++/** ++ Returns the size of a file. ++ If error returns -1. ++*/ ++static S64 DiB_getFileSize (const char * fileName) ++{ ++ U64 const fileSize = UTIL_getFileSize(fileName); ++ return (fileSize == UTIL_FILESIZE_UNKNOWN) ? -1 : (S64)fileSize; ++} + + /* ******************************************************** + * File related operations +@@ -101,47 +110,66 @@ + * *bufferSizePtr is modified, it provides the amount data loaded within buffer. + * sampleSizes is filled with the size of each sample. 
+ */ +-static unsigned DiB_loadFiles(void* buffer, size_t* bufferSizePtr, +- size_t* sampleSizes, unsigned sstSize, +- const char** fileNamesTable, unsigned nbFiles, size_t targetChunkSize, +- unsigned displayLevel) ++static int DiB_loadFiles( ++ void* buffer, size_t* bufferSizePtr, ++ size_t* sampleSizes, int sstSize, ++ const char** fileNamesTable, int nbFiles, ++ size_t targetChunkSize, int displayLevel) + { + char* const buff = (char*)buffer; +- size_t pos = 0; +- unsigned nbLoadedChunks = 0, fileIndex; +- +- for (fileIndex=0; fileIndex *bufferSizePtr-pos) break; +- { size_t const readSize = fread(buff+pos, 1, toLoad, f); +- if (readSize != toLoad) EXM_THROW(11, "Pb reading %s", fileName); +- pos += readSize; +- sampleSizes[nbLoadedChunks++] = toLoad; +- remainingToLoad -= targetChunkSize; +- if (nbLoadedChunks == sstSize) { /* no more space left in sampleSizes table */ +- fileIndex = nbFiles; /* stop there */ +- break; +- } +- if (toLoad < targetChunkSize) { +- fseek(f, (long)(targetChunkSize - toLoad), SEEK_CUR); +- } } } +- fclose(f); ++ size_t totalDataLoaded = 0; ++ int nbSamplesLoaded = 0; ++ int fileIndex = 0; ++ FILE * f = NULL; ++ assert(targetChunkSize <= SAMPLESIZE_MAX); ++ ++ while ( nbSamplesLoaded < sstSize && fileIndex < nbFiles ) { ++ size_t fileDataLoaded; ++ S64 const fileSize = DiB_getFileSize(fileNamesTable[fileIndex]); ++ if (fileSize <= 0) ++ continue; ++ ++ f = fopen( fileNamesTable[fileIndex], "rb"); ++ if (f == NULL) ++ EXM_THROW(10, "zstd: dictBuilder: %s %s ",fileNamesTable[fileIndex], strerror(errno)); ++ DISPLAYUPDATE(2, "Loading %s... \r", fileNamesTable[fileIndex]); ++ ++ /* Load the first chunk of data from the file */ ++ fileDataLoaded = targetChunkSize > 0 ? 
++ (size_t)MIN(fileSize, (S64)targetChunkSize) : ++ (size_t)MIN(fileSize, SAMPLESIZE_MAX); ++ if (totalDataLoaded + fileDataLoaded > *bufferSizePtr) ++ break; ++ if (fread( buff+totalDataLoaded, 1, fileDataLoaded, f) != fileDataLoaded) ++ EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]); ++ sampleSizes[nbSamplesLoaded++] = fileDataLoaded; ++ totalDataLoaded += fileDataLoaded; ++ ++ /* If file-chunking is enabled, load the rest if the file as more samples */ ++ if (targetChunkSize > 0) { ++ while( (S64)fileDataLoaded < fileSize && nbSamplesLoaded < sstSize ) { ++ size_t const chunkSize = MIN((size_t)(fileSize-fileDataLoaded), targetChunkSize); ++ if (totalDataLoaded + chunkSize > *bufferSizePtr) /* buffer is full */ ++ break; ++ if (fread( buff+totalDataLoaded, 1, chunkSize, f) != chunkSize) ++ EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]); ++ sampleSizes[nbSamplesLoaded++] = chunkSize; ++ totalDataLoaded += chunkSize; ++ fileDataLoaded += chunkSize; ++ } ++ } ++ fileIndex += 1; ++ fclose(f); ++ f = NULL; + } ++ if (f != NULL) ++ fclose(f); ++ + DISPLAYLEVEL(2, "\r%79s\r", ""); +- *bufferSizePtr = pos; +- DISPLAYLEVEL(4, "loaded : %u KB \n", (unsigned)(pos >> 10)) +- return nbLoadedChunks; ++ DISPLAYLEVEL(4, "Loaded %d KB total taraining data, %d nb samples \n", ++ (int)(totalDataLoaded / (1 KB)), nbSamplesLoaded); ++ *bufferSizePtr = totalDataLoaded; ++ return nbSamplesLoaded; + } + + #define DiB_rotl32(x,r) ((x << r) | (x >> (32 - r))) +@@ -225,9 +253,9 @@ + + + typedef struct { +- U64 totalSizeToLoad; +- unsigned oneSampleTooLarge; +- unsigned nbSamples; ++ S64 totalSizeToLoad; ++ int nbSamples; ++ int oneSampleTooLarge; + } fileStats; + + /*! DiB_fileStats() : +@@ -235,45 +263,86 @@ + * provides the amount of data to be loaded and the resulting nb of samples. + * This is useful primarily for allocation purpose => sample buffer, and sample sizes table. 
+ */ +-static fileStats DiB_fileStats(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, unsigned displayLevel) ++static fileStats DiB_fileStats(const char** fileNamesTable, int nbFiles, size_t chunkSize, int displayLevel) + { + fileStats fs; +- unsigned n; ++ int n; + memset(&fs, 0, sizeof(fs)); ++ ++ // We addume that if chunking is requsted, the chunk size is < SAMPLESIZE_MAX ++ assert( chunkSize <= SAMPLESIZE_MAX ); ++ + for (n=0; n 2*SAMPLESIZE_MAX); +- fs.nbSamples += nbSamples; ++ S64 const fileSize = DiB_getFileSize(fileNamesTable[n]); ++ //TODO: is there a minimum sample size? what if the file is 1-byte? ++ if (fileSize == 0) { ++ DISPLAYLEVEL(3, "Sample file '%s' has zero size, skipping...\n", fileNamesTable[n]); ++ continue; ++ } ++ ++ /* the case where we are breaking up files in sample chunks */ ++ if (chunkSize > 0) ++ { ++ // TODO: is there a minmum sample size? can we have a 1-byte sample? ++ fs.nbSamples += (int)((fileSize + chunkSize -1) / chunkSize); ++ fs.totalSizeToLoad += fileSize; ++ } ++ else { ++ /* the case where one file is one sample */ ++ if (fileSize > SAMPLESIZE_MAX) { ++ /* falg excessively large sample files */ ++ fs.oneSampleTooLarge |= (fileSize > 2*SAMPLESIZE_MAX); ++ ++ /* Limt to the first SAMPLESIZE_MAX (128KB) of the file */ ++ DISPLAYLEVEL(3, "Sample file '%s' is too large, limiting to %d KB", ++ fileNamesTable[n], SAMPLESIZE_MAX / (1 KB)); ++ } ++ fs.nbSamples += 1; ++ fs.totalSizeToLoad += MIN(fileSize, SAMPLESIZE_MAX); ++ } + } +- DISPLAYLEVEL(4, "Preparing to load : %u KB \n", (unsigned)(fs.totalSizeToLoad >> 10)); ++ DISPLAYLEVEL(4, "Found training data %d files, %d samples\n", nbFiles, (int)(fs.totalSizeToLoad / (1 KB)), fs.nbSamples); + return fs; + } + + +-int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize, +- const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, ++int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize, ++ const char** fileNamesTable, int 
nbFiles, size_t chunkSize, + ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams, + ZDICT_fastCover_params_t* fastCoverParams, int optimize) + { +- unsigned const displayLevel = params ? params->zParams.notificationLevel : +- coverParams ? coverParams->zParams.notificationLevel : +- fastCoverParams ? fastCoverParams->zParams.notificationLevel : +- 0; /* should never happen */ ++ fileStats fs; ++ size_t* sampleSizes; ++ int nbSamplesLoaded; ++ size_t loadedSize; ++ void* srcBuffer; + void* const dictBuffer = malloc(maxDictSize); +- fileStats const fs = DiB_fileStats(fileNamesTable, nbFiles, chunkSize, displayLevel); +- size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t)); +- size_t const memMult = params ? MEMMULT : +- coverParams ? COVER_MEMMULT: +- FASTCOVER_MEMMULT; +- size_t const maxMem = DiB_findMaxMem(fs.totalSizeToLoad * memMult) / memMult; +- size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad); +- void* const srcBuffer = malloc(loadedSize+NOISELENGTH); + int result = 0; ++ ++ int const displayLevel = params ? params->zParams.notificationLevel : ++ coverParams ? coverParams->zParams.notificationLevel : ++ fastCoverParams ? fastCoverParams->zParams.notificationLevel : 0; ++ /* Shuffle input files before we start assessing hao much sample data to load. ++ The purpose of the shuffle is to pick random samples when the sample ++ set is large than what we can load in memory*/ ++ DISPLAYLEVEL(3, "shuffling input files\n"); ++ DiB_shuffle(fileNamesTable, nbFiles); ++ ++ /* Figure out how much samples data to load with how samples*/ ++ fs = DiB_fileStats(fileNamesTable, nbFiles, chunkSize, displayLevel); ++ ++ { ++ int const memMult = params ? MEMMULT : ++ coverParams ? 
COVER_MEMMULT: ++ FASTCOVER_MEMMULT; ++ size_t const maxMem = DiB_findMaxMem(fs.totalSizeToLoad * memMult) / memMult; ++ /* Limit the Size of the training data to the free memory */ ++ /* Limit the Size of the training data to the 2GB */ ++ /* TODO: there is oportunity to stop DiB_fileStats() early when the data limit is reached */ ++ loadedSize = (size_t)MIN( MIN((S64)maxMem, fs.totalSizeToLoad), MAX_SAMPLES_SIZE ); ++ srcBuffer = malloc(loadedSize+NOISELENGTH); ++ sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t)); ++ } + + /* Checks */ + if ((!sampleSizes) || (!srcBuffer) || (!dictBuffer)) +@@ -289,31 +358,31 @@ + DISPLAYLEVEL(2, "! Alternatively, split files into fixed-size blocks representative of samples, with -B# \n"); + EXM_THROW(14, "nb of samples too low"); /* we now clearly forbid this case */ + } +- if (fs.totalSizeToLoad < (unsigned long long)maxDictSize * 8) { ++ if (fs.totalSizeToLoad < (S64)maxDictSize * 8) { + DISPLAYLEVEL(2, "! Warning : data size of samples too small for target dictionary size \n"); + DISPLAYLEVEL(2, "! 
Samples should be about 100x larger than target dictionary size \n"); + } + + /* init */ +- if (loadedSize < fs.totalSizeToLoad) +- DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(loadedSize >> 20)); ++ if ((S64)loadedSize < fs.totalSizeToLoad) ++ DISPLAYLEVEL(1, "Training samples set too large (%u MB); training on %u MB only...\n", ++ (unsigned)(fs.totalSizeToLoad / (1 MB)), ++ (unsigned)(loadedSize / (1 MB))); + + /* Load input buffer */ +- DISPLAYLEVEL(3, "Shuffling input files\n"); +- DiB_shuffle(fileNamesTable, nbFiles); +- +- DiB_loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable, nbFiles, chunkSize, displayLevel); +- ++ nbSamplesLoaded = DiB_loadFiles( ++ srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, ++ fileNamesTable, nbFiles, chunkSize, displayLevel); + { size_t dictSize; + if (params) { + DiB_fillNoise((char*)srcBuffer + loadedSize, NOISELENGTH); /* guard band, for end of buffer condition */ + dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, maxDictSize, +- srcBuffer, sampleSizes, fs.nbSamples, ++ srcBuffer, sampleSizes, nbSamplesLoaded, + *params); + } else if (coverParams) { + if (optimize) { + dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize, +- srcBuffer, sampleSizes, fs.nbSamples, ++ srcBuffer, sampleSizes, nbSamplesLoaded, + coverParams); + if (!ZDICT_isError(dictSize)) { + unsigned splitPercentage = (unsigned)(coverParams->splitPoint * 100); +@@ -322,13 +391,13 @@ + } + } else { + dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer, +- sampleSizes, fs.nbSamples, *coverParams); ++ sampleSizes, nbSamplesLoaded, *coverParams); + } + } else { + assert(fastCoverParams != NULL); + if (optimize) { + dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize, +- srcBuffer, sampleSizes, fs.nbSamples, ++ srcBuffer, sampleSizes, nbSamplesLoaded, + fastCoverParams); + if (!ZDICT_isError(dictSize)) { + unsigned splitPercentage = 
(unsigned)(fastCoverParams->splitPoint * 100); +@@ -338,7 +407,7 @@ + } + } else { + dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, srcBuffer, +- sampleSizes, fs.nbSamples, *fastCoverParams); ++ sampleSizes, nbSamplesLoaded, *fastCoverParams); + } + } + if (ZDICT_isError(dictSize)) { +diff -Nur zstd-1.5.0/programs/dibio.h new-zstd/programs/dibio.h +--- zstd-1.5.0/programs/dibio.h 2021-05-14 22:59:34.000000000 +0800 ++++ new-zstd/programs/dibio.h 2021-11-16 14:27:26.675384927 +0800 +@@ -31,8 +31,8 @@ + `parameters` is optional and can be provided with values set to 0, meaning "default". + @return : 0 == ok. Any other : error. + */ +-int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize, +- const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, ++int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize, ++ const char** fileNamesTable, int nbFiles, size_t chunkSize, + ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams, + ZDICT_fastCover_params_t* fastCoverParams, int optimize); + +diff -Nur zstd-1.5.0/programs/zstdcli.c new-zstd/programs/zstdcli.c +--- zstd-1.5.0/programs/zstdcli.c 2021-05-14 22:59:34.000000000 +0800 ++++ new-zstd/programs/zstdcli.c 2021-11-16 14:32:31.813357256 +0800 +@@ -1253,18 +1253,18 @@ + int const optimize = !coverParams.k || !coverParams.d; + coverParams.nbThreads = (unsigned)nbWorkers; + coverParams.zParams = zParams; +- operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (unsigned)filenames->tableSize, blockSize, NULL, &coverParams, NULL, optimize); ++ operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, NULL, &coverParams, NULL, optimize); + } else if (dict == fastCover) { + int const optimize = !fastCoverParams.k || !fastCoverParams.d; + fastCoverParams.nbThreads = (unsigned)nbWorkers; + fastCoverParams.zParams = zParams; +- operationResult = 
DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (unsigned)filenames->tableSize, blockSize, NULL, NULL, &fastCoverParams, optimize); ++ operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, NULL, NULL, &fastCoverParams, optimize); + } else { + ZDICT_legacy_params_t dictParams; + memset(&dictParams, 0, sizeof(dictParams)); + dictParams.selectivityLevel = dictSelect; + dictParams.zParams = zParams; +- operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (unsigned)filenames->tableSize, blockSize, &dictParams, NULL, NULL, 0); ++ operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, &dictParams, NULL, NULL, 0); + } + #else + (void)dictCLevel; (void)dictSelect; (void)dictID; (void)maxDictSize; /* not used when ZSTD_NODICT set */ diff --git a/zstd.spec b/zstd.spec index 58c9889645a430b6075bc23b186ba87f089e4b54..7abed7fca2621a5d2515444ecba54d436bddf8d1 100644 --- a/zstd.spec +++ b/zstd.spec @@ -2,7 +2,7 @@ Name: zstd Version: 1.5.0 -Release: 5 +Release: 6 Summary: A fast lossless compression algorithm License: BSD and GPLv2 URL: https://github.com/facebook/zstd @@ -11,6 +11,8 @@ Source0: https://github.com/facebook/zstd/archive/v%{version}.tar.gz#/%{ Patch1: backport-zstd-1.5.0-patch-1-set-mtime-on-output-files.patch Patch2: backport-zstd-1.5.0-patch-2-add-tests-set-mtime-on-output-files.patch Patch3: backport-zstd-1.5.0-patch-3-remove-invalid-test.patch +Patch4: backport-zstd-1.5.0-patch-4-limit-train-samples.patch + BuildRequires: gtest-devel gcc-c++ pkg-config Provides: libzstd @@ -88,6 +90,9 @@ install -D -m644 programs/zstd.1 %{buildroot}%{_mandir}/man1/pzstd.1 %{_mandir}/man1/*.1* %changelog +* Tue Nov 16 2021 zhangxiao - 1.5.0.6 +* Limit train samples + * Mon Nov 15 2021 zhangxiao - 1.5.0.5 * make the test in all archtectures