diff --git a/0001-FIX-remove-the-computed-stop_words_-attribute-of-tex.patch b/0001-FIX-remove-the-computed-stop_words_-attribute-of-tex.patch
new file mode 100644
index 0000000000000000000000000000000000000000..cdd536dddf7e0d2af370c1f169f3c78ee30c4dad
--- /dev/null
+++ b/0001-FIX-remove-the-computed-stop_words_-attribute-of-tex.patch
@@ -0,0 +1,235 @@
+From 2694364fefd58fd11945f1ecba620db7a8f7994e Mon Sep 17 00:00:00 2001
+From: Olivier Grisel
+Date: Mon, 22 Apr 2024 15:10:46 +0200
+Subject: [PATCH] FIX remove the computed stop_words_ attribute of text
+ vectorizer (#28823)
+
+---
+ doc/whats_new/v1.3.rst                        | 18 ++++++++
+ sklearn/feature_extraction/tests/test_text.py | 42 -------------------
+ sklearn/feature_extraction/text.py            | 36 +-----------------
+ 3 files changed, 20 insertions(+), 76 deletions(-)
+
+diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst
+index e3c444b3a..ff3df5a47 100644
+--- a/doc/whats_new/v1.3.rst
++++ b/doc/whats_new/v1.3.rst
+@@ -14,6 +14,24 @@ For a short description of the main highlights of the release, please refer to
+ 
+ .. include:: changelog_legend.inc
+ 
++Security
++--------
++
++- |Fix| :class:`feature_extraction.text.CountVectorizer` and
++  :class:`feature_extraction.text.TfidfVectorizer` no longer store discarded
++  tokens from the training set in their `stop_words_` attribute. This attribute
++  would hold too frequent (above `max_df`) but also too rare tokens (below
++  `min_df`). This fixes a potential security issue (data leak) if the discarded
++  rare tokens hold sensitive information from the training set without the
++  model developer's knowledge.
++
++  Note: users of those classes are encouraged to either retrain their pipelines
++  with the new scikit-learn version or to manually clear the `stop_words_`
++  attribute from previously trained instances of those transformers. This
++  attribute was designed only for model inspection purposes and has no impact
++  on the behavior of the transformers.
++  :pr:`28823` by :user:`Olivier Grisel `.
++
+ Changed models
+ --------------
+ 
+diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
+index fc35053b4..75d968f63 100644
+--- a/sklearn/feature_extraction/tests/test_text.py
++++ b/sklearn/feature_extraction/tests/test_text.py
+@@ -748,21 +748,11 @@ def test_feature_names():
+ @pytest.mark.parametrize("Vectorizer", (CountVectorizer, TfidfVectorizer))
+ def test_vectorizer_max_features(Vectorizer):
+     expected_vocabulary = {"burger", "beer", "salad", "pizza"}
+-    expected_stop_words = {
+-        "celeri",
+-        "tomato",
+-        "copyright",
+-        "coke",
+-        "sparkling",
+-        "water",
+-        "the",
+-    }
+ 
+     # test bounded number of extracted features
+     vectorizer = Vectorizer(max_df=0.6, max_features=4)
+     vectorizer.fit(ALL_FOOD_DOCS)
+     assert set(vectorizer.vocabulary_) == expected_vocabulary
+-    assert vectorizer.stop_words_ == expected_stop_words
+ 
+ 
+ def test_count_vectorizer_max_features():
+@@ -797,21 +787,16 @@ def test_vectorizer_max_df():
+     vect.fit(test_data)
+     assert "a" in vect.vocabulary_.keys()
+     assert len(vect.vocabulary_.keys()) == 6
+-    assert len(vect.stop_words_) == 0
+ 
+     vect.max_df = 0.5  # 0.5 * 3 documents -> max_doc_count == 1.5
+     vect.fit(test_data)
+     assert "a" not in vect.vocabulary_.keys()  # {ae} ignored
+     assert len(vect.vocabulary_.keys()) == 4  # {bcdt} remain
+-    assert "a" in vect.stop_words_
+-    assert len(vect.stop_words_) == 2
+ 
+     vect.max_df = 1
+     vect.fit(test_data)
+     assert "a" not in vect.vocabulary_.keys()  # {ae} ignored
+     assert len(vect.vocabulary_.keys()) == 4  # {bcdt} remain
+-    assert "a" in vect.stop_words_
+-    assert len(vect.stop_words_) == 2
+ 
+ 
+ def test_vectorizer_min_df():
+@@ -820,21 +805,16 @@ def test_vectorizer_min_df():
+     vect.fit(test_data)
+     assert "a" in vect.vocabulary_.keys()
+     assert len(vect.vocabulary_.keys()) == 6
+-    assert len(vect.stop_words_) == 0
+ 
+     vect.min_df = 2
+     vect.fit(test_data)
+     assert "c" not in vect.vocabulary_.keys()  # {bcdt} ignored
+     assert len(vect.vocabulary_.keys()) == 2  # {ae} remain
+-    assert "c" in vect.stop_words_
+-    assert len(vect.stop_words_) == 4
+ 
+     vect.min_df = 0.8  # 0.8 * 3 documents -> min_doc_count == 2.4
+     vect.fit(test_data)
+     assert "c" not in vect.vocabulary_.keys()  # {bcdet} ignored
+     assert len(vect.vocabulary_.keys()) == 1  # {a} remains
+-    assert "c" in vect.stop_words_
+-    assert len(vect.stop_words_) == 5
+ 
+ 
+ def test_count_binary_occurrences():
+@@ -1147,28 +1127,6 @@ def test_countvectorizer_vocab_dicts_when_pickling():
+     )
+ 
+ 
+-def test_stop_words_removal():
+-    # Ensure that deleting the stop_words_ attribute doesn't affect transform
+-
+-    fitted_vectorizers = (
+-        TfidfVectorizer().fit(JUNK_FOOD_DOCS),
+-        CountVectorizer(preprocessor=strip_tags).fit(JUNK_FOOD_DOCS),
+-        CountVectorizer(strip_accents=strip_eacute).fit(JUNK_FOOD_DOCS),
+-    )
+-
+-    for vect in fitted_vectorizers:
+-        vect_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
+-
+-        vect.stop_words_ = None
+-        stop_None_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
+-
+-        delattr(vect, "stop_words_")
+-        stop_del_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
+-
+-        assert_array_equal(stop_None_transform, vect_transform)
+-        assert_array_equal(stop_del_transform, vect_transform)
+-
+-
+ def test_pickling_transformer():
+     X = CountVectorizer().fit_transform(JUNK_FOOD_DOCS)
+     orig = TfidfTransformer().fit(X)
+diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
+index 4b4b4396d..fa38a1c1a 100644
+--- a/sklearn/feature_extraction/text.py
++++ b/sklearn/feature_extraction/text.py
+@@ -1075,15 +1075,6 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
+         True if a fixed vocabulary of term to indices mapping
+         is provided by the user.
+ 
+-    stop_words_ : set
+-        Terms that were ignored because they either:
+-
+-          - occurred in too many documents (`max_df`)
+-          - occurred in too few documents (`min_df`)
+-          - were cut off by feature selection (`max_features`).
+-
+-        This is only available if no vocabulary was given.
+-
+     See Also
+     --------
+     HashingVectorizer : Convert a collection of text documents to a
+@@ -1092,12 +1083,6 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
+     TfidfVectorizer : Convert a collection of raw documents to a matrix
+         of TF-IDF features.
+ 
+-    Notes
+-    -----
+-    The ``stop_words_`` attribute can get large and increase the model size
+-    when pickling. This attribute is provided only for introspection and can
+-    be safely removed using delattr or set to None before pickling.
+-
+     Examples
+     --------
+     >>> from sklearn.feature_extraction.text import CountVectorizer
+@@ -1236,19 +1221,17 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
+             mask = new_mask
+ 
+         new_indices = np.cumsum(mask) - 1  # maps old indices to new
+-        removed_terms = set()
+         for term, old_index in list(vocabulary.items()):
+             if mask[old_index]:
+                 vocabulary[term] = new_indices[old_index]
+             else:
+                 del vocabulary[term]
+-                removed_terms.add(term)
+         kept_indices = np.where(mask)[0]
+         if len(kept_indices) == 0:
+             raise ValueError(
+                 "After pruning, no terms remain. Try a lower min_df or a higher max_df."
+             )
+-        return X[:, kept_indices], removed_terms
++        return X[:, kept_indices]
+ 
+     def _count_vocab(self, raw_documents, fixed_vocab):
+         """Create sparse feature matrix, and vocabulary where fixed_vocab=False"""
+@@ -1393,7 +1376,7 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
+                 raise ValueError("max_df corresponds to < documents than min_df")
+             if max_features is not None:
+                 X = self._sort_features(X, vocabulary)
+-            X, self.stop_words_ = self._limit_features(
++            X = self._limit_features(
+                 X, vocabulary, max_doc_count, min_doc_count, max_features
+             )
+             if max_features is None:
+@@ -1920,15 +1903,6 @@ class TfidfVectorizer(CountVectorizer):
+         The inverse document frequency (IDF) vector; only defined
+         if ``use_idf`` is True.
+ 
+-    stop_words_ : set
+-        Terms that were ignored because they either:
+-
+-          - occurred in too many documents (`max_df`)
+-          - occurred in too few documents (`min_df`)
+-          - were cut off by feature selection (`max_features`).
+-
+-        This is only available if no vocabulary was given.
+-
+     See Also
+     --------
+     CountVectorizer : Transforms text into a sparse matrix of n-gram counts.
+@@ -1936,12 +1910,6 @@ class TfidfVectorizer(CountVectorizer):
+     TfidfTransformer : Performs the TF-IDF transformation from a provided
+         matrix of counts.
+ 
+-    Notes
+-    -----
+-    The ``stop_words_`` attribute can get large and increase the model size
+-    when pickling. This attribute is provided only for introspection and can
+-    be safely removed using delattr or set to None before pickling.
+-
+     Examples
+     --------
+     >>> from sklearn.feature_extraction.text import TfidfVectorizer
+-- 
+2.41.1
+
diff --git a/python-scikit-learn.spec b/python-scikit-learn.spec
index 09e4e2d739652a569f6495cb81379821f50886eb..66b4981b8e2816639dd780b4509372319cc80a5e 100644
--- a/python-scikit-learn.spec
+++ b/python-scikit-learn.spec
@@ -7,11 +7,13 @@
 Summary:        Machine learning in Python
 Name:           python-scikit-learn
 Version:        1.3.0
-Release:        6%{?dist}
+Release:        7%{?dist}
 License:        BSD and MIT
 URL:            http://scikit-learn.org/
 Source0:        %{pypi_source}
 
+Patch0001:      0001-FIX-remove-the-computed-stop_words_-attribute-of-tex.patch
+
 BuildRequires:  gcc gcc-c++
 BuildRequires:  python3-devel %{py3_dist setuptools}
 
@@ -36,7 +38,7 @@ As a machine learning module, it provides universal tools for data mining and
 an scientific and engineering field.
 
 %prep
-%autosetup -n %{srcname}-%{version}
+%autosetup -n %{srcname}-%{version} -p1
 
 %build
 %py3_build
@@ -55,7 +57,7 @@ pushd %{buildroot}%{python3_sitearch}
   not test_ledoit_wolf and not test_oas and not test_mcd and \
   not test_mcd_issue1127 and not test_mcd_support_covariance_is_zero and \
   not test_toy_ard_object and not test_estimators and \
-  not test_ard_accuracy_on_easy_problem" \
+  not test_ard_accuracy_on_easy_problem and not test_check_is_fitted" \
   --deselect "sklearn/datasets/tests/test_openml.py::test_fetch_openml_verify_checksum[True-liac-arff]" \
   --deselect "sklearn/datasets/tests/test_openml.py::test_fetch_openml_verify_checksum[False-liac-arff]" \
   --deselect "sklearn/datasets/tests/test_openml.py::test_fetch_openml_verify_checksum[True-pandas]" \
@@ -75,6 +77,10 @@ popd
 %{python3_sitearch}/scikit_learn-*.egg-info
 
 %changelog
+* Thu Sep 19 2024 Miaojun Dong - 1.3.0-7
+- Fix CVE-2024-5206
+- Disable the non-robust test_check_is_fitted test
+
 * Fri Sep 13 2024 Rebuild Robot - 1.3.0-6
 - [Type] other
 - [DESC] Rebuilt for gcc
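
Note on already-deployed models: as the backported changelog entry advises, vectorizers fitted and pickled with an affected scikit-learn build keep the discarded training tokens in their stop_words_ attribute until it is cleared by hand. Below is a minimal cleanup sketch, not part of the patch; the pickle path and variable names are hypothetical. It relies on the documented behavior that stop_words_ is inspection-only, so deleting it does not change transform() output (this was what the removed test_stop_words_removal test verified).

    import pickle

    # Hypothetical path to a vectorizer fitted with an affected scikit-learn build.
    with open("vectorizer.pkl", "rb") as f:
        vect = pickle.load(f)

    # stop_words_ exists only when the vocabulary was learned from data;
    # dropping it removes the leaked too-rare/too-frequent training tokens.
    if hasattr(vect, "stop_words_"):
        del vect.stop_words_

    # Re-serialize the sanitized model; transform() results are unchanged.
    with open("vectorizer.pkl", "wb") as f:
        pickle.dump(vect, f)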