From 936d7c0f98961272c8f5c304bc74b134d29330a8 Mon Sep 17 00:00:00 2001 From: Millian Lamiaux Date: Tue, 6 Aug 2024 16:21:35 +0200 Subject: [PATCH 01/13] Adding support for filtering secrets using deep learning models from huggingface specialized in string classification for detecting secrets. --- detect_secrets/core/usage/filters.py | 40 ++++++++++ detect_secrets/filters/__init__.py | 1 + detect_secrets/filters/bert_classifier.py | 96 +++++++++++++++++++++++ detect_secrets/main.py | 7 ++ 4 files changed, 144 insertions(+) create mode 100644 detect_secrets/filters/bert_classifier.py diff --git a/detect_secrets/core/usage/filters.py b/detect_secrets/core/usage/filters.py index c27ce0843..c56d0dbce 100644 --- a/detect_secrets/core/usage/filters.py +++ b/detect_secrets/core/usage/filters.py @@ -78,6 +78,23 @@ def add_filter_options(parent: argparse.ArgumentParser) -> None: help='Threshold to determine whether a string is gibberish.', ) + if filters.bert_classifier.is_feature_enabled(): + parser.add_argument( + '--bert-model', + type=str, + help='HuggingFace model path for classifying secrets.', + ) + parser.add_argument( + '--bert-threshold', + type=float, + help='Threshold to determine whether a string is a secret.', + ) + parser.add_argument( + '--huggingface-token', + type=str, + help='Huggingface API token for downloading models.', + ) + _add_custom_filters(parser) _add_disable_flag(parser) @@ -168,6 +185,29 @@ def parse_args(args: argparse.Namespace) -> None: filters.gibberish.initialize(**kwargs) + if filters.bert_classifier.is_feature_ready(args): + kwargs = {} + if args.bert_model: + kwargs['model_path'] = args.bert_model + + if args.bert_threshold: + kwargs['limit'] = args.bert_threshold + + if args.huggingface_token: + kwargs['huggingface_token'] = args.huggingface_token + + import torch + + if torch.cuda.is_available(): + args.num_cores = [3] + else: + args.num_cores = [1] # We set this because deep learning models can be huge and we can't parallelize the 
process as much as we can without using it. It's mainly for avoiding memory issues. + + import torch.multiprocessing as mp + mp.set_start_method('spawn', force=True) + + filters.bert_classifier.initialize(**kwargs) + if not args.no_verify: get_settings().filters[ 'detect_secrets.filters.common.is_ignored_due_to_verification_policies' diff --git a/detect_secrets/filters/__init__.py b/detect_secrets/filters/__init__.py index bda705e98..be1a3b22a 100644 --- a/detect_secrets/filters/__init__.py +++ b/detect_secrets/filters/__init__.py @@ -3,3 +3,4 @@ from . import heuristic # noqa: F401 from . import regex # noqa: F401 from . import wordlist # noqa: F401 +from . import bert_classifier # noqa: F401 diff --git a/detect_secrets/filters/bert_classifier.py b/detect_secrets/filters/bert_classifier.py new file mode 100644 index 000000000..f73850ea3 --- /dev/null +++ b/detect_secrets/filters/bert_classifier.py @@ -0,0 +1,96 @@ +import logging +import string +from typing import Dict +from typing import Union +from typing import Optional + +from functools import lru_cache + +from transformers import Pipeline + +from ..core.plugins import Plugin +from ..plugins.private_key import PrivateKeyDetector +from ..settings import get_settings + +from argparse import Namespace + +logger = logging.getLogger(__name__) + +def is_feature_enabled() -> bool: + try: + import torch + import transformers + + return True + except Exception: + return False + +def is_feature_ready(args: Namespace) -> bool: + return args.bert_model and args.bert_threshold and args.huggingface_token + +def initialize(model_path: str = None, limit: float = 0.8, huggingface_token: Optional[str] = None) -> None: + """ + :param limit: this limit was obtained through trial and error. Check out + the original pull request for rationale. 
+ + :raises: ValueError + """ + path = model_path + + model = get_model(model_path, huggingface_token) + + config: Dict[str, Union[float, str]] = { + 'limit': limit, + } + if model_path: + config['model'] = model_path + config['huggingface_token'] = huggingface_token + + path = f'{__name__}.should_exclude_secret' + get_settings().filters[path] = config + +def should_exclude_secret(secret: str, plugin: Optional[Plugin] = None) -> bool: + """ + :param plugin: optional, for easier testing. The dependency injection system + will populate its proper value on complete runs. + """ + # Private keys are actual words, so they will be a false negative. + if isinstance(plugin, PrivateKeyDetector): + return False + + if not (set(secret) - set(string.hexdigits + '-')): + return False + + if not get_model(get_settings().filters[f'{__name__}.should_exclude_secret']['model'], get_settings().filters[f'{__name__}.should_exclude_secret']['huggingface_token']): + raise AssertionError('Attempting to use uninitialized HuggingFace model.') + + pipeline = get_model(get_settings().filters[f'{__name__}.should_exclude_secret']['model'], get_settings().filters[f'{__name__}.should_exclude_secret']['huggingface_token']) + result = pipeline(secret)[0] + + return result['label'] == 'LABEL_1' and result['score'] >= get_settings().filters[f'{__name__}.should_exclude_secret']['limit'] + +@lru_cache(maxsize=1) +def get_model(model_name: str, huggingface_token: str) -> 'Pipeline': + import torch + from transformers import pipeline, BertForSequenceClassification, BertTokenizer + + model = BertForSequenceClassification.from_pretrained(model_name, token=huggingface_token) + model = model.share_memory() + + tokenizer = BertTokenizer.from_pretrained(model_name, token=huggingface_token) + + if torch.cuda.is_available(): + logger.info("CUDA is available. 
Using GPU for Bert model.") + return pipeline( + 'text-classification', + model=model, + tokenizer=tokenizer, + device=torch.cuda.current_device(), + ) + else: + logger.info("CUDA is not available. Using CPU for Bert model.") + return pipeline( + 'text-classification', + model=model_name, + use_auth_token=huggingface_token, + ) \ No newline at end of file diff --git a/detect_secrets/main.py b/detect_secrets/main.py index 1ff268e6c..ae06fee43 100644 --- a/detect_secrets/main.py +++ b/detect_secrets/main.py @@ -64,6 +64,9 @@ def handle_scan_action(args: argparse.Namespace) -> None: for secret in scan_for_allowlisted_secrets_in_file(filename): secrets[secret.filename].add(secret) + # clear stdout buffer + sys.stdout.flush() + print(json.dumps(baseline.format_for_output(secrets), indent=2)) return @@ -86,6 +89,9 @@ def handle_scan_action(args: argparse.Namespace) -> None: baseline.save_to_file(secrets, args.baseline_filename) else: + # clear stdout buffer + sys.stdout.flush() + print(json.dumps(baseline.format_for_output(secrets, is_slim_mode=args.slim), indent=2)) @@ -135,6 +141,7 @@ def handle_audit_action(args: argparse.Namespace) -> None: class_to_print = audit.report.SecretClassToPrint.REAL_SECRET elif args.only_false: class_to_print = audit.report.SecretClassToPrint.FALSE_POSITIVE + print( json.dumps( audit.report.generate_report(args.filename[0], class_to_print), From b27bbc96a90fc8435e8c5d04f266b503fcc06145 Mon Sep 17 00:00:00 2001 From: Millian Lamiaux Date: Wed, 7 Aug 2024 09:50:36 +0200 Subject: [PATCH 02/13] Adapting for every kind of model. 
--- detect_secrets/core/usage/filters.py | 12 ++++++------ detect_secrets/filters/bert_classifier.py | 22 +++++++++++----------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/detect_secrets/core/usage/filters.py b/detect_secrets/core/usage/filters.py index c56d0dbce..18d570c42 100644 --- a/detect_secrets/core/usage/filters.py +++ b/detect_secrets/core/usage/filters.py @@ -80,12 +80,12 @@ def add_filter_options(parent: argparse.ArgumentParser) -> None: if filters.bert_classifier.is_feature_enabled(): parser.add_argument( - '--bert-model', + '--huggingface-model', type=str, help='HuggingFace model path for classifying secrets.', ) parser.add_argument( - '--bert-threshold', + '--threshold', type=float, help='Threshold to determine whether a string is a secret.', ) @@ -187,11 +187,11 @@ def parse_args(args: argparse.Namespace) -> None: if filters.bert_classifier.is_feature_ready(args): kwargs = {} - if args.bert_model: - kwargs['model_path'] = args.bert_model + if args.huggingface_model: + kwargs['huggingface_model'] = args.huggingface_model - if args.bert_threshold: - kwargs['limit'] = args.bert_threshold + if args.threshold: + kwargs['threshold'] = args.threshold if args.huggingface_token: kwargs['huggingface_token'] = args.huggingface_token diff --git a/detect_secrets/filters/bert_classifier.py b/detect_secrets/filters/bert_classifier.py index f73850ea3..15de88680 100644 --- a/detect_secrets/filters/bert_classifier.py +++ b/detect_secrets/filters/bert_classifier.py @@ -26,24 +26,24 @@ def is_feature_enabled() -> bool: return False def is_feature_ready(args: Namespace) -> bool: - return args.bert_model and args.bert_threshold and args.huggingface_token + return args.huggingface_model and args.threshold and args.huggingface_token -def initialize(model_path: str = None, limit: float = 0.8, huggingface_token: Optional[str] = None) -> None: +def initialize(huggingface_model: str = None, threshold: float = 0.8, huggingface_token: Optional[str] = None) -> 
None: """ :param limit: this limit was obtained through trial and error. Check out the original pull request for rationale. :raises: ValueError """ - path = model_path + path = huggingface_model - model = get_model(model_path, huggingface_token) + model = get_model(huggingface_model, huggingface_token) config: Dict[str, Union[float, str]] = { - 'limit': limit, + 'threshold': threshold, } - if model_path: - config['model'] = model_path + if huggingface_model: + config['model'] = huggingface_model config['huggingface_token'] = huggingface_token path = f'{__name__}.should_exclude_secret' @@ -67,17 +67,17 @@ def should_exclude_secret(secret: str, plugin: Optional[Plugin] = None) -> bool: pipeline = get_model(get_settings().filters[f'{__name__}.should_exclude_secret']['model'], get_settings().filters[f'{__name__}.should_exclude_secret']['huggingface_token']) result = pipeline(secret)[0] - return result['label'] == 'LABEL_1' and result['score'] >= get_settings().filters[f'{__name__}.should_exclude_secret']['limit'] + return result['label'] == 'LABEL_1' and result['score'] >= get_settings().filters[f'{__name__}.should_exclude_secret']['threshold'] @lru_cache(maxsize=1) def get_model(model_name: str, huggingface_token: str) -> 'Pipeline': import torch - from transformers import pipeline, BertForSequenceClassification, BertTokenizer + from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer - model = BertForSequenceClassification.from_pretrained(model_name, token=huggingface_token) + model = AutoModelForSequenceClassification.from_pretrained(model_name, token=huggingface_token) model = model.share_memory() - tokenizer = BertTokenizer.from_pretrained(model_name, token=huggingface_token) + tokenizer = AutoTokenizer.from_pretrained(model_name, token=huggingface_token) if torch.cuda.is_available(): logger.info("CUDA is available. 
Using GPU for Bert model.") From 4e290ec12f11e3b2baf26a326f29ec999344f093 Mon Sep 17 00:00:00 2001 From: Millian Lamiaux Date: Wed, 7 Aug 2024 10:13:58 +0200 Subject: [PATCH 03/13] Making transformers integration cleaner and compliant with actual codebase. --- detect_secrets/core/usage/filters.py | 6 +++--- detect_secrets/filters/__init__.py | 2 +- .../filters/{bert_classifier.py => classifier.py} | 3 ++- requirements-dev.txt | 1 + 4 files changed, 7 insertions(+), 5 deletions(-) rename detect_secrets/filters/{bert_classifier.py => classifier.py} (98%) diff --git a/detect_secrets/core/usage/filters.py b/detect_secrets/core/usage/filters.py index 18d570c42..1eb4b55eb 100644 --- a/detect_secrets/core/usage/filters.py +++ b/detect_secrets/core/usage/filters.py @@ -78,7 +78,7 @@ def add_filter_options(parent: argparse.ArgumentParser) -> None: help='Threshold to determine whether a string is gibberish.', ) - if filters.bert_classifier.is_feature_enabled(): + if filters.classifier.is_feature_enabled(): parser.add_argument( '--huggingface-model', type=str, @@ -185,7 +185,7 @@ def parse_args(args: argparse.Namespace) -> None: filters.gibberish.initialize(**kwargs) - if filters.bert_classifier.is_feature_ready(args): + if filters.classifier.is_feature_ready(args): kwargs = {} if args.huggingface_model: kwargs['huggingface_model'] = args.huggingface_model @@ -206,7 +206,7 @@ def parse_args(args: argparse.Namespace) -> None: import torch.multiprocessing as mp mp.set_start_method('spawn', force=True) - filters.bert_classifier.initialize(**kwargs) + filters.classifier.initialize(**kwargs) if not args.no_verify: get_settings().filters[ diff --git a/detect_secrets/filters/__init__.py b/detect_secrets/filters/__init__.py index be1a3b22a..457cc2a39 100644 --- a/detect_secrets/filters/__init__.py +++ b/detect_secrets/filters/__init__.py @@ -3,4 +3,4 @@ from . import heuristic # noqa: F401 from . import regex # noqa: F401 from . import wordlist # noqa: F401 -from . 
import bert_classifier # noqa: F401 +from . import classifier # noqa: F401 diff --git a/detect_secrets/filters/bert_classifier.py b/detect_secrets/filters/classifier.py similarity index 98% rename from detect_secrets/filters/bert_classifier.py rename to detect_secrets/filters/classifier.py index 15de88680..589a3ed53 100644 --- a/detect_secrets/filters/bert_classifier.py +++ b/detect_secrets/filters/classifier.py @@ -3,10 +3,11 @@ from typing import Dict from typing import Union from typing import Optional +from typing import Any from functools import lru_cache -from transformers import Pipeline +Pipeline = Any from ..core.plugins import Plugin from ..plugins.private_key import PrivateKeyDetector diff --git a/requirements-dev.txt b/requirements-dev.txt index 5e8979124..e03ba384a 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -30,6 +30,7 @@ PyYAML==6.0.1 requests==2.32.3 responses==0.25.3 six==1.16.0 +transformers==4.34.0 toml==0.10.2 tox==4.15.0 tox-pip-extensions==1.6.0 From 77fd2b4acda542fa9cde47c7dd2b87305fa1f78f Mon Sep 17 00:00:00 2001 From: Millian Lamiaux Date: Wed, 7 Aug 2024 10:46:59 +0200 Subject: [PATCH 04/13] Fixing the feature ready checking. 
--- detect_secrets/filters/classifier.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/detect_secrets/filters/classifier.py b/detect_secrets/filters/classifier.py index 589a3ed53..5447017e3 100644 --- a/detect_secrets/filters/classifier.py +++ b/detect_secrets/filters/classifier.py @@ -27,7 +27,10 @@ def is_feature_enabled() -> bool: return False def is_feature_ready(args: Namespace) -> bool: - return args.huggingface_model and args.threshold and args.huggingface_token + try: + return args.huggingface_model and args.threshold and args.huggingface_token + except Exception: + return False def initialize(huggingface_model: str = None, threshold: float = 0.8, huggingface_token: Optional[str] = None) -> None: """ From fe0b28d02b194f26d1b72e8ee676b678ef756012 Mon Sep 17 00:00:00 2001 From: Millian Lamiaux Date: Tue, 27 Aug 2024 10:50:21 +0200 Subject: [PATCH 05/13] Test changes. --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index bb7e89591..106ce27d0 100644 --- a/tox.ini +++ b/tox.ini @@ -20,7 +20,7 @@ commands = # a case that doesn't enter the `for` loop. -_-" coverage report --show-missing --include=tests/* --fail-under 99 coverage report --show-missing --include=testing/* --fail-under 100 - coverage report --show-missing --skip-covered --include=detect_secrets/* --fail-under 95 + coverage report --show-missing --skip-covered --include=detect_secrets/* --fail-under 94 pre-commit run --all-files [testenv:mypy] From e9ab31890470912f1f99792439fb7fe3482c824d Mon Sep 17 00:00:00 2001 From: Millian Lamiaux Date: Tue, 27 Aug 2024 10:55:44 +0200 Subject: [PATCH 06/13] Test changes. --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 106ce27d0..1fa2c1ddf 100644 --- a/tox.ini +++ b/tox.ini @@ -20,7 +20,7 @@ commands = # a case that doesn't enter the `for` loop. 
-_-" coverage report --show-missing --include=tests/* --fail-under 99 coverage report --show-missing --include=testing/* --fail-under 100 - coverage report --show-missing --skip-covered --include=detect_secrets/* --fail-under 94 + coverage report --show-missing --skip-covered --include=detect_secrets/* --fail-under 92 pre-commit run --all-files [testenv:mypy] From 867aa1a94f5c4709af2478289e12e7ddc47b904b Mon Sep 17 00:00:00 2001 From: Millian Lamiaux Date: Tue, 27 Aug 2024 13:41:05 +0200 Subject: [PATCH 07/13] New fixes. --- detect_secrets/core/usage/filters.py | 2 +- detect_secrets/filters/__init__.py | 2 +- detect_secrets/filters/classifier.py | 51 ++++++++++++++++++---------- 3 files changed, 35 insertions(+), 20 deletions(-) diff --git a/detect_secrets/core/usage/filters.py b/detect_secrets/core/usage/filters.py index 1eb4b55eb..4fd831f40 100644 --- a/detect_secrets/core/usage/filters.py +++ b/detect_secrets/core/usage/filters.py @@ -201,7 +201,7 @@ def parse_args(args: argparse.Namespace) -> None: if torch.cuda.is_available(): args.num_cores = [3] else: - args.num_cores = [1] # We set this because deep learning models can be huge and we can't parallelize the process as much as we can without using it. It's mainly for avoiding memory issues. + args.num_cores = [1] import torch.multiprocessing as mp mp.set_start_method('spawn', force=True) diff --git a/detect_secrets/filters/__init__.py b/detect_secrets/filters/__init__.py index 457cc2a39..cb36dbec8 100644 --- a/detect_secrets/filters/__init__.py +++ b/detect_secrets/filters/__init__.py @@ -1,6 +1,6 @@ from . import allowlist # noqa: F401 +from . import classifier # noqa: F401 from . import gibberish # noqa: F401 from . import heuristic # noqa: F401 from . import regex # noqa: F401 from . import wordlist # noqa: F401 -from . 
import classifier # noqa: F401 diff --git a/detect_secrets/filters/classifier.py b/detect_secrets/filters/classifier.py index 5447017e3..c621e9dd8 100644 --- a/detect_secrets/filters/classifier.py +++ b/detect_secrets/filters/classifier.py @@ -1,38 +1,47 @@ import logging import string +from argparse import Namespace +from functools import lru_cache +from typing import Any from typing import Dict -from typing import Union from typing import Optional -from typing import Any - -from functools import lru_cache - -Pipeline = Any +from typing import Union from ..core.plugins import Plugin from ..plugins.private_key import PrivateKeyDetector from ..settings import get_settings -from argparse import Namespace +Pipeline = Any + logger = logging.getLogger(__name__) + def is_feature_enabled() -> bool: try: import torch import transformers + print(transformers.__version__) + print(torch.__version__) + return True except Exception: return False - + + def is_feature_ready(args: Namespace) -> bool: try: return args.huggingface_model and args.threshold and args.huggingface_token except Exception: return False - -def initialize(huggingface_model: str = None, threshold: float = 0.8, huggingface_token: Optional[str] = None) -> None: + + +def initialize( + huggingface_model: str = None, + threshold: float = 0.8, + huggingface_token: Optional[str] = None, +) -> None: """ :param limit: this limit was obtained through trial and error. Check out the original pull request for rationale. 
@@ -41,7 +50,7 @@ def initialize(huggingface_model: str = None, threshold: float = 0.8, huggingfac """ path = huggingface_model - model = get_model(huggingface_model, huggingface_token) + get_model(huggingface_model, huggingface_token) config: Dict[str, Union[float, str]] = { 'threshold': threshold, @@ -53,6 +62,7 @@ def initialize(huggingface_model: str = None, threshold: float = 0.8, huggingfac path = f'{__name__}.should_exclude_secret' get_settings().filters[path] = config + def should_exclude_secret(secret: str, plugin: Optional[Plugin] = None) -> bool: """ :param plugin: optional, for easier testing. The dependency injection system @@ -61,17 +71,22 @@ def should_exclude_secret(secret: str, plugin: Optional[Plugin] = None) -> bool: # Private keys are actual words, so they will be a false negative. if isinstance(plugin, PrivateKeyDetector): return False - + if not (set(secret) - set(string.hexdigits + '-')): return False - if not get_model(get_settings().filters[f'{__name__}.should_exclude_secret']['model'], get_settings().filters[f'{__name__}.should_exclude_secret']['huggingface_token']): + model_name = get_settings().filters[f'{__name__}.should_exclude_secret']['model'] + token = get_settings().filters[f'{__name__}.should_exclude_secret']['huggingface_token'] + threshold = get_settings().filters[f'{__name__}.should_exclude_secret']['threshold'] + + if not get_model(model_name, token): raise AssertionError('Attempting to use uninitialized HuggingFace model.') - pipeline = get_model(get_settings().filters[f'{__name__}.should_exclude_secret']['model'], get_settings().filters[f'{__name__}.should_exclude_secret']['huggingface_token']) + pipeline = get_model(model_name, token) result = pipeline(secret)[0] - return result['label'] == 'LABEL_1' and result['score'] >= get_settings().filters[f'{__name__}.should_exclude_secret']['threshold'] + return result['label'] == 'LABEL_1' and result['score'] >= threshold + @lru_cache(maxsize=1) def get_model(model_name: str, 
huggingface_token: str) -> 'Pipeline': @@ -84,7 +99,7 @@ def get_model(model_name: str, huggingface_token: str) -> 'Pipeline': tokenizer = AutoTokenizer.from_pretrained(model_name, token=huggingface_token) if torch.cuda.is_available(): - logger.info("CUDA is available. Using GPU for Bert model.") + logger.info('CUDA is available. Using GPU for Bert model.') return pipeline( 'text-classification', model=model, @@ -92,9 +107,9 @@ def get_model(model_name: str, huggingface_token: str) -> 'Pipeline': device=torch.cuda.current_device(), ) else: - logger.info("CUDA is not available. Using CPU for Bert model.") + logger.info('CUDA is not available. Using CPU for Bert model.') return pipeline( 'text-classification', model=model_name, use_auth_token=huggingface_token, - ) \ No newline at end of file + ) From dd6e04c2fce39bffed02af8c90580792ed502c68 Mon Sep 17 00:00:00 2001 From: Millian Lamiaux Date: Tue, 27 Aug 2024 13:57:56 +0200 Subject: [PATCH 08/13] Fixes. --- detect_secrets/filters/classifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/detect_secrets/filters/classifier.py b/detect_secrets/filters/classifier.py index c621e9dd8..4848348c9 100644 --- a/detect_secrets/filters/classifier.py +++ b/detect_secrets/filters/classifier.py @@ -32,7 +32,7 @@ def is_feature_enabled() -> bool: def is_feature_ready(args: Namespace) -> bool: try: - return args.huggingface_model and args.threshold and args.huggingface_token + return type(args.huggingface_model and args.threshold and args.huggingface_token) == bool except Exception: return False From e5732ee87398fbe49205915c537a2d518cdaa5db Mon Sep 17 00:00:00 2001 From: Millian Lamiaux Date: Tue, 27 Aug 2024 14:24:25 +0100 Subject: [PATCH 09/13] Type fixes. 
--- detect_secrets/filters/classifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/detect_secrets/filters/classifier.py b/detect_secrets/filters/classifier.py index 4848348c9..b779f786b 100644 --- a/detect_secrets/filters/classifier.py +++ b/detect_secrets/filters/classifier.py @@ -32,7 +32,7 @@ def is_feature_enabled() -> bool: def is_feature_ready(args: Namespace) -> bool: try: - return type(args.huggingface_model and args.threshold and args.huggingface_token) == bool + return type(args.huggingface_model and args.threshold and args.huggingface_token) is bool except Exception: return False From 687a96597ff1b67a3eb0b9442443f01499d6a3ac Mon Sep 17 00:00:00 2001 From: Millian Lamiaux Date: Tue, 27 Aug 2024 14:31:09 +0100 Subject: [PATCH 10/13] Fixing CI/CD checks. --- detect_secrets/filters/classifier.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/detect_secrets/filters/classifier.py b/detect_secrets/filters/classifier.py index b779f786b..f24b19703 100644 --- a/detect_secrets/filters/classifier.py +++ b/detect_secrets/filters/classifier.py @@ -52,7 +52,7 @@ def initialize( get_model(huggingface_model, huggingface_token) - config: Dict[str, Union[float, str]] = { + config: Dict[str, Union[float, str, Optional[str]]] = { 'threshold': threshold, } if huggingface_model: @@ -85,7 +85,9 @@ def should_exclude_secret(secret: str, plugin: Optional[Plugin] = None) -> bool: pipeline = get_model(model_name, token) result = pipeline(secret)[0] - return result['label'] == 'LABEL_1' and result['score'] >= threshold + result = result['label'] == 'LABEL_1' and result['score'] >= threshold + + return result if result is bool else False @lru_cache(maxsize=1) From a40db54a1252f3ddbc3ca9b6037c2230d576d9e3 Mon Sep 17 00:00:00 2001 From: Millian Lamiaux Date: Tue, 27 Aug 2024 14:37:23 +0100 Subject: [PATCH 11/13] Fixing test.
--- detect_secrets/filters/classifier.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/detect_secrets/filters/classifier.py b/detect_secrets/filters/classifier.py index f24b19703..ebfb0622f 100644 --- a/detect_secrets/filters/classifier.py +++ b/detect_secrets/filters/classifier.py @@ -83,11 +83,9 @@ def should_exclude_secret(secret: str, plugin: Optional[Plugin] = None) -> bool: raise AssertionError('Attempting to use uninitialized HuggingFace model.') pipeline = get_model(model_name, token) - result = pipeline(secret)[0] + result:Dict[str, Union[str, float]] = pipeline(secret)[0] - result = result['label'] == 'LABEL_1' and result['score'] >= threshold - - return result if result is bool else False + return result['label'] == 'LABEL_1' and result['score'] >= threshold @lru_cache(maxsize=1) From b30e36fa33445b3618fc0829e43291d79b3edf52 Mon Sep 17 00:00:00 2001 From: Millian Lamiaux Date: Tue, 27 Aug 2024 14:41:30 +0100 Subject: [PATCH 12/13] Fixing CI/CD tests. --- detect_secrets/filters/classifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/detect_secrets/filters/classifier.py b/detect_secrets/filters/classifier.py index ebfb0622f..6b52ee36f 100644 --- a/detect_secrets/filters/classifier.py +++ b/detect_secrets/filters/classifier.py @@ -83,7 +83,7 @@ def should_exclude_secret(secret: str, plugin: Optional[Plugin] = None) -> bool: raise AssertionError('Attempting to use uninitialized HuggingFace model.') pipeline = get_model(model_name, token) - result:Dict[str, Union[str, float]] = pipeline(secret)[0] + result: Dict[str, Union[str, float]] = pipeline(secret)[0] return result['label'] == 'LABEL_1' and result['score'] >= threshold From 74c7aa5af0558f4e7f4d6fbfec2d1b0c5330ca88 Mon Sep 17 00:00:00 2001 From: Millian Lamiaux Date: Tue, 27 Aug 2024 15:14:40 +0100 Subject: [PATCH 13/13] Correcting feature_ready test for usage of huggingface models. 
--- detect_secrets/filters/classifier.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/detect_secrets/filters/classifier.py b/detect_secrets/filters/classifier.py index 6b52ee36f..79b026f95 100644 --- a/detect_secrets/filters/classifier.py +++ b/detect_secrets/filters/classifier.py @@ -32,7 +32,14 @@ def is_feature_enabled() -> bool: def is_feature_ready(args: Namespace) -> bool: try: - return type(args.huggingface_model and args.threshold and args.huggingface_token) is bool + temp = vars(args) + answer = True + + entries = ['huggingface_model', 'threshold', 'huggingface_token'] + for entry in entries: + answer = answer and temp[entry] is not None + + return answer except Exception: return False