diff --git a/python/extractor/cli-integration-test/hidden-files/config.yml b/python/extractor/cli-integration-test/hidden-files/config.yml new file mode 100644 index 000000000000..69d94597d950 --- /dev/null +++ b/python/extractor/cli-integration-test/hidden-files/config.yml @@ -0,0 +1,3 @@ +name: Test Config +paths-ignore: + - "**/.*/**" diff --git a/python/extractor/cli-integration-test/hidden-files/query-default.expected b/python/extractor/cli-integration-test/hidden-files/query-default.expected new file mode 100644 index 000000000000..72d34a1ab0b0 --- /dev/null +++ b/python/extractor/cli-integration-test/hidden-files/query-default.expected @@ -0,0 +1,6 @@ +| name | ++-------------------------------+ +| .hidden_file.py | +| another_non_hidden.py | +| foo.py | +| visible_file_in_hidden_dir.py | diff --git a/python/extractor/cli-integration-test/hidden-files/query-skipped.expected b/python/extractor/cli-integration-test/hidden-files/query-skipped.expected new file mode 100644 index 000000000000..688dbe00d570 --- /dev/null +++ b/python/extractor/cli-integration-test/hidden-files/query-skipped.expected @@ -0,0 +1,4 @@ +| name | ++-----------------+ +| .hidden_file.py | +| foo.py | diff --git a/python/extractor/cli-integration-test/hidden-files/query.ql b/python/extractor/cli-integration-test/hidden-files/query.ql new file mode 100644 index 000000000000..3b1b3c03849b --- /dev/null +++ b/python/extractor/cli-integration-test/hidden-files/query.ql @@ -0,0 +1,3 @@ +import python + +select any(File f).getShortName() as name order by name diff --git a/python/extractor/cli-integration-test/hidden-files/repo_dir/.hidden_dir/internal_non_hidden/another_non_hidden.py b/python/extractor/cli-integration-test/hidden-files/repo_dir/.hidden_dir/internal_non_hidden/another_non_hidden.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/python/extractor/cli-integration-test/hidden-files/repo_dir/.hidden_dir/visible_file_in_hidden_dir.py 
b/python/extractor/cli-integration-test/hidden-files/repo_dir/.hidden_dir/visible_file_in_hidden_dir.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/python/extractor/cli-integration-test/hidden-files/repo_dir/.hidden_file.py b/python/extractor/cli-integration-test/hidden-files/repo_dir/.hidden_file.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/python/extractor/cli-integration-test/hidden-files/repo_dir/foo.py b/python/extractor/cli-integration-test/hidden-files/repo_dir/foo.py new file mode 100644 index 000000000000..517b47df53c2 --- /dev/null +++ b/python/extractor/cli-integration-test/hidden-files/repo_dir/foo.py @@ -0,0 +1 @@ +print(42) diff --git a/python/extractor/cli-integration-test/hidden-files/test.sh b/python/extractor/cli-integration-test/hidden-files/test.sh new file mode 100755 index 000000000000..45485985adbb --- /dev/null +++ b/python/extractor/cli-integration-test/hidden-files/test.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +set -Eeuo pipefail # see https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail/ + +set -x + +CODEQL=${CODEQL:-codeql} + +SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +cd "$SCRIPTDIR" + +rm -rf db db-skipped + +# Test 1: Default behavior should be to extract files in hidden directories +$CODEQL database create db --language python --source-root repo_dir/ +$CODEQL query run --database db query.ql > query-default.actual +diff query-default.expected query-default.actual + +# Test 2: The default behavior can be overridden by setting `paths-ignore` in the config file +$CODEQL database create db-skipped --language python --source-root repo_dir/ --codescanning-config=config.yml +$CODEQL query run --database db-skipped query.ql > query-skipped.actual +diff query-skipped.expected query-skipped.actual + +rm -rf db db-skipped diff --git a/python/extractor/semmle/path_filters.py b/python/extractor/semmle/path_filters.py index cb1a4d9b8bca..908ec4c0ee0a 
100644 --- a/python/extractor/semmle/path_filters.py +++ b/python/extractor/semmle/path_filters.py @@ -41,6 +41,9 @@ def glob_part_to_regex(glob, add_sep): def glob_to_regex(glob, prefix=""): '''Convert entire glob to a compiled regex''' + # When the glob ends in `/`, we need to remember this so that we don't accidentally add an + # extra separator to the final regex. + end_sep = "" if glob.endswith("/") else SEP glob = glob.strip().strip("/") parts = glob.split("/") #Trailing '**' is redundant, so strip it off. @@ -48,12 +51,17 @@ def glob_to_regex(glob, prefix=""): parts = parts[:-1] if not parts: return ".*" + # The `glob.strip("/")` call above will have removed all trailing slashes, but if there was at + # least one trailing slash, we want there to be an extra part, so we add it explicitly here in + # that case, using the emptiness of `end_sep` as a proxy. + if end_sep == "": + parts += [""] parts = [ glob_part_to_regex(escape(p), True) for p in parts[:-1] ] + [ glob_part_to_regex(escape(parts[-1]), False) ] # we need to escape the prefix, specifically because on windows the prefix will be # something like `C:\\folder\\subfolder\\` and without escaping the # backslash-path-separators will get interpreted as regex escapes (which might be # invalid sequences, causing the extractor to crash) - full_pattern = escape(prefix) + ''.join(parts) + "(?:" + SEP + ".*|$)" + full_pattern = escape(prefix) + ''.join(parts) + "(?:" + end_sep + ".*|$)" return re.compile(full_pattern) def filter_from_pattern(pattern, prev_filter, prefix): diff --git a/python/extractor/semmle/traverser.py b/python/extractor/semmle/traverser.py index ad8bd38ae735..4e316a075f75 100644 --- a/python/extractor/semmle/traverser.py +++ b/python/extractor/semmle/traverser.py @@ -83,46 +83,21 @@ def _treewalk(self, path): self.logger.debug("Ignoring %s (symlink)", fullpath) continue if isdir(fullpath): - if fullpath in self.exclude_paths or is_hidden(fullpath): - if is_hidden(fullpath): - 
self.logger.debug("Ignoring %s (hidden)", fullpath) - else: - self.logger.debug("Ignoring %s (excluded)", fullpath) - else: - empty = True - for item in self._treewalk(fullpath): - yield item - empty = False - if not empty: - yield fullpath + if fullpath in self.exclude_paths: + self.logger.debug("Ignoring %s (excluded)", fullpath) + continue + + empty = True + for item in self._treewalk(fullpath): + yield item + empty = False + if not empty: + yield fullpath elif self.filter(fullpath): yield fullpath else: self.logger.debug("Ignoring %s (filter)", fullpath) - -if os.name== 'nt': - import ctypes - - def is_hidden(path): - #Magical windows code - try: - attrs = ctypes.windll.kernel32.GetFileAttributesW(str(path)) - if attrs == -1: - return False - if attrs&2: - return True - except Exception: - #Not sure what to log here, probably best to carry on. - pass - return os.path.basename(path).startswith(".") - -else: - - def is_hidden(path): - return os.path.basename(path).startswith(".") - - def exclude_filter_from_options(options): if options.exclude_package: choices = '|'.join(mod.replace('.', r'\.') for mod in options.exclude_package) diff --git a/python/extractor/semmle/util.py b/python/extractor/semmle/util.py index e0720a86312b..56f7889ae231 100644 --- a/python/extractor/semmle/util.py +++ b/python/extractor/semmle/util.py @@ -10,7 +10,7 @@ #Semantic version of extractor. #Update this if any changes are made -VERSION = "7.1.2" +VERSION = "7.1.3" PY_EXTENSIONS = ".py", ".pyw" diff --git a/python/ql/lib/change-notes/2025-04-30-extract-hidden-files-by-default.md b/python/ql/lib/change-notes/2025-04-30-extract-hidden-files-by-default.md new file mode 100644 index 000000000000..32b272215af7 --- /dev/null +++ b/python/ql/lib/change-notes/2025-04-30-extract-hidden-files-by-default.md @@ -0,0 +1,5 @@ +--- +category: minorAnalysis +--- + +- The Python extractor now extracts files in hidden directories by default. 
If you would like to skip files in hidden directories, add `paths-ignore: ["**/.*/**"]` to your [Code Scanning config](https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning#specifying-directories-to-scan). When using the CodeQL CLI for extraction, specify the configuration (creating the configuration file if necessary) using the `--codescanning-config` option. diff --git a/python/ql/test/2/extractor-tests/hidden/test.expected b/python/ql/test/2/extractor-tests/hidden/test.expected index ca72363d8f02..21bd0dfb2dd9 100644 --- a/python/ql/test/2/extractor-tests/hidden/test.expected +++ b/python/ql/test/2/extractor-tests/hidden/test.expected @@ -1,3 +1,5 @@ +| .hidden/inner/test.py | +| .hidden/module.py | | folder/module.py | | package | | package/__init__.py | diff --git a/python/ql/test/extractor-tests/filter-option/Test.expected b/python/ql/test/extractor-tests/filter-option/Test.expected index 7ade39a5998c..56b1e36c2a93 100644 --- a/python/ql/test/extractor-tests/filter-option/Test.expected +++ b/python/ql/test/extractor-tests/filter-option/Test.expected @@ -3,3 +3,4 @@ | Module foo.bar | | Module foo.include_test | | Package foo | +| Script hidden_foo.py |