diff --git a/bigquery_etl/backfill/utils.py b/bigquery_etl/backfill/utils.py
index 5e55d8a9fd9..973ef9f9492 100644
--- a/bigquery_etl/backfill/utils.py
+++ b/bigquery_etl/backfill/utils.py
@@ -2,6 +2,7 @@
 
 import re
 import sys
+from glob import glob
 from pathlib import Path
 from typing import Dict, List, Tuple
 
@@ -60,7 +61,9 @@ def get_qualified_table_name_to_entries_map_by_project(
     backfills_dict: dict = {}
 
     search_path = Path(sql_dir) / project_id
-    backfill_files = Path(search_path).rglob(BACKFILL_FILE)
+    backfill_files = map(
+        Path, glob(f"{search_path}/**/{BACKFILL_FILE}", recursive=True)
+    )
 
     for backfill_file in backfill_files:
         project, dataset, table = extract_from_query_path(backfill_file)
diff --git a/bigquery_etl/cli/query.py b/bigquery_etl/cli/query.py
index a5bfafbc9fa..80c11f6f270 100644
--- a/bigquery_etl/cli/query.py
+++ b/bigquery_etl/cli/query.py
@@ -12,6 +12,7 @@
 import tempfile
 from datetime import date, timedelta
 from functools import partial
+from glob import glob
 from multiprocessing.pool import Pool, ThreadPool
 from pathlib import Path
 from tempfile import NamedTemporaryFile
@@ -1388,7 +1389,9 @@ def _initialize(query_file):
         table = None
         sql_content = query_file.read_text()
 
-        init_files = list(Path(query_file.parent).rglob("init.sql"))
+        init_files = list(
+            map(Path, glob(f"{query_file.parent}/**/init.sql", recursive=True))
+        )
 
         # check if the provided file can be initialized and whether existing ones should be skipped
         if "is_init()" in sql_content or len(init_files) > 0:
diff --git a/bigquery_etl/cli/routine.py b/bigquery_etl/cli/routine.py
index f4390418ca6..55514b135ed 100644
--- a/bigquery_etl/cli/routine.py
+++ b/bigquery_etl/cli/routine.py
@@ -7,6 +7,7 @@
 import string
 import sys
 from fnmatch import fnmatchcase
+from glob import glob
 from pathlib import Path
 
 import pytest
@@ -41,7 +42,7 @@ def _routines_matching_name_pattern(pattern, sql_path, project_id):
     if project_id is not None:
         sql_path = sql_path / project_id
 
-    all_sql_files = Path(sql_path).rglob("*.sql")
+    all_sql_files = map(Path, glob(f"{sql_path}/**/*.sql", recursive=True))
     routine_files = []
 
     for sql_file in all_sql_files:
@@ -294,7 +295,9 @@ def info(ctx, name, sql_dir, project_id, usages):
             # find routine usages in SQL files
             click.echo("usages: ")
             sql_files = [
-                p for project in project_dirs() for p in Path(project).rglob("*.sql")
+                p
+                for project in project_dirs()
+                for p in map(Path, glob(f"{project}/**/*.sql", recursive=True))
             ]
             for sql_file in sql_files:
                 if f"{routine_dataset}.{routine_name}" in sql_file.read_text():
@@ -518,9 +521,11 @@ def rename(ctx, name, new_name, sql_dir, project_id):
     shutil.move(source, destination)
 
     # replace usages
-    all_sql_files = list(
-        [p for project in project_dirs() for p in Path(project).rglob("*.sql")]
-    )
+    all_sql_files = [
+        p
+        for project in project_dirs()
+        for p in map(Path, glob(f"{project}/**/*.sql", recursive=True))
+    ]
 
     for sql_file in all_sql_files:
         sql = sql_file.read_text()
diff --git a/bigquery_etl/cli/stage.py b/bigquery_etl/cli/stage.py
index f217212f2fc..889de329ab1 100644
--- a/bigquery_etl/cli/stage.py
+++ b/bigquery_etl/cli/stage.py
@@ -4,6 +4,7 @@
 import shutil
 import tempfile
 from datetime import datetime
+from glob import glob
 from multiprocessing.pool import ThreadPool
 from pathlib import Path
 
@@ -160,7 +161,9 @@ def deploy(
                 shutil.rmtree(test_path)
 
             # rename test files
-            for test_file_path in test_destination.glob("**/*"):
+            for test_file_path in map(
+                Path, glob(f"{test_destination}/**/*", recursive=True)
+            ):
                 for test_dep_file in artifact_files:
                     test_project = test_dep_file.parent.parent.parent.name
                     test_dataset = test_dep_file.parent.parent.name
@@ -330,7 +333,7 @@ def _update_references(artifact_files, project_id, dataset_suffix, sql_dir):
         ),
     ]
 
-    for path in Path(sql_dir).rglob("*.sql"):
+    for path in map(Path, glob(f"{sql_dir}/**/*.sql", recursive=True)):
         # apply substitutions
         if path.is_file():
             sql = render(path.name, template_folder=path.parent, format=False)
diff --git a/bigquery_etl/cli/utils.py b/bigquery_etl/cli/utils.py
index ab5cb58ba78..360a40edf3a 100644
--- a/bigquery_etl/cli/utils.py
+++ b/bigquery_etl/cli/utils.py
@@ -4,6 +4,7 @@
 import os
 import re
 from fnmatch import fnmatchcase
+from glob import glob
 from pathlib import Path
 from typing import Iterator, List, Optional, Tuple
 
@@ -109,9 +110,11 @@ def paths_matching_name_pattern(
         pattern = "*.*"
 
     if os.path.isdir(pattern):
-        for root, _, _ in os.walk(pattern):
+        for root, _, _ in os.walk(pattern, followlinks=True):
             for file in files:
-                matching_files.extend(Path(root).rglob(file))
+                matching_files.extend(
+                    map(Path, glob(f"{root}/**/{file}", recursive=True))
+                )
     elif os.path.isfile(pattern):
         matching_files.append(Path(pattern))
     else:
@@ -122,7 +125,9 @@
         all_matching_files: List[Path] = []
 
         for file in files:
-            all_matching_files.extend(Path(sql_path).rglob(file))
+            all_matching_files.extend(
+                map(Path, glob(f"{sql_path}/**/{file}", recursive=True))
+            )
 
         for query_file in all_matching_files:
             match = file_regex.match(str(query_file))
diff --git a/bigquery_etl/dependency.py b/bigquery_etl/dependency.py
index 2762901f8a6..a554dec17e0 100644
--- a/bigquery_etl/dependency.py
+++ b/bigquery_etl/dependency.py
@@ -2,6 +2,7 @@
 
 import re
 import sys
+from glob import glob
 from itertools import groupby
 from pathlib import Path
 from subprocess import CalledProcessError
@@ -145,7 +146,11 @@ def _get_references(
     file_paths = {
         path
         for parent in map(Path, paths or ["sql"])
-        for path in (parent.glob("**/*.sql") if parent.is_dir() else [parent])
+        for path in (
+            map(Path, glob(f"{parent}/**/*.sql", recursive=True))
+            if parent.is_dir()
+            else [parent]
+        )
         if not path.name.endswith(".template.sql")  # skip templates
     }
     fail = False
diff --git a/bigquery_etl/docs/__init__.py b/bigquery_etl/docs/__init__.py
index 772f52e4ca3..bbd27aae41d 100644
--- a/bigquery_etl/docs/__init__.py
+++ b/bigquery_etl/docs/__init__.py
@@ -101,7 +101,7 @@ def validate(project_dirs, log_level):
         if os.path.isdir(project_dir):
             parsed_routines = read_routine_dir(project_dir)
 
-            for root, dirs, files in os.walk(project_dir):
+            for root, dirs, files in os.walk(project_dir, followlinks=True):
                 if os.path.basename(root) == EXAMPLE_DIR:
                     sql_files = (f for f in files if os.path.splitext(f)[1] == ".sql")
                     for file in sql_files:
diff --git a/bigquery_etl/format_sql/format.py b/bigquery_etl/format_sql/format.py
index 79fbf093ef4..7d918a1bf21 100644
--- a/bigquery_etl/format_sql/format.py
+++ b/bigquery_etl/format_sql/format.py
@@ -34,7 +34,7 @@ def format(paths, check=False):
         if os.path.isdir(path):
             sql_files.extend(
                 filepath
-                for dirpath, _, filenames in os.walk(path)
+                for dirpath, _, filenames in os.walk(path, followlinks=True)
                 for filename in filenames
                 if filename.endswith(".sql")
                 # skip tests/**/input.sql
diff --git a/bigquery_etl/metadata/validate_metadata.py b/bigquery_etl/metadata/validate_metadata.py
index 61bb08057c0..21469664eee 100644
--- a/bigquery_etl/metadata/validate_metadata.py
+++ b/bigquery_etl/metadata/validate_metadata.py
@@ -103,7 +103,7 @@ def validate(target):
     failed = False
 
     if os.path.isdir(target):
-        for root, dirs, files in os.walk(target):
+        for root, dirs, files in os.walk(target, followlinks=True):
            for file in files:
                if Metadata.is_metadata_file(file):
                    path = os.path.join(root, file)
@@ -134,7 +134,7 @@ def validate_datasets(target):
     failed = False
 
     if os.path.isdir(target):
-        for root, dirs, files in os.walk(target):
+        for root, dirs, files in os.walk(target, followlinks=True):
            for file in files:
                if DatasetMetadata.is_dataset_metadata_file(file):
                    path = os.path.join(root, file)
diff --git a/bigquery_etl/query_scheduling/generate_airflow_dags.py b/bigquery_etl/query_scheduling/generate_airflow_dags.py
index 0327931cb0a..7b7a2c6b8f4 100644
--- a/bigquery_etl/query_scheduling/generate_airflow_dags.py
+++ b/bigquery_etl/query_scheduling/generate_airflow_dags.py
@@ -56,7 +56,7 @@ def get_dags(project_id, dags_config, sql_dir=None):
     for project_dir in project_dirs(project_id, sql_dir=sql_dir):
         # parse metadata.yaml to retrieve scheduling information
         if os.path.isdir(project_dir):
-            for root, dirs, files in os.walk(project_dir):
+            for root, dirs, files in os.walk(project_dir, followlinks=True):
                 try:
                     if QUERY_FILE in files:
                         query_file = os.path.join(root, QUERY_FILE)
diff --git a/bigquery_etl/routine/parse_routine.py b/bigquery_etl/routine/parse_routine.py
index c0c1423ac68..0e530e7c4c2 100644
--- a/bigquery_etl/routine/parse_routine.py
+++ b/bigquery_etl/routine/parse_routine.py
@@ -45,7 +45,7 @@ def get_routines_from_dir(project_dir):
             "project": root.split("/")[-3],
             "is_udf": filename == UDF_FILE,
         }
-        for root, dirs, files in os.walk(project_dir)
+        for root, dirs, files in os.walk(project_dir, followlinks=True)
        for filename in files
        if filename in ROUTINE_FILES
     ]
@@ -234,7 +234,7 @@ def read_routine_dir(*project_dirs):
         raw_routines[project_dirs] = {
             raw_routine.name: raw_routine
             for project_dir in project_dirs
-            for root, dirs, files in os.walk(project_dir)
+            for root, dirs, files in os.walk(project_dir, followlinks=True)
             if os.path.basename(root) != ConfigLoader.get("routine", "example_dir")
             for filename in files
             if filename in ROUTINE_FILES
diff --git a/bigquery_etl/routine/publish_routines.py b/bigquery_etl/routine/publish_routines.py
index c6464d4a69f..d80d079f28f 100644
--- a/bigquery_etl/routine/publish_routines.py
+++ b/bigquery_etl/routine/publish_routines.py
@@ -221,7 +221,7 @@ def push_dependencies_to_gcs(bucket, path, dependency_dir, project_id):
     client = storage.Client(project_id)
     bucket = client.get_bucket(bucket)
 
-    for root, dirs, files in os.walk(dependency_dir):
+    for root, dirs, files in os.walk(dependency_dir, followlinks=True):
         for filename in files:
             blob = bucket.blob(path + filename)
             blob.upload_from_filename(os.path.join(root, filename))
diff --git a/sql/moz-fx-data-shared-prod/monitoring_derived/bigquery_etl_scheduled_queries_cost_v1/query.py b/sql/moz-fx-data-shared-prod/monitoring_derived/bigquery_etl_scheduled_queries_cost_v1/query.py
index 5aa52d07f10..ee584bb46a8 100644
--- a/sql/moz-fx-data-shared-prod/monitoring_derived/bigquery_etl_scheduled_queries_cost_v1/query.py
+++ b/sql/moz-fx-data-shared-prod/monitoring_derived/bigquery_etl_scheduled_queries_cost_v1/query.py
@@ -4,6 +4,7 @@
 
 from argparse import ArgumentParser
 from fnmatch import fnmatchcase
+from glob import glob
 from pathlib import Path
 
 from google.cloud import bigquery
@@ -43,9 +44,13 @@ def main():
     args = parser.parse_args()
     client = bigquery.Client(args.project)
 
-    sql_queries = list(Path(args.sql_dir).rglob("query.sql"))
-    python_queries = list(Path(args.sql_dir).rglob("query.py"))
-    multipart_queries = list(Path(args.sql_dir).rglob("part1.sql"))
+    sql_queries = list(map(Path, glob(f"{args.sql_dir}/**/query.sql", recursive=True)))
+    python_queries = list(
+        map(Path, glob(f"{args.sql_dir}/**/query.py", recursive=True))
+    )
+    multipart_queries = list(
+        map(Path, glob(f"{args.sql_dir}/**/part1.sql", recursive=True))
+    )
 
     query_paths = sql_queries + python_queries + multipart_queries
     query = create_query(query_paths, args.date)
diff --git a/sql/moz-fx-data-shared-prod/monitoring_derived/bigquery_etl_scheduled_query_usage_v1/query.py b/sql/moz-fx-data-shared-prod/monitoring_derived/bigquery_etl_scheduled_query_usage_v1/query.py
index b479f372773..50dfd784e57 100755
--- a/sql/moz-fx-data-shared-prod/monitoring_derived/bigquery_etl_scheduled_query_usage_v1/query.py
+++ b/sql/moz-fx-data-shared-prod/monitoring_derived/bigquery_etl_scheduled_query_usage_v1/query.py
@@ -3,6 +3,7 @@
 """Determine cost of previously scheduled bigquery-etl queries."""
 
 from argparse import ArgumentParser
+from glob import glob
 from pathlib import Path
 
 from google.cloud import bigquery
@@ -57,9 +58,13 @@ def create_query(query_paths, date, project):
 
 def main():
     args = parser.parse_args()
-    sql_queries = list(Path(args.sql_dir).rglob("query.sql"))
-    python_queries = list(Path(args.sql_dir).rglob("query.py"))
-    multipart_queries = list(Path(args.sql_dir).rglob("part1.sql"))
+    sql_queries = list(map(Path, glob(f"{args.sql_dir}/**/query.sql", recursive=True)))
+    python_queries = list(
+        map(Path, glob(f"{args.sql_dir}/**/query.py", recursive=True))
+    )
+    multipart_queries = list(
+        map(Path, glob(f"{args.sql_dir}/**/part1.sql", recursive=True))
+    )
     query_paths = sql_queries + python_queries + multipart_queries
     partition = args.date.replace("-", "")
     destination_table = f"{args.project}.{args.destination_dataset}.{args.destination_table}${partition}"
diff --git a/sql_generators/events_daily/__init__.py b/sql_generators/events_daily/__init__.py
index 320b2c6453d..b1d21adfef9 100755
--- a/sql_generators/events_daily/__init__.py
+++ b/sql_generators/events_daily/__init__.py
@@ -113,7 +113,7 @@ def get_args(self) -> dict:
 
 def get_query_dirs(path):
     """Walk a path to get all templated query dirs."""
-    for directory, sub_dirs, files in os.walk(path):
+    for directory, sub_dirs, files in os.walk(path, followlinks=True):
         non_hidden = {f for f in files if not f.startswith(".")}
         if non_hidden and non_hidden.issubset(ALLOWED_FILES):
             dir_path = Path(directory)
diff --git a/tests/sql/glam-fenix-dev/glam_etl/bootstrap.py b/tests/sql/glam-fenix-dev/glam_etl/bootstrap.py
index 4315728658e..69926f06fff 100644
--- a/tests/sql/glam-fenix-dev/glam_etl/bootstrap.py
+++ b/tests/sql/glam-fenix-dev/glam_etl/bootstrap.py
@@ -8,6 +8,7 @@
 import shutil
 import warnings
 from collections import namedtuple
+from glob import glob
 from multiprocessing import Pool
 from pathlib import Path
 
@@ -88,7 +89,11 @@ def deps(output):
     """Create a dependency file with all links between queries and tables."""
     path = Path(output)
     deps = calculate_dependencies(
-        [p for p in SQL_ROOT.glob("**/*.sql") if "__clients_daily" not in p.name]
+        [
+            p
+            for p in map(Path, glob(f"{SQL_ROOT}/**/*.sql", recursive=True))
+            if "__clients_daily" not in p.name
+        ]
     )
     path.write_text(
         json.dumps([dict(zip(["from", "to"], dep)) for dep in deps], indent=2)
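For reference, a minimal standalone sketch of the traversal behavior this diff appears to rely on; the temporary paths and file names below are made up and not part of the change. On the Python versions in use here, Path.rglob()/Path.glob("**/...") does not descend into symlinked directories and os.walk() skips them unless followlinks=True, whereas glob() with recursive=True does follow directory symlinks.

"""Sketch (assumed example, not repository code) contrasting rglob, glob, and os.walk."""
import os
from glob import glob
from pathlib import Path
from tempfile import mkdtemp

root = Path(mkdtemp())
external = Path(mkdtemp())

(root / "real_dataset").mkdir()
(root / "real_dataset" / "query.sql").write_text("SELECT 1")
(external / "query.sql").write_text("SELECT 2")
# Directory symlink inside the searched tree, pointing at SQL generated elsewhere.
(root / "linked_dataset").symlink_to(external, target_is_directory=True)

# Path.rglob: finds only real_dataset/query.sql; the symlinked directory is not entered.
print([str(p.relative_to(root)) for p in Path(root).rglob("query.sql")])

# glob with recursive=True: follows the directory symlink and finds both files.
print(glob(f"{root}/**/query.sql", recursive=True))

# os.walk: descends into linked_dataset only when followlinks=True is passed.
print([r for r, _, files in os.walk(root, followlinks=True) if "query.sql" in files])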