diff --git a/case_utils/case_sparql_select/__init__.py b/case_utils/case_sparql_select/__init__.py index eaa98cb..2d8581c 100644 --- a/case_utils/case_sparql_select/__init__.py +++ b/case_utils/case_sparql_select/__init__.py @@ -49,74 +49,44 @@ _logger = logging.getLogger(os.path.basename(__file__)) -def main() -> None: - parser = argparse.ArgumentParser() - - # Configure debug logging before running parse_args, because there could be an error raised before the construction of the argument parser. - logging.basicConfig( - level=logging.DEBUG - if ("--debug" in sys.argv or "-d" in sys.argv) - else logging.INFO - ) - - parser.add_argument("-d", "--debug", action="store_true") - parser.add_argument( - "--built-version", - choices=tuple(built_version_choices_list), - default="case-" + CURRENT_CASE_VERSION, - help="Ontology version to use to supplement query, such as for subclass querying. Does not require networking to use. Default is most recent CASE release. Passing 'none' will mean no pre-built CASE ontology versions accompanying this tool will be included in the analysis.", - ) - parser.add_argument( - "--disallow-empty-results", - action="store_true", - help="Raise error if no results are returned for query.", - ) - parser.add_argument( - "--use-prefixes", - action="store_true", - help="Abbreviate node IDs according to graph's encoded prefixes. (This will use prefixes in the graph, not the query.)", - ) - parser.add_argument( - "out_table", - help="Expected extensions are .html for HTML tables, .md for Markdown tables, .csv for comma-separated values, and .tsv for tab-separated values.", - ) - parser.add_argument( - "in_sparql", - help="File containing a SPARQL SELECT query. Note that prefixes not mapped with a PREFIX statement will be mapped according to their first occurrence among input graphs.", - ) - parser.add_argument("in_graph", nargs="+") - args = parser.parse_args() +def query_text_to_variables(select_query_text: str) -> typing.List[str]: + # Build columns list from SELECT line. + select_query_text_lines = select_query_text.split("\n") + select_line = [ + line for line in select_query_text_lines if line.startswith("SELECT ") + ][0] + variables = select_line.replace(" DISTINCT", "").replace("SELECT ", "").split(" ") + return variables - graph = rdflib.Graph() - for in_graph_filename in args.in_graph: - graph.parse(in_graph_filename) +def graph_and_query_to_data_frame( + graph: rdflib.Graph, + select_query_text: str, + *args: typing.Any, + built_version: str = "case-" + CURRENT_CASE_VERSION, + disallow_empty_results: bool = False, + use_prefixes: bool = False, + **kwargs: typing.Any, +) -> pd.DataFrame: # Inherit prefixes defined in input context dictionary. nsdict = {k: v for (k, v) in graph.namespace_manager.namespaces()} - select_query_text = None - with open(args.in_sparql, "r") as in_fh: - select_query_text = in_fh.read().strip() - _logger.debug("select_query_text = %r." % select_query_text) - + # Avoid side-effects on input parameter. if "subClassOf" in select_query_text: - case_utils.ontology.load_subclass_hierarchy( - graph, built_version=args.built_version - ) + _graph = rdflib.Graph() + _graph += graph + case_utils.ontology.load_subclass_hierarchy(_graph, built_version=built_version) + else: + _graph = graph - # Build columns list from SELECT line. - select_query_text_lines = select_query_text.split("\n") - select_line = [ - line for line in select_query_text_lines if line.startswith("SELECT ") - ][0] - variables = select_line.replace(" DISTINCT", "").replace("SELECT ", "").split(" ") + variables = query_text_to_variables(select_query_text) tally = 0 records = [] select_query_object = rdflib.plugins.sparql.processor.prepareQuery( select_query_text, initNs=nsdict ) - for (row_no, row) in enumerate(graph.query(select_query_object)): + for (row_no, row) in enumerate(_graph.query(select_query_object)): tally = row_no + 1 record = [] for (column_no, column) in enumerate(row): @@ -131,7 +101,7 @@ def main() -> None: # .decode() is because hexlify returns bytes. column_value = binascii.hexlify(column.toPython()).decode() elif isinstance(column, rdflib.URIRef): - if args.use_prefixes: + if use_prefixes: column_value = graph.namespace_manager.qname(column.toPython()) else: column_value = column.toPython() @@ -141,39 +111,192 @@ def main() -> None: _logger.debug("row[0]column[%d] = %r." % (column_no, column_value)) record.append(column_value) records.append(record) + if tally == 0: - if args.disallow_empty_results: + if disallow_empty_results: raise ValueError("Failed to return any results.") df = pd.DataFrame(records, columns=variables) + return df + +def data_frame_to_table_text( + df: pd.DataFrame, + *args: typing.Any, + output_mode: str, + use_header: bool, + use_index: bool, + **kwargs: typing.Any, +) -> str: table_text: typing.Optional[str] = None - if args.out_table.endswith(".csv") or args.out_table.endswith(".tsv"): - # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html + + # Set up kwargs dicts. One kwarg behaves slightly differently for Markdown vs. other formats. + general_kwargs: typing.Dict[str, typing.Any] = dict() + md_kwargs: typing.Dict[str, typing.Any] = dict() + + # Note some output modes will drop 'header' from general_kwargs, due to alternate support or lack of support. + if use_header: + general_kwargs["header"] = True + else: + general_kwargs["header"] = False + md_kwargs["headers"] = tuple() + + general_kwargs["index"] = use_index + + if output_mode in {"csv", "tsv"}: sep: str - if args.out_table.endswith(".csv"): + if output_mode == "csv": sep = "," - elif args.out_table.endswith(".tsv"): + elif output_mode == "tsv": sep = "\t" else: raise NotImplementedError( "Output extension not implemented in CSV-style output." ) - table_text = df.to_csv(sep=sep) - elif args.out_table.endswith(".html"): + table_text = df.to_csv(sep=sep, **general_kwargs) + elif output_mode == "html": # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_html.html # Add CSS classes for CASE website Bootstrap support. - table_text = df.to_html(classes=("table", "table-bordered", "table-condensed")) - elif args.out_table.endswith(".md"): + table_text = df.to_html( + classes=("table", "table-bordered", "table-condensed"), **general_kwargs + ) + elif output_mode == "md": # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_markdown.html # https://pypi.org/project/tabulate/ # Assume Github-flavored Markdown. - table_text = df.to_markdown(tablefmt="github") - if table_text is None: - raise NotImplementedError( - "Unsupported output extension for output filename %r.", args.out_table - ) + # Drop unsupported kwarg. + del general_kwargs["header"] + + table_text = df.to_markdown(tablefmt="github", **general_kwargs, **md_kwargs) + else: + if table_text is None: + raise NotImplementedError("Unimplemented output mode: %r." % output_mode) + assert table_text is not None + + return table_text + + +def main() -> None: + parser = argparse.ArgumentParser() + + # Configure debug logging before running parse_args, because there could be an error raised before the construction of the argument parser. + logging.basicConfig( + level=logging.DEBUG + if ("--debug" in sys.argv or "-d" in sys.argv) + else logging.INFO + ) + + parser.add_argument("-d", "--debug", action="store_true") + parser.add_argument( + "--built-version", + choices=tuple(built_version_choices_list), + default="case-" + CURRENT_CASE_VERSION, + help="Ontology version to use to supplement query, such as for subclass querying. Does not require networking to use. Default is most recent CASE release. Passing 'none' will mean no pre-built CASE ontology versions accompanying this tool will be included in the analysis.", + ) + parser.add_argument( + "--disallow-empty-results", + action="store_true", + help="Raise error if no results are returned for query.", + ) + parser.add_argument( + "--use-prefixes", + action="store_true", + help="Abbreviate node IDs according to graph's encoded prefixes. (This will use prefixes in the graph, not the query.)", + ) + parser.add_argument( + "out_table", + help="Expected extensions are .html for HTML tables, .md for Markdown tables, .csv for comma-separated values, and .tsv for tab-separated values.", + ) + parser.add_argument( + "in_sparql", + help="File containing a SPARQL SELECT query. Note that prefixes not mapped with a PREFIX statement will be mapped according to their first occurrence among input graphs.", + ) + + parser_header_group = parser.add_mutually_exclusive_group(required=False) + parser_header_group.add_argument( + "--header", + action="store_true", + help="Print column labels. This is the default behavior.", + ) + parser_header_group.add_argument( + "--no-header", + action="store_true", + help="Do not print column labels.", + ) + + parser_index_group = parser.add_mutually_exclusive_group(required=False) + parser_index_group.add_argument( + "--index", + action="store_true", + help="Print index (auto-incrementing row labels as left untitled column). This is the default behavior.", + ) + parser_index_group.add_argument( + "--no-index", + action="store_true", + help="Do not print index.", + ) + + parser.add_argument("in_graph", nargs="+") + args = parser.parse_args() + + output_mode: str + if args.out_table.endswith(".csv"): + output_mode = "csv" + elif args.out_table.endswith(".html"): + output_mode = "html" + elif args.out_table.endswith(".json"): + output_mode = "json" + elif args.out_table.endswith(".md"): + output_mode = "md" + elif args.out_table.endswith(".tsv"): + output_mode = "tsv" + else: + raise NotImplementedError("Output file extension not implemented.") + + graph = rdflib.Graph() + for in_graph_filename in args.in_graph: + graph.parse(in_graph_filename) + + select_query_text: typing.Optional[str] = None + with open(args.in_sparql, "r") as in_fh: + select_query_text = in_fh.read().strip() + if select_query_text is None: + raise ValueError("Failed to load query.") + _logger.debug("select_query_text = %r." % select_query_text) + + # Process --header and --no-header. + use_header: bool + if args.header is True: + use_header = True + if args.no_header is True: + use_header = False + else: + use_header = True + + # Process --index and --no-index. + use_index: bool + if args.index is True: + use_index = True + if args.no_index is True: + use_index = False + else: + use_index = True + + df = graph_and_query_to_data_frame( + graph, + select_query_text, + built_version=args.built_version, + disallow_empty_results=args.disallow_empty_results is True, + use_prefixes=args.use_prefixes is True, + ) + + table_text = data_frame_to_table_text( + df, + output_mode=output_mode, + use_header=use_header, + use_index=use_index, + ) with open(args.out_table, "w") as out_fh: out_fh.write(table_text) if table_text[-1] != "\n": diff --git a/tests/case_utils/Makefile b/tests/case_utils/Makefile index e77c927..3c65a40 100644 --- a/tests/case_utils/Makefile +++ b/tests/case_utils/Makefile @@ -65,6 +65,7 @@ check: \ && pytest \ --ignore case_file \ --ignore case_sparql_construct \ + --ignore case_sparql_select \ --ignore case_validate \ --log-level=DEBUG diff --git a/tests/case_utils/case_sparql_select/.check-w3-output-with_header-with_index.csv b/tests/case_utils/case_sparql_select/.check-w3-output-with_header-with_index.csv new file mode 100644 index 0000000..063e950 --- /dev/null +++ b/tests/case_utils/case_sparql_select/.check-w3-output-with_header-with_index.csv @@ -0,0 +1,3 @@ +,?name,?mbox +0,Johnny Lee Outlaw,mailto:jlow@example.com +1,Peter Goodguy,mailto:peter@example.org diff --git a/tests/case_utils/case_sparql_select/.check-w3-output-with_header-with_index.html b/tests/case_utils/case_sparql_select/.check-w3-output-with_header-with_index.html new file mode 100644 index 0000000..bee5944 --- /dev/null +++ b/tests/case_utils/case_sparql_select/.check-w3-output-with_header-with_index.html @@ -0,0 +1,21 @@ + + + + + + + + + + + + + + + + + + + + +
?name?mbox
0Johnny Lee Outlawmailto:jlow@example.com
1Peter Goodguymailto:peter@example.org
diff --git a/tests/case_utils/case_sparql_select/.check-w3-output-with_header-with_index.md b/tests/case_utils/case_sparql_select/.check-w3-output-with_header-with_index.md new file mode 100644 index 0000000..af68b84 --- /dev/null +++ b/tests/case_utils/case_sparql_select/.check-w3-output-with_header-with_index.md @@ -0,0 +1,4 @@ +| | ?name | ?mbox | +|----|-------------------|--------------------------| +| 0 | Johnny Lee Outlaw | mailto:jlow@example.com | +| 1 | Peter Goodguy | mailto:peter@example.org | diff --git a/tests/case_utils/case_sparql_select/.check-w3-output-with_header-with_index.tsv b/tests/case_utils/case_sparql_select/.check-w3-output-with_header-with_index.tsv new file mode 100644 index 0000000..a4fdfca --- /dev/null +++ b/tests/case_utils/case_sparql_select/.check-w3-output-with_header-with_index.tsv @@ -0,0 +1,3 @@ + ?name ?mbox +0 Johnny Lee Outlaw mailto:jlow@example.com +1 Peter Goodguy mailto:peter@example.org diff --git a/tests/case_utils/case_sparql_select/.check-w3-output-with_header-without_index.csv b/tests/case_utils/case_sparql_select/.check-w3-output-with_header-without_index.csv new file mode 100644 index 0000000..6bd60fb --- /dev/null +++ b/tests/case_utils/case_sparql_select/.check-w3-output-with_header-without_index.csv @@ -0,0 +1,3 @@ +?name,?mbox +Johnny Lee Outlaw,mailto:jlow@example.com +Peter Goodguy,mailto:peter@example.org diff --git a/tests/case_utils/case_sparql_select/.check-w3-output-with_header-without_index.html b/tests/case_utils/case_sparql_select/.check-w3-output-with_header-without_index.html new file mode 100644 index 0000000..041fd3b --- /dev/null +++ b/tests/case_utils/case_sparql_select/.check-w3-output-with_header-without_index.html @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + + + +
?name?mbox
Johnny Lee Outlawmailto:jlow@example.com
Peter Goodguymailto:peter@example.org
diff --git a/tests/case_utils/case_sparql_select/.check-w3-output-with_header-without_index.md b/tests/case_utils/case_sparql_select/.check-w3-output-with_header-without_index.md new file mode 100644 index 0000000..3aa8a01 --- /dev/null +++ b/tests/case_utils/case_sparql_select/.check-w3-output-with_header-without_index.md @@ -0,0 +1,4 @@ +| ?name | ?mbox | +|-------------------|--------------------------| +| Johnny Lee Outlaw | mailto:jlow@example.com | +| Peter Goodguy | mailto:peter@example.org | diff --git a/tests/case_utils/case_sparql_select/.check-w3-output-with_header-without_index.tsv b/tests/case_utils/case_sparql_select/.check-w3-output-with_header-without_index.tsv new file mode 100644 index 0000000..dd1e81d --- /dev/null +++ b/tests/case_utils/case_sparql_select/.check-w3-output-with_header-without_index.tsv @@ -0,0 +1,3 @@ +?name ?mbox +Johnny Lee Outlaw mailto:jlow@example.com +Peter Goodguy mailto:peter@example.org diff --git a/tests/case_utils/case_sparql_select/.check-w3-output-without_header-with_index.csv b/tests/case_utils/case_sparql_select/.check-w3-output-without_header-with_index.csv new file mode 100644 index 0000000..7933d39 --- /dev/null +++ b/tests/case_utils/case_sparql_select/.check-w3-output-without_header-with_index.csv @@ -0,0 +1,2 @@ +0,Johnny Lee Outlaw,mailto:jlow@example.com +1,Peter Goodguy,mailto:peter@example.org diff --git a/tests/case_utils/case_sparql_select/.check-w3-output-without_header-with_index.html b/tests/case_utils/case_sparql_select/.check-w3-output-without_header-with_index.html new file mode 100644 index 0000000..b6a842b --- /dev/null +++ b/tests/case_utils/case_sparql_select/.check-w3-output-without_header-with_index.html @@ -0,0 +1,14 @@ + + + + + + + + + + + + + +
0Johnny Lee Outlawmailto:jlow@example.com
1Peter Goodguymailto:peter@example.org
diff --git a/tests/case_utils/case_sparql_select/.check-w3-output-without_header-with_index.md b/tests/case_utils/case_sparql_select/.check-w3-output-without_header-with_index.md new file mode 100644 index 0000000..c5ee8c8 --- /dev/null +++ b/tests/case_utils/case_sparql_select/.check-w3-output-without_header-with_index.md @@ -0,0 +1,3 @@ +|---|-------------------|--------------------------| +| 0 | Johnny Lee Outlaw | mailto:jlow@example.com | +| 1 | Peter Goodguy | mailto:peter@example.org | diff --git a/tests/case_utils/case_sparql_select/.check-w3-output-without_header-with_index.tsv b/tests/case_utils/case_sparql_select/.check-w3-output-without_header-with_index.tsv new file mode 100644 index 0000000..992efe2 --- /dev/null +++ b/tests/case_utils/case_sparql_select/.check-w3-output-without_header-with_index.tsv @@ -0,0 +1,2 @@ +0 Johnny Lee Outlaw mailto:jlow@example.com +1 Peter Goodguy mailto:peter@example.org diff --git a/tests/case_utils/case_sparql_select/.check-w3-output-without_header-without_index.csv b/tests/case_utils/case_sparql_select/.check-w3-output-without_header-without_index.csv new file mode 100644 index 0000000..a4c2c82 --- /dev/null +++ b/tests/case_utils/case_sparql_select/.check-w3-output-without_header-without_index.csv @@ -0,0 +1,2 @@ +Johnny Lee Outlaw,mailto:jlow@example.com +Peter Goodguy,mailto:peter@example.org diff --git a/tests/case_utils/case_sparql_select/.check-w3-output-without_header-without_index.html b/tests/case_utils/case_sparql_select/.check-w3-output-without_header-without_index.html new file mode 100644 index 0000000..6dbc7c3 --- /dev/null +++ b/tests/case_utils/case_sparql_select/.check-w3-output-without_header-without_index.html @@ -0,0 +1,12 @@ + + + + + + + + + + + +
Johnny Lee Outlawmailto:jlow@example.com
Peter Goodguymailto:peter@example.org
diff --git a/tests/case_utils/case_sparql_select/.check-w3-output-without_header-without_index.md b/tests/case_utils/case_sparql_select/.check-w3-output-without_header-without_index.md new file mode 100644 index 0000000..6ad505c --- /dev/null +++ b/tests/case_utils/case_sparql_select/.check-w3-output-without_header-without_index.md @@ -0,0 +1,3 @@ +|-------------------|--------------------------| +| Johnny Lee Outlaw | mailto:jlow@example.com | +| Peter Goodguy | mailto:peter@example.org | diff --git a/tests/case_utils/case_sparql_select/.check-w3-output-without_header-without_index.tsv b/tests/case_utils/case_sparql_select/.check-w3-output-without_header-without_index.tsv new file mode 100644 index 0000000..833da47 --- /dev/null +++ b/tests/case_utils/case_sparql_select/.check-w3-output-without_header-without_index.tsv @@ -0,0 +1,2 @@ +Johnny Lee Outlaw mailto:jlow@example.com +Peter Goodguy mailto:peter@example.org diff --git a/tests/case_utils/case_sparql_select/Makefile b/tests/case_utils/case_sparql_select/Makefile index 68f11ec..0523c8a 100644 --- a/tests/case_utils/case_sparql_select/Makefile +++ b/tests/case_utils/case_sparql_select/Makefile @@ -50,6 +50,9 @@ check: \ check-w3-tsv \ check-prefixed_results \ check-subclass + source $(tests_srcdir)/venv/bin/activate \ + && pytest \ + --log-level=DEBUG check-prefixed_results: \ check-prefixed_results-csv \ diff --git a/tests/case_utils/case_sparql_select/test_data_frame_to_table_text_json.py b/tests/case_utils/case_sparql_select/test_data_frame_to_table_text_json.py new file mode 100644 index 0000000..00d20c3 --- /dev/null +++ b/tests/case_utils/case_sparql_select/test_data_frame_to_table_text_json.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 + +# This software was developed at the National Institute of Standards +# and Technology by employees of the Federal Government in the course +# of their official duties. Pursuant to title 17 Section 105 of the +# United States Code this software is not subject to copyright +# protection and is in the public domain. NIST assumes no +# responsibility whatsoever for its use by other parties, and makes +# no guarantees, expressed or implied, about its quality, +# reliability, or any other characteristic. +# +# We would appreciate acknowledgement if the software is used. + +import pathlib +import typing + +import pytest +import rdflib + +import case_utils.case_sparql_select + +SRCDIR = pathlib.Path(__file__).parent + +GRAPH = rdflib.Graph() +GRAPH.parse(str(SRCDIR / "w3-input-2.ttl")) +GRAPH.parse(str(SRCDIR / "w3-input-3.json")) +assert len(GRAPH) > 0 + +SELECT_QUERY_TEXT: typing.Optional[str] = None +with (SRCDIR / "w3-input-1.sparql").open("r") as _fh: + SELECT_QUERY_TEXT = _fh.read().strip() +assert SELECT_QUERY_TEXT is not None + +DATA_FRAME = case_utils.case_sparql_select.graph_and_query_to_data_frame( + GRAPH, SELECT_QUERY_TEXT +) + + +def make_data_frame_to_json_table_text_parameters() -> typing.Iterator[ + typing.Tuple[str, bool, bool] +]: + for use_header in [False, True]: + for use_index in [False, True]: + for output_mode in ["csv", "html", "md", "tsv"]: + yield (output_mode, use_header, use_index) + + +@pytest.mark.parametrize( + "output_mode, use_header, use_index", + make_data_frame_to_json_table_text_parameters(), +) +def test_data_frame_to_table_text_json( + output_mode: str, + use_header: bool, + use_index: bool, +) -> None: + table_text = case_utils.case_sparql_select.data_frame_to_table_text( + DATA_FRAME, + output_mode=output_mode, + use_header=use_header, + use_index=use_index, + ) + + output_filename_template = ".check-w3-output-%s_header-%s_index.%s" + header_part = "with" if use_header else "without" + index_part = "with" if use_index else "without" + output_filename = output_filename_template % ( + header_part, + index_part, + output_mode, + ) + with (SRCDIR / output_filename).open("w") as out_fh: + out_fh.write(table_text) + if table_text[-1] != "\n": + out_fh.write("\n")