diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 56aee38bfe..a35e898c2d 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -27,7 +27,7 @@ unix_millis, unix_seconds, ) -from bigframes.bigquery._operations.geo import st_area +from bigframes.bigquery._operations.geo import st_area, st_difference from bigframes.bigquery._operations.json import ( json_extract, json_extract_array, @@ -48,6 +48,7 @@ "array_to_string", # geo ops "st_area", + "st_difference", # json ops "json_set", "json_extract", diff --git a/bigframes/bigquery/_operations/geo.py b/bigframes/bigquery/_operations/geo.py index 7b8e47e2da..a41c33f67d 100644 --- a/bigframes/bigquery/_operations/geo.py +++ b/bigframes/bigquery/_operations/geo.py @@ -15,6 +15,7 @@ from __future__ import annotations from bigframes import operations as ops +import bigframes.dtypes import bigframes.geopandas import bigframes.series @@ -91,3 +92,122 @@ def st_area(series: bigframes.series.Series) -> bigframes.series.Series: series = series._apply_unary_op(ops.geo_area_op) series.name = None return series + + +def st_difference( + series: bigframes.series.Series, other: bigframes.series.Series +) -> bigframes.series.Series: + """ + Returns a GEOGRAPHY that represents the point set difference of + `geography_1` and `geography_2`. Therefore, the result consists of the part + of `geography_1` that doesn't intersect with `geography_2`. + + If `geography_1` is completely contained in `geography_2`, then ST_DIFFERENCE + returns an empty GEOGRAPHY. + + .. note:: + BigQuery's Geography functions, like `st_difference`, interpret the geometry + data type as a point set on the Earth's surface. A point set is a set + of points, lines, and polygons on the WGS84 reference spheroid, with + geodesic edges.
See: https://cloud.google.com/bigquery/docs/geospatial-data + + **Examples:** + + >>> import bigframes as bpd + >>> import bigframes.bigquery as bbq + >>> import bigframes.geopandas + >>> from shapely.geometry import Polygon, LineString, Point + >>> bpd.options.display.progress_bar = None + + We can check two GeoSeries against each other, row by row. + + >>> s1 = bigframes.geopandas.GeoSeries( + ... [ + ... Polygon([(0, 0), (2, 2), (0, 2)]), + ... Polygon([(0, 0), (2, 2), (0, 2)]), + ... LineString([(0, 0), (2, 2)]), + ... LineString([(2, 0), (0, 2)]), + ... Point(0, 1), + ... ], + ... ) + >>> s2 = bigframes.geopandas.GeoSeries( + ... [ + ... Polygon([(0, 0), (1, 1), (0, 1)]), + ... LineString([(1, 0), (1, 3)]), + ... LineString([(2, 0), (0, 2)]), + ... Point(1, 1), + ... Point(0, 1), + ... ], + ... index=range(1, 6), + ... ) + + >>> s1 + 0 POLYGON ((0 0, 2 2, 0 2, 0 0)) + 1 POLYGON ((0 0, 2 2, 0 2, 0 0)) + 2 LINESTRING (0 0, 2 2) + 3 LINESTRING (2 0, 0 2) + 4 POINT (0 1) + dtype: geometry + + >>> s2 + 1 POLYGON ((0 0, 1 1, 0 1, 0 0)) + 2 LINESTRING (1 0, 1 3) + 3 LINESTRING (2 0, 0 2) + 4 POINT (1 1) + 5 POINT (0 1) + dtype: geometry + + >>> bbq.st_difference(s1, s2) + 0 None + 1 POLYGON ((0.99954 1, 2 2, 0 2, 0 1, 0.99954 1)) + 2 LINESTRING (0 0, 1 1.00046, 2 2) + 3 GEOMETRYCOLLECTION EMPTY + 4 POINT (0 1) + 5 None + dtype: geometry + + We can also check difference of single shapely geometries: + + >>> sbq1 = bigframes.geopandas.GeoSeries( + ... [ + ... Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]) + ... ] + ... ) + >>> sbq2 = bigframes.geopandas.GeoSeries( + ... [ + ... Polygon([(4, 2), (6, 2), (8, 6), (4, 2)]) + ... ] + ... ) + + >>> sbq1 + 0 POLYGON ((0 0, 10 0, 10 10, 0 0)) + dtype: geometry + + >>> sbq2 + 0 POLYGON ((4 2, 6 2, 8 6, 4 2)) + dtype: geometry + + >>> bbq.st_difference(sbq1, sbq2) + 0 POLYGON ((0 0, 10 0, 10 10, 0 0), (8 6, 6 2, 4... 
+ dtype: geometry + + Additionally, we can check difference of a GeoSeries against a single shapely geometry: + + >>> bbq.st_difference(s1, sbq2) + 0 POLYGON ((0 0, 2 2, 0 2, 0 0)) + 1 None + 2 None + 3 None + 4 None + dtype: geometry + + Args: + other (bigframes.series.Series or geometric object): + The GeoSeries (elementwise) or geometric object to find the difference to. + + Returns: + bigframes.series.Series: + A GeoSeries of the points in each aligned geometry that are not + in other. + """ + return series._apply_binary_op(other, ops.geo_st_difference_op) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 35a307722f..ce0cd6c37a 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -1001,11 +1001,6 @@ def normalize_op_impl(x: ibis_types.Value): # Geo Ops -@scalar_op_compiler.register_unary_op(ops.geo_st_boundary_op, pass_op=False) -def geo_st_boundary_op_impl(x: ibis_types.Value): - return st_boundary(x) - - @scalar_op_compiler.register_unary_op(ops.geo_area_op) def geo_area_op_impl(x: ibis_types.Value): return typing.cast(ibis_types.GeoSpatialValue, x).area() @@ -1016,6 +1011,18 @@ def geo_st_astext_op_impl(x: ibis_types.Value): return typing.cast(ibis_types.GeoSpatialValue, x).as_text() +@scalar_op_compiler.register_unary_op(ops.geo_st_boundary_op, pass_op=False) +def geo_st_boundary_op_impl(x: ibis_types.Value): + return st_boundary(x) + + +@scalar_op_compiler.register_binary_op(ops.geo_st_difference_op, pass_op=False) +def geo_st_difference_op_impl(x: ibis_types.Value, y: ibis_types.Value): + return typing.cast(ibis_types.GeoSpatialValue, x).difference( + typing.cast(ibis_types.GeoSpatialValue, y) + ) + + @scalar_op_compiler.register_unary_op(ops.geo_st_geogfromtext_op) def geo_st_geogfromtext_op_impl(x: ibis_types.Value): # Ibis doesn't seem to provide a dedicated method to cast from string to geography, diff --git 
a/bigframes/geopandas/geoseries.py b/bigframes/geopandas/geoseries.py index 44018b8c5c..6c9cb77a08 100644 --- a/bigframes/geopandas/geoseries.py +++ b/bigframes/geopandas/geoseries.py @@ -62,7 +62,7 @@ def area(self, crs=None) -> bigframes.series.Series: # type: ignore Raises: NotImplementedError: - GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), insetead. + GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. """ raise NotImplementedError( f"GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. {constants.FEEDBACK_LINK}" @@ -93,3 +93,6 @@ def to_wkt(self: GeoSeries) -> bigframes.series.Series: series = self._apply_unary_op(ops.geo_st_astext_op) series.name = None return series + + def difference(self: GeoSeries, other: GeoSeries) -> bigframes.series.Series: # type: ignore + return self._apply_binary_op(other, ops.geo_st_difference_op) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 83cefbe6ba..2b4c9ca892 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -90,6 +90,7 @@ geo_area_op, geo_st_astext_op, geo_st_boundary_op, + geo_st_difference_op, geo_st_geogfromtext_op, geo_st_geogpoint_op, geo_x_op, @@ -366,6 +367,7 @@ # Geo ops "geo_area_op", "geo_st_boundary_op", + "geo_st_difference_op", "geo_st_astext_op", "geo_st_geogfromtext_op", "geo_st_geogpoint_op", diff --git a/bigframes/operations/geo_ops.py b/bigframes/operations/geo_ops.py index 9ef0983e24..3cf248bddb 100644 --- a/bigframes/operations/geo_ops.py +++ b/bigframes/operations/geo_ops.py @@ -37,6 +37,10 @@ ), ) +geo_st_difference_op = base_ops.create_binary_op( + name="geo_st_difference", type_signature=op_typing.BinaryGeo() +) + geo_st_geogfromtext_op = base_ops.create_unary_op( name="geo_st_geogfromtext", type_signature=op_typing.FixedOutputType( @@ -44,7 +48,6 @@ ), ) - geo_st_geogpoint_op = base_ops.create_binary_op( name="geo_st_geogpoint", 
type_signature=op_typing.BinaryNumericGeo() ) diff --git a/bigframes/operations/type.py b/bigframes/operations/type.py index 0a47cd91f0..b4029d74c7 100644 --- a/bigframes/operations/type.py +++ b/bigframes/operations/type.py @@ -122,6 +122,21 @@ def output_type( @dataclasses.dataclass +class BinaryGeo(BinaryTypeSignature): + """Type signature for geo functions like difference that can map geo to geo.""" + + def output_type( + self, left_type: ExpressionType, right_type: ExpressionType + ) -> ExpressionType: + # A None input type skips validation; any other type must be geo-like. + if (left_type is not None) and not bigframes.dtypes.is_geo_like(left_type): + raise TypeError(f"Type {left_type} is not geo") + if (right_type is not None) and not bigframes.dtypes.is_geo_like(right_type): + raise TypeError(f"Type {right_type} is not geo") + return bigframes.dtypes.GEO_DTYPE + + +@dataclasses.dataclass class BinaryNumericGeo(BinaryTypeSignature): """Type signature for geo functions like from_xy that can map ints to ints.""" diff --git a/notebooks/geo/geoseries.ipynb b/notebooks/geo/geoseries.ipynb index 7060128bf6..e7566f3fa6 100644 --- a/notebooks/geo/geoseries.ipynb +++ b/notebooks/geo/geoseries.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 6, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -30,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -49,14 +49,17 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/arwas/src1/python-bigquery-dataframes/bigframes/session/_io/bigquery/read_gbq_table.py:280: DefaultIndexWarning: Table 'bigquery-public-data.geo_us_boundaries.counties' is clustered and/or partitioned, but BigQuery DataFrames was not able to find a suitable index.
To avoid this warning, set at least one of: `index_col` or `filters`.\n", + "/usr/local/google/home/arwas/src1/python-bigquery-dataframes/bigframes/session/_io/bigquery/read_gbq_table.py:280: DefaultIndexWarning: \u001b[93mTable 'bigquery-public-data.geo_us_boundaries.counties' is clustered\n", + "and/or partitioned, but BigQuery DataFrames was not able to find a\n", + "suitable index. To avoid this warning, set at least one of:\n", + "`index_col` or `filters`.\u001b[0m\n", " warnings.warn(msg, category=bfe.DefaultIndexWarning)\n" ] } @@ -74,7 +77,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -97,21 +100,21 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "137 POINT (-86.87338 38.37334)\n", - "164 POINT (-118.48037 46.25461)\n", - "333 POINT (-92.5617 32.30429)\n", - "703 POINT (-83.46189 39.55525)\n", - "846 POINT (-119.46779 47.21363)\n", + "217 POINT (-86.80185 38.70532)\n", + "16 POINT (-83.47042 30.44723)\n", + "40 POINT (-94.33925 38.25722)\n", + "139 POINT (-78.88532 38.50758)\n", + "400 POINT (-95.6191 41.0337)\n", "Name: int_point_geom, dtype: geometry" ] }, - "execution_count": 10, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -130,21 +133,21 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 POINT (-86.87338 38.37334)\n", - "1 POINT (-118.48037 46.25461)\n", - "2 POINT (-92.5617 32.30429)\n", - "3 POINT (-83.46189 39.55525)\n", - "4 POINT (-119.46779 47.21363)\n", + "0 POINT (-86.80185 38.70532)\n", + "1 POINT (-83.47042 30.44723)\n", + "2 POINT (-94.33925 38.25722)\n", + "3 POINT (-78.88532 38.50758)\n", + "4 POINT (-95.6191 41.0337)\n", "dtype: geometry" ] }, - "execution_count": 11, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -179,21 
+182,21 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 -86.873385\n", - "1 -118.48037\n", - "2 -92.5617\n", - "3 -83.461893\n", - "4 -119.467788\n", + "0 -86.801847\n", + "1 -83.470416\n", + "2 -94.339246\n", + "3 -78.885321\n", + "4 -95.619101\n", "dtype: Float64" ] }, - "execution_count": 12, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -211,21 +214,21 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 38.373344\n", - "1 46.254606\n", - "2 32.30429\n", - "3 39.555246\n", - "4 47.213633\n", + "0 38.705322\n", + "1 30.447232\n", + "2 38.257217\n", + "3 38.507585\n", + "4 41.033703\n", "dtype: Float64" ] }, - "execution_count": 13, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -250,7 +253,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -284,7 +287,7 @@ "dtype: Float64" ] }, - "execution_count": 14, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -302,7 +305,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -336,7 +339,7 @@ "dtype: Float64" ] }, - "execution_count": 15, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -361,21 +364,21 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "78 POLYGON ((-95.97154 44.6306, -95.97919 44.6305...\n", - "130 POLYGON ((-95.0933 41.77694, -95.09331 41.7764...\n", - "544 POLYGON ((-96.0664 40.43618, -96.06639 40.4352...\n", - "995 POLYGON ((-101.83583 47.49547, -101.83665 47.4...\n", - "1036 POLYGON ((-88.42474 37.15094, -88.42526 37.149...\n", + "214 POLYGON ((-79.36704 34.96248, -79.36696 34.962...\n", + "161 POLYGON 
((-89.08844 33.53252, -89.08843 33.532...\n", + "57 POLYGON ((-110.75069 35.50001, -110.75069 35.4...\n", + "46 POLYGON ((-94.6865 39.04405, -94.68764 39.0440...\n", + "260 POLYGON ((-100.53965 34.99391, -100.53966 34.9...\n", "Name: county_geom, dtype: geometry" ] }, - "execution_count": 16, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -394,21 +397,21 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 POLYGON ((-95.97154 44.6306, -95.97919 44.6305...\n", - "1 POLYGON ((-95.0933 41.77694, -95.09331 41.7764...\n", - "2 POLYGON ((-96.0664 40.43618, -96.06639 40.4352...\n", - "3 POLYGON ((-101.83583 47.49547, -101.83665 47.4...\n", - "4 POLYGON ((-88.42474 37.15094, -88.42526 37.149...\n", + "0 POLYGON ((-79.36704 34.96248, -79.36696 34.962...\n", + "1 POLYGON ((-89.08844 33.53252, -89.08843 33.532...\n", + "2 POLYGON ((-110.75069 35.50001, -110.75069 35.4...\n", + "3 POLYGON ((-94.6865 39.04405, -94.68764 39.0440...\n", + "4 POLYGON ((-100.53965 34.99391, -100.53966 34.9...\n", "dtype: geometry" ] }, - "execution_count": 17, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -433,7 +436,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 13, "metadata": { "tags": [ "raises-exception" @@ -442,14 +445,14 @@ "outputs": [ { "ename": "NotImplementedError", - "evalue": "GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey.You are currently running BigFrames version 1.38.0", + "evalue": "GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey. 
You are currently running BigFrames version 1.40.0.", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNotImplementedError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[18], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mfive_geom\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea\u001b[49m\n", - "File \u001b[0;32m~/src1/python-bigquery-dataframes/bigframes/geopandas/geoseries.py:67\u001b[0m, in \u001b[0;36mGeoSeries.area\u001b[0;34m(self, crs)\u001b[0m\n\u001b[1;32m 48\u001b[0m \u001b[38;5;129m@property\u001b[39m\n\u001b[1;32m 49\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21marea\u001b[39m(\u001b[38;5;28mself\u001b[39m, crs\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m bigframes\u001b[38;5;241m.\u001b[39mseries\u001b[38;5;241m.\u001b[39mSeries: \u001b[38;5;66;03m# type: ignore\u001b[39;00m\n\u001b[1;32m 50\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Returns a Series containing the area of each geometry in the GeoSeries\u001b[39;00m\n\u001b[1;32m 51\u001b[0m \u001b[38;5;124;03m expressed in the units of the CRS.\u001b[39;00m\n\u001b[1;32m 52\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[38;5;124;03m GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), insetead.\u001b[39;00m\n\u001b[1;32m 66\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 67\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(\n\u001b[1;32m 68\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mconstants\u001b[38;5;241m.\u001b[39mFEEDBACK_LINK\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 69\u001b[0m )\n", - "\u001b[0;31mNotImplementedError\u001b[0m: GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey.You are currently running BigFrames version 1.38.0" + "Cell \u001b[0;32mIn[13], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mfive_geom\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea\u001b[49m\n", + "File \u001b[0;32m~/src1/python-bigquery-dataframes/bigframes/geopandas/geoseries.py:67\u001b[0m, in \u001b[0;36mGeoSeries.area\u001b[0;34m(self, crs)\u001b[0m\n\u001b[1;32m 48\u001b[0m \u001b[38;5;129m@property\u001b[39m\n\u001b[1;32m 49\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21marea\u001b[39m(\u001b[38;5;28mself\u001b[39m, crs\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m bigframes\u001b[38;5;241m.\u001b[39mseries\u001b[38;5;241m.\u001b[39mSeries: \u001b[38;5;66;03m# type: ignore\u001b[39;00m\n\u001b[1;32m 50\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Returns a Series containing the area of each geometry in the GeoSeries\u001b[39;00m\n\u001b[1;32m 51\u001b[0m \u001b[38;5;124;03m expressed in the units of the CRS.\u001b[39;00m\n\u001b[1;32m 52\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[38;5;124;03m GeoSeries.area is not supported. 
Use bigframes.bigquery.st_area(series), instead.\u001b[39;00m\n\u001b[1;32m 66\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 67\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(\n\u001b[1;32m 68\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mconstants\u001b[38;5;241m.\u001b[39mFEEDBACK_LINK\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 69\u001b[0m )\n", + "\u001b[0;31mNotImplementedError\u001b[0m: GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey. You are currently running BigFrames version 1.40.0." ] } ], @@ -461,12 +464,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### 3. Use `bigframes.bigquery.st_area` to retirive the `area` in square meters instead. See: https://cloud.google.com/bigquery/docs/reference/standard-sql/geography_functions#st_area" + "### 3. Use `bigframes.bigquery.st_area` to retrieve the `area` in square meters instead. 
See: https://cloud.google.com/bigquery/docs/reference/standard-sql/geography_functions#st_area" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -475,21 +478,21 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 1865212769.084914\n", - "1 1146753653.723439\n", - "2 1059653048.84506\n", - "3 2873655557.502374\n", - "4 886267772.361455\n", + "0 1014426111.476457\n", + "1 1196896004.730286\n", + "2 25794235993.165642\n", + "3 1242002056.351685\n", + "4 2381217221.963739\n", "dtype: Float64" ] }, - "execution_count": 20, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -515,21 +518,21 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 POINT (-86.87338 38.37334)\n", - "1 POINT (-118.48037 46.25461)\n", - "2 POINT (-92.5617 32.30429)\n", - "3 POINT (-83.46189 39.55525)\n", - "4 POINT (-119.46779 47.21363)\n", + "0 POINT (-86.80185 38.70532)\n", + "1 POINT (-83.47042 30.44723)\n", + "2 POINT (-94.33925 38.25722)\n", + "3 POINT (-78.88532 38.50758)\n", + "4 POINT (-95.6191 41.0337)\n", "dtype: geometry" ] }, - "execution_count": 21, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -554,21 +557,21 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 POINT(-86.8733845 38.3733441)\n", - "1 POINT(-118.4803697 46.2546057)\n", - "2 POINT(-92.5616997 32.3042901)\n", - "3 POINT(-83.4618927 39.5552462)\n", - "4 POINT(-119.467788 47.2136328)\n", + "0 POINT(-86.8018468 38.705322)\n", + "1 POINT(-83.4704159 30.4472325)\n", + "2 POINT(-94.3392459 38.2572171)\n", + "3 POINT(-78.8853213 38.5075848)\n", + "4 POINT(-95.619101 41.0337028)\n", "dtype: string" ] }, - "execution_count": 22, + 
"execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -594,21 +597,21 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 POINT (-86.87338 38.37334)\n", - "1 POINT (-118.48037 46.25461)\n", - "2 POINT (-92.5617 32.30429)\n", - "3 POINT (-83.46189 39.55525)\n", - "4 POINT (-119.46779 47.21363)\n", + "0 POINT (-86.80185 38.70532)\n", + "1 POINT (-83.47042 30.44723)\n", + "2 POINT (-94.33925 38.25722)\n", + "3 POINT (-78.88532 38.50758)\n", + "4 POINT (-95.6191 41.0337)\n", "dtype: geometry" ] }, - "execution_count": 23, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -627,7 +630,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -641,7 +644,7 @@ "dtype: geometry" ] }, - "execution_count": 24, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -662,7 +665,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -676,7 +679,7 @@ "dtype: geometry" ] }, - "execution_count": 25, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -684,6 +687,209 @@ "source": [ "geom_obj.geo.boundary" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Find the `difference` between two `GeoSeries` " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Reuse `wkts_from_geo` and `geom_obj` to find the difference between the geometry objects" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "tags": [ + "raises-exception" + ] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0 POINT (-86.80185 38.70532)\n", + "1 POINT (-83.47042 30.44723)\n", + "2 GEOMETRYCOLLECTION EMPTY\n", + "3 POINT (-78.88532 38.50758)\n", + "4 POINT (-95.6191 41.0337)\n", + "dtype: geometry" + ] + }, + "execution_count": 21, 
+ "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wkts_from_geo.difference(geom_obj)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Find the difference between a `GeoSeries` and a single geometry shape." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 POINT (-86.80185 38.70532)\n", + "1 None\n", + "2 None\n", + "3 None\n", + "4 None\n", + "dtype: geometry" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wkts_from_geo.difference([Polygon([(0, 0), (10, 0), (10, 10), (0, 0)])])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Find the difference in `GeoSeries` with the same shapes" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 GEOMETRYCOLLECTION EMPTY\n", + "1 GEOMETRYCOLLECTION EMPTY\n", + "2 GEOMETRYCOLLECTION EMPTY\n", + "3 GEOMETRYCOLLECTION EMPTY\n", + "4 GEOMETRYCOLLECTION EMPTY\n", + "dtype: geometry" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "geom_obj.difference(geom_obj)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## You can also use `bigframes.bigquery.st_difference()` to find the difference between two `GeoSeries`.
See, https://cloud.google.com/bigquery/docs/reference/standard-sql/geography_functions#st_difference" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 POINT (-86.80185 38.70532)\n", + "1 POINT (-83.47042 30.44723)\n", + "2 GEOMETRYCOLLECTION EMPTY\n", + "3 POINT (-78.88532 38.50758)\n", + "4 POINT (-95.6191 41.0337)\n", + "dtype: geometry" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bbq.st_difference(wkts_from_geo, geom_obj)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Find the difference between a `GeoSeries` and a single geometry shape." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 POINT (-86.80185 38.70532)\n", + "1 None\n", + "2 None\n", + "3 None\n", + "4 None\n", + "dtype: geometry" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bbq.st_difference(wkts_from_geo, [Polygon([(0, 0), (10, 0), (10, 10), (0, 0)])])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Find the difference in GeoSeries with the same shapes" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 GEOMETRYCOLLECTION EMPTY\n", + "1 GEOMETRYCOLLECTION EMPTY\n", + "2 GEOMETRYCOLLECTION EMPTY\n", + "3 GEOMETRYCOLLECTION EMPTY\n", + "4 GEOMETRYCOLLECTION EMPTY\n", + "dtype: geometry" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bbq.st_difference(geom_obj, geom_obj)" + ] } ], "metadata": { diff --git a/tests/system/small/bigquery/test_geo.py b/tests/system/small/bigquery/test_geo.py index 7d38cd7d91..538099e80a 100644 --- a/tests/system/small/bigquery/test_geo.py +++ 
b/tests/system/small/bigquery/test_geo.py @@ -14,7 +14,12 @@ import geopandas # type: ignore import pandas as pd -from shapely.geometry import LineString, Point, Polygon # type: ignore +from shapely.geometry import ( # type: ignore + GeometryCollection, + LineString, + Point, + Polygon, +) import bigframes.bigquery as bbq import bigframes.geopandas @@ -51,3 +56,92 @@ def test_geo_st_area(): check_exact=False, rtol=1, ) + + +def test_geo_st_difference_with_geometry_objects(): + data1 = [ + Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]), + Polygon([(0, 0), (1, 1), (0, 1), (0, 0)]), + Point(0, 1), + ] + + data2 = [ + Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]), + Polygon([(0, 0), (1, 1), (0, 1), (0, 0)]), + LineString([(2, 0), (0, 2)]), + ] + + geobf_s1 = bigframes.geopandas.GeoSeries(data=data1) + geobf_s2 = bigframes.geopandas.GeoSeries(data=data2) + geobf_s_result = bbq.st_difference(geobf_s1, geobf_s2).to_pandas() + + expected = bigframes.series.Series( + [ + GeometryCollection([]), + GeometryCollection([]), + Point(0, 1), + ], + index=[0, 1, 2], + dtype=geopandas.array.GeometryDtype(), + ).to_pandas() + + assert geobf_s_result.dtype == "geometry" + assert expected.iloc[0].equals(geobf_s_result.iloc[0]) + assert expected.iloc[1].equals(geobf_s_result.iloc[1]) + assert expected.iloc[2].equals(geobf_s_result.iloc[2]) + + +def test_geo_st_difference_with_single_geometry_object(): + data1 = [ + Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]), + Polygon([(4, 2), (6, 2), (8, 6), (4, 2)]), + Point(0, 1), + ] + + geobf_s1 = bigframes.geopandas.GeoSeries(data=data1) + geobf_s_result = bbq.st_difference( + geobf_s1, + bigframes.geopandas.GeoSeries( + [ + Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]), + Polygon([(1, 0), (0, 5), (0, 0), (1, 0)]), + ] + ), + ).to_pandas() + + expected = bigframes.series.Series( + [ + GeometryCollection([]), + Polygon([(4, 2), (6, 2), (8, 6), (4, 2)]), + None, + ], + index=[0, 1, 2], + dtype=geopandas.array.GeometryDtype(), + ).to_pandas() + + 
assert geobf_s_result.dtype == "geometry" + assert (expected.iloc[0]).equals(geobf_s_result.iloc[0]) + assert expected.iloc[1] == geobf_s_result.iloc[1] + assert expected.iloc[2] == geobf_s_result.iloc[2] + + +def test_geo_st_difference_with_similar_geometry_objects(): + data1 = [ + Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]), + Polygon([(0, 0), (1, 1), (0, 1)]), + Point(0, 1), + ] + + geobf_s1 = bigframes.geopandas.GeoSeries(data=data1) + geobf_s_result = bbq.st_difference(geobf_s1, geobf_s1).to_pandas() + + expected = bigframes.series.Series( + [GeometryCollection([]), GeometryCollection([]), GeometryCollection([])], + index=[0, 1, 2], + dtype=geopandas.array.GeometryDtype(), + ).to_pandas() + + assert geobf_s_result.dtype == "geometry" + assert expected.iloc[0].equals(geobf_s_result.iloc[0]) + assert expected.iloc[1].equals(geobf_s_result.iloc[1]) + assert expected.iloc[2].equals(geobf_s_result.iloc[2]) diff --git a/tests/system/small/geopandas/test_geoseries.py b/tests/system/small/geopandas/test_geoseries.py index d0987dbdaf..fdd9826468 100644 --- a/tests/system/small/geopandas/test_geoseries.py +++ b/tests/system/small/geopandas/test_geoseries.py @@ -20,7 +20,12 @@ import google.api_core.exceptions import pandas as pd import pytest -from shapely.geometry import LineString, Point, Polygon # type: ignore +from shapely.geometry import ( # type: ignore + GeometryCollection, + LineString, + Point, + Polygon, +) import bigframes.geopandas import bigframes.series @@ -194,3 +199,93 @@ def test_geo_boundary(): check_series_type=False, check_index=False, ) + + +# the GeoSeries and GeoPandas results are not always the same. +# For example, when the difference between two polygons is empty, +# GeoPandas returns 'POLYGON EMPTY' while GeoSeries returns 'GeometryCollection([])'. +# This is why we are hard-coding the expected results. 
+def test_geo_difference_with_geometry_objects(): + data1 = [ + Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]), + Polygon([(0, 0), (1, 1), (0, 1), (0, 0)]), + Point(0, 1), + ] + + data2 = [ + Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]), + Polygon([(0, 0), (1, 1), (0, 1), (0, 0)]), + LineString([(2, 0), (0, 2)]), + ] + + bf_s1 = bigframes.geopandas.GeoSeries(data=data1) + bf_s2 = bigframes.geopandas.GeoSeries(data=data2) + + bf_result = bf_s1.difference(bf_s2).to_pandas() + + expected = bigframes.geopandas.GeoSeries( + [ + Polygon([]), + Polygon([]), + Point(0, 1), + ], + index=[0, 1, 2], + ).to_pandas() + + assert bf_result.dtype == "geometry" + assert expected.iloc[0].equals(bf_result.iloc[0]) + assert expected.iloc[1].equals(bf_result.iloc[1]) + assert expected.iloc[2].equals(bf_result.iloc[2]) + + +def test_geo_difference_with_single_geometry_object(): + data1 = [ + Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]), + Polygon([(4, 2), (6, 2), (8, 6), (4, 2)]), + Point(0, 1), + ] + + bf_s1 = bigframes.geopandas.GeoSeries(data=data1) + bf_result = bf_s1.difference( + bigframes.geopandas.GeoSeries( + [ + Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]), + Polygon([(1, 0), (0, 5), (0, 0), (1, 0)]), + ] + ), + ).to_pandas() + + expected = bigframes.geopandas.GeoSeries( + [ + GeometryCollection([]), + Polygon([(4, 2), (6, 2), (8, 6), (4, 2)]), + None, + ], + index=[0, 1, 2], + ).to_pandas() + + assert bf_result.dtype == "geometry" + assert (expected.iloc[0]).equals(bf_result.iloc[0]) + assert expected.iloc[1] == bf_result.iloc[1] + assert expected.iloc[2] == bf_result.iloc[2] + + +def test_geo_difference_with_similar_geometry_objects(): + data1 = [ + Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]), + Polygon([(0, 0), (1, 1), (0, 1)]), + Point(0, 1), + ] + + bf_s1 = bigframes.geopandas.GeoSeries(data=data1) + bf_result = bf_s1.difference(bf_s1).to_pandas() + + expected = bigframes.geopandas.GeoSeries( + [GeometryCollection([]), GeometryCollection([]), GeometryCollection([])], 
index=[0, 1, 2], + ).to_pandas() + + assert bf_result.dtype == "geometry" + assert expected.iloc[0].equals(bf_result.iloc[0]) + assert expected.iloc[1].equals(bf_result.iloc[1]) + assert expected.iloc[2].equals(bf_result.iloc[2]) diff --git a/third_party/bigframes_vendored/geopandas/geoseries.py b/third_party/bigframes_vendored/geopandas/geoseries.py index a2e7b74059..b00d4220ff 100644 --- a/third_party/bigframes_vendored/geopandas/geoseries.py +++ b/third_party/bigframes_vendored/geopandas/geoseries.py @@ -239,3 +239,116 @@ def to_wkt(self) -> bigframes.series.Series: WKT representations of the geometries. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def difference(self: GeoSeries, other: GeoSeries) -> GeoSeries: # type: ignore + """ + Returns a GeoSeries of the points in each aligned geometry that are not + in `other`. + + The operation works in a 1-to-1 row-wise manner. + + **Examples:** + + >>> import bigframes as bpd + >>> import bigframes.geopandas + >>> from shapely.geometry import Polygon, LineString, Point + >>> bpd.options.display.progress_bar = None + + We can check two GeoSeries against each other, row by row. + + >>> s1 = bigframes.geopandas.GeoSeries( + ... [ + ... Polygon([(0, 0), (2, 2), (0, 2)]), + ... Polygon([(0, 0), (2, 2), (0, 2)]), + ... LineString([(0, 0), (2, 2)]), + ... LineString([(2, 0), (0, 2)]), + ... Point(0, 1), + ... ], + ... ) + >>> s2 = bigframes.geopandas.GeoSeries( + ... [ + ... Polygon([(0, 0), (1, 1), (0, 1)]), + ... LineString([(1, 0), (1, 3)]), + ... LineString([(2, 0), (0, 2)]), + ... Point(1, 1), + ... Point(0, 1), + ... ], + ... index=range(1, 6), + ... 
) + + >>> s1 + 0 POLYGON ((0 0, 2 2, 0 2, 0 0)) + 1 POLYGON ((0 0, 2 2, 0 2, 0 0)) + 2 LINESTRING (0 0, 2 2) + 3 LINESTRING (2 0, 0 2) + 4 POINT (0 1) + dtype: geometry + + >>> s2 + 1 POLYGON ((0 0, 1 1, 0 1, 0 0)) + 2 LINESTRING (1 0, 1 3) + 3 LINESTRING (2 0, 0 2) + 4 POINT (1 1) + 5 POINT (0 1) + dtype: geometry + + >>> s1.difference(s2) + 0 None + 1 POLYGON ((0.99954 1, 2 2, 0 2, 0 1, 0.99954 1)) + 2 LINESTRING (0 0, 1 1.00046, 2 2) + 3 GEOMETRYCOLLECTION EMPTY + 4 POINT (0 1) + 5 None + dtype: geometry + + We can also compute the difference of two single-row GeoSeries: + + >>> sbq1 = bigframes.geopandas.GeoSeries( + ... [ + ... Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]) + ... ] + ... ) + >>> sbq2 = bigframes.geopandas.GeoSeries( + ... [ + ... Polygon([(4, 2), (6, 2), (8, 6), (4, 2)]) + ... ] + ... ) + + >>> sbq1 + 0 POLYGON ((0 0, 10 0, 10 10, 0 0)) + dtype: geometry + + >>> sbq2 + 0 POLYGON ((4 2, 6 2, 8 6, 4 2)) + dtype: geometry + + >>> sbq1.difference(sbq2) + 0 POLYGON ((0 0, 10 0, 10 10, 0 0), (8 6, 6 2, 4... + dtype: geometry + + Additionally, we can compute the difference of a GeoSeries against a single-row GeoSeries; rows with no aligned counterpart are None: + + >>> s1.difference(sbq2) + 0 POLYGON ((0 0, 2 2, 0 2, 0 0)) + 1 None + 2 None + 3 None + 4 None + dtype: geometry + + Args: + other (GeoSeries or geometric object): + The GeoSeries (elementwise) or geometric object to find the + difference to. + + Returns: + bigframes.geopandas.GeoSeries: + A GeoSeries of the points in each aligned geometry that are not + in other. + + Raises: + NotImplementedError: + Raised by this vendored definition, whose body only raises. The + function form is bigframes.bigquery.st_difference(series, other). + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)