diff --git a/.github/workflows/python-wheel.yml b/.github/workflows/python-wheel.yml index eaea87d611..2297f39b08 100644 --- a/.github/workflows/python-wheel.yml +++ b/.github/workflows/python-wheel.yml @@ -54,6 +54,9 @@ jobs: if: runner.os == 'Linux' uses: docker/setup-qemu-action@v3 with: + # temporarily pin to qemu@v8 to work around non-deterministic gcc segfaults + # https://github.com/docker/setup-qemu-action/issues/188 + image: tonistiigi/binfmt:qemu-v8.1.5 platforms: all - name: Build wheels uses: pypa/cibuildwheel@v2.22.0 diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index aaca28df05..6aad4a97b7 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -163,13 +163,9 @@ jobs: - name: Run Spark Connect tests env: PYTHON_VERSION: ${{ matrix.python }} + SPARK_VERSION: ${{ matrix.spark }} + if: ${{ matrix.spark >= '3.4.0' }} run: | - if [ ! -f "${VENV_PATH}/lib/python${PYTHON_VERSION}/site-packages/pyspark/sbin/start-connect-server.sh" ] - then - echo "Skipping connect tests for Spark $SPARK_VERSION" - exit - fi - export SPARK_HOME=${VENV_PATH}/lib/python${PYTHON_VERSION}/site-packages/pyspark export SPARK_REMOTE=local diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f5df2b2a96..af5094cb4f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,12 +26,14 @@ repos: - repo: meta hooks: - id: identity + name: run identity check - id: check-hooks-apply + name: run check hooks apply - repo: https://github.com/Lucas-C/pre-commit-hooks rev: v1.5.5 hooks: - id: insert-license - name: Add license for all Markdown files + name: add license for all Markdown files files: \.md$ args: - --comment-style - --fuzzy-match-generates-todo exclude: ^docs/index\.md$|^\.github/pull_request_template\.md$|\.github/issue_template\.md$ - id: insert-license - name: Add license for all Makefile files + name: add license for all Makefile files files: ^Makefile$ args: - --comment-style - .github/workflows/license-templates/LICENSE.txt - --fuzzy-match-generates-todo - id: insert-license - name: Add license for all TOML files + name: add license for all TOML files files: \.toml$ args: - --comment-style - .github/workflows/license-templates/LICENSE.txt - --fuzzy-match-generates-todo - id: insert-license - name: Add license for all YAML files + name: add license for all YAML files files: \.ya?ml$ args: - --comment-style @@ -71,33 +73,43 @@ rev: 24.10.0 hooks: - id: black-jupyter + name: run black-jupyter + description: format Python files and Jupyter Notebooks with black - repo: https://github.com/pre-commit/mirrors-clang-format rev: v19.1.4 hooks: - id: clang-format + name: run clang-format + description: format C files with clang-format args: [--style=Google] types_or: [c] - repo: https://github.com/PyCQA/bandit rev: 1.7.10 hooks: - id: bandit + name: run bandit + description: check Python code for security issues args: ["-c=pyproject.toml", "-r"] - repo: https://github.com/codespell-project/codespell rev: v2.3.0 hooks: - id: codespell - name: Run codespell - description: Check spelling with codespell + name: run codespell + description: check spelling with codespell args: [--ignore-words=.github/linters/codespell.txt] exclude: ^docs/image|^spark/common/src/test/resources|^docs/usecases|^tools/maven/scalafmt - repo: https://github.com/gitleaks/gitleaks rev: v8.21.2 hooks: - id: gitleaks + name: run gitleaks + description: check for secrets with
gitleaks - repo: https://github.com/shssoichiro/oxipng rev: v9.1.2 hooks: - id: oxipng + name: run oxipng + description: check PNG files with oxipng args: ["-o", "4", "--strip", "safe", "--alpha"] - repo: https://github.com/pre-commit/pre-commit-hooks rev: v5.0.0 @@ -151,8 +163,8 @@ repos: rev: v0.43.0 hooks: - id: markdownlint - name: Run markdownlint - description: Check Markdown files with markdownlint + name: run markdownlint + description: check Markdown files with markdownlint args: [--config=.github/linters/.markdown-lint.yml] exclude: ^\.github/.*$ types: [markdown] @@ -161,12 +173,14 @@ repos: rev: v0.10.0.1 hooks: - id: shellcheck + name: run shellcheck + description: check Shell scripts with shellcheck - repo: https://github.com/adrienverge/yamllint rev: v1.35.1 hooks: - id: yamllint - name: Run yamllint - description: Check YAML files with yamllint + name: run yamllint + description: check YAML files with yamllint args: [--strict, -c=.github/linters/.yaml-lint.yml] types: [yaml] files: \.ya?ml$ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000000..c87a36b73a --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,44 @@ + + +# How to contribute to Apache Sedona + +Welcome! We'd love to have you contribute to Apache Sedona! + +## Did you find a bug? + +Create an issue with a reproducible example. Please specify the Sedona version, Java version, code snippet, and error message. + +## Did you create a PR to fix a bug? + +See [here](https://sedona.apache.org/latest/community/rule/#make-a-pull-request) for instructions on how to open PRs. + +We appreciate bug fixes - thank you in advance! + +## Would you like to add a new feature or change existing code? + +If you would like to add a feature or change existing behavior, please make sure to create an issue/JIRA ticket and get the planned work approved by the core team first! + +It's always better to get aligned with the core devs before writing any code. + +## Do you have questions about the source code? + +Feel free to create an issue or join the [Discord](https://share.hsforms.com/1Ndql_ZigTdmLlVQc_d1o4gqga4q) with questions! + +Thanks for reading and looking forward to collaborating with you! 
diff --git a/R/R/data_interface.R b/R/R/data_interface.R index fabe885a98..e04782340b 100644 --- a/R/R/data_interface.R +++ b/R/R/data_interface.R @@ -435,7 +435,7 @@ spark_read_shapefile <- function(sc, lapply(names(options), function(name) { if (!name %in% c("")) { - warning(paste0("Ignoring unknown option '", name,"'")) + warning(paste0("Ignoring unknown option '", name, "'")) } }) @@ -470,7 +470,7 @@ spark_read_geojson <- function(sc, if ("skip_syntactically_invalid_geometries" %in% names(options)) final_skip <- options[["skip_syntactically_invalid_geometries"]] else final_skip <- TRUE lapply(names(options), function(name) { if (!name %in% c("allow_invalid_geometries", "skip_syntactically_invalid_geometries")) { - warning(paste0("Ignoring unknown option '", name,"'")) + warning(paste0("Ignoring unknown option '", name, "'")) } }) diff --git a/R/tests/testthat/test-data-interface-raster.R b/R/tests/testthat/test-data-interface-raster.R index 67a1e6a0af..ec67ad1ee5 100644 --- a/R/tests/testthat/test-data-interface-raster.R +++ b/R/tests/testthat/test-data-interface-raster.R @@ -33,7 +33,7 @@ test_that("Passed RS_FromGeoTiff from binary", { mutate(raster = RS_FromGeoTiff(content)) expect_equal( - raster_sdf %>% sdf_schema() , + raster_sdf %>% sdf_schema(), list( path = list(name = "path", type = "StringType"), modificationTime = list(name = "modificationTime", type = "TimestampType"), @@ -65,7 +65,7 @@ test_that("Passed RS_FromArcInfoAsciiGrid from binary", { mutate(raster = RS_FromArcInfoAsciiGrid(content)) expect_equal( - raster_sdf %>% sdf_schema() , + raster_sdf %>% sdf_schema(), list( path = list(name = "path", type = "StringType"), modificationTime = list(name = "modificationTime", type = "TimestampType"), @@ -101,7 +101,7 @@ test_that("Passed RS_Envelope with raster", { ) expect_equal( - raster_sdf %>% sdf_schema() , + raster_sdf %>% sdf_schema(), list( path = list(name = "path", type = "StringType"), modificationTime = list(name = "modificationTime", type = "TimestampType"), diff --git a/R/vignettes/articles/apache-sedona.Rmd b/R/vignettes/articles/apache-sedona.Rmd index f5fda1b94d..da73e4c38c 100644 --- a/R/vignettes/articles/apache-sedona.Rmd +++ b/R/vignettes/articles/apache-sedona.Rmd @@ -447,7 +447,7 @@ Or change at runtime: ```{r} spark_session(sc) %>% invoke("conf") %>% - invoke("set", "sedona.global.index","false") + invoke("set", "sedona.global.index", "false") invoke_new(sc, "org.apache.sedona.core.utils.SedonaConf", invoke(spark_session(sc), "conf")) ``` diff --git a/R/vignettes/articles/raster.Rmd b/R/vignettes/articles/raster.Rmd index 7d2b8ea7fd..832fb0403e 100644 --- a/R/vignettes/articles/raster.Rmd +++ b/R/vignettes/articles/raster.Rmd @@ -107,7 +107,7 @@ dir(dest_file, recursive = TRUE) Available options see [Raster writer](../../../api/sql/Raster-writer/): * rasterField: the binary column to be saved (if there is only one takes that column by default, otherwise specify) -* fileExtension: `.tiff` bvy default, also accepts `.png`, `.jpeg`, `.asc` +* fileExtension: `.tiff` by default, also accepts `.png`, `.jpeg`, `.asc` * pathField: if used any column name that indicates the paths of each raster file, otherwise random UUIDs are generated. 
```{r} diff --git a/common/src/main/java/org/apache/sedona/common/Functions.java b/common/src/main/java/org/apache/sedona/common/Functions.java index 6122169325..b5a181aa29 100644 --- a/common/src/main/java/org/apache/sedona/common/Functions.java +++ b/common/src/main/java/org/apache/sedona/common/Functions.java @@ -80,6 +80,123 @@ public static double area(Geometry geometry) { return geometry.getArea(); } + public static Geometry labelPoint(Geometry geometry) { + return labelPoint(geometry, 16, 0.2); + } + + public static Geometry labelPoint(Geometry geometry, int gridResolution) { + return labelPoint(geometry, gridResolution, 0.2); + } + + public static Geometry labelPoint( + Geometry geometry, int gridResolution, double goodnessThreshold) { + if (geometry.getArea() <= 0) { + throw new IllegalArgumentException("Geometry must have a positive area"); + } + + GeometryFactory geometryFactory = new GeometryFactory(); + + // Find the largest polygon + Polygon largestPolygon = findLargestPolygon(geometry); + + if (largestPolygon == null) { + throw new IllegalArgumentException("Geometry must contain at least one Polygon"); + } + + return polygonToLabel(largestPolygon, gridResolution, goodnessThreshold, geometryFactory); + } + + private static Polygon findLargestPolygon(Geometry geometry) { + if (geometry instanceof Polygon) { + return (Polygon) geometry; + } + + if (geometry instanceof GeometryCollection) { + GeometryCollection gc = (GeometryCollection) geometry; + Polygon largestPolygon = null; + double maxArea = Double.MIN_VALUE; + + for (int i = 0; i < gc.getNumGeometries(); i++) { + Geometry subGeometry = gc.getGeometryN(i); + Polygon candidate = findLargestPolygon(subGeometry); + + if (candidate != null && candidate.getArea() > maxArea) { + largestPolygon = candidate; + maxArea = candidate.getArea(); + } + } + return largestPolygon; + } + + return null; + } + + private static Point polygonToLabel( + Polygon polygon, + int gridResolution, + double goodnessThreshold, + GeometryFactory geometryFactory) { + if (polygon.getArea() <= 0) { + throw new IllegalArgumentException("Polygon must have a positive area"); + } + + Envelope env = polygon.getEnvelopeInternal(); + double xmin = env.getMinX(); + double ymin = env.getMinY(); + double xmax = env.getMaxX(); + double ymax = env.getMaxY(); + + // Calculate step size based on grid resolution + double stepSizeX = (xmax - xmin) / gridResolution; + double stepSizeY = (ymax - ymin) / gridResolution; + + Point centroid = polygon.getCentroid(); + double radius = Math.sqrt(polygon.getArea() / Math.PI); + goodnessThreshold = radius * goodnessThreshold; + + double bestGoodness = labelGoodness(polygon, centroid); + + if (bestGoodness < goodnessThreshold) { + for (int x = 0; x < gridResolution; x++) { + for (int y = 0; y < gridResolution; y++) { + double candidateX = xmin + x * stepSizeX; + double candidateY = ymin + y * stepSizeY; + Point candidate = geometryFactory.createPoint(new Coordinate(candidateX, candidateY)); + + double candidateGoodness = labelGoodness(polygon, candidate); + + if (candidateGoodness > bestGoodness) { + centroid = candidate; + bestGoodness = candidateGoodness; + } + } + } + } + + return centroid; + } + + private static double labelGoodness(Geometry geometry, Point point) { + if (!geometry.intersects(point)) { + return 0.0; + } + + double closest = Double.POSITIVE_INFINITY; + Coordinate[] coordinates = geometry.getCoordinates(); + + for (Coordinate coord : coordinates) { + double dx = coord.x - point.getX(); + double dy = coord.y - 
point.getY(); + double distanceSquared = dx * dx + dy * dy; + + if (distanceSquared < closest) { + closest = distanceSquared; + } + } + + return Math.sqrt(closest); + } + public static double azimuth(Geometry left, Geometry right) { Coordinate leftCoordinate = left.getCoordinate(); Coordinate rightCoordinate = right.getCoordinate(); @@ -1039,6 +1156,36 @@ public static Geometry lineMerge(Geometry geometry) { return geometry.getFactory().createGeometryCollection(); } + public static Geometry[] lineSegments(Geometry geometry, boolean lenient) { + if (!(geometry instanceof LineString)) { + if (lenient) { + return new Geometry[] {}; + } else { + throw new IllegalArgumentException( + "Geometry is not a LineString. This function expects input geometry to be a LineString."); + } + } + + LineString line = (LineString) geometry; + Coordinate[] coords = line.getCoordinates(); + if (coords.length == 2 || coords.length == 0) { + return new Geometry[] {line}; + } + + GeometryFactory geometryFactory = geometry.getFactory(); + Geometry[] resultArray = new Geometry[coords.length - 1]; + for (int i = 1; i < coords.length; i++) { + resultArray[i - 1] = + geometryFactory.createLineString(new Coordinate[] {coords[i - 1], coords[i]}); + } + + return resultArray; + } + + public static Geometry[] lineSegments(Geometry geometry) { + return lineSegments(geometry, true); + } + public static Geometry minimumBoundingCircle(Geometry geometry, int quadrantSegments) { MinimumBoundingCircle minimumBoundingCircle = new MinimumBoundingCircle(geometry); Coordinate centre = minimumBoundingCircle.getCentre(); diff --git a/common/src/main/java/org/apache/sedona/common/utils/GeometryDuplicateCoordinateRemover.java b/common/src/main/java/org/apache/sedona/common/utils/GeometryDuplicateCoordinateRemover.java index 0838b75a3d..9f185d77fd 100644 --- a/common/src/main/java/org/apache/sedona/common/utils/GeometryDuplicateCoordinateRemover.java +++ b/common/src/main/java/org/apache/sedona/common/utils/GeometryDuplicateCoordinateRemover.java @@ -33,7 +33,7 @@ public static Coordinate[] removeDuplicates(Coordinate[] coords, int minPoints) double distance = Double.MAX_VALUE; - if (numPoint <= minPoints) return new Coordinate[0]; + if (numPoint < minPoints) return new Coordinate[0]; Coordinate lastPoint = coords[0]; int writeIndex = 1; diff --git a/common/src/test/java/org/apache/sedona/common/FunctionsTest.java b/common/src/test/java/org/apache/sedona/common/FunctionsTest.java index 85efbf17e2..642efe0216 100644 --- a/common/src/test/java/org/apache/sedona/common/FunctionsTest.java +++ b/common/src/test/java/org/apache/sedona/common/FunctionsTest.java @@ -63,6 +63,41 @@ protected int compareCoordinate( private final WKTReader wktReader = new WKTReader(); + @Test + public void labelPoint() throws Exception { + Geometry geom = + Constructors.geomFromWKT( + "POLYGON ((-112.637484 33.440546, -112.546852 33.477209, -112.489177 33.550488, -112.41777 33.751684, -111.956371 33.719707, -111.766868 33.616843, -111.775107 33.527595, -111.640533 33.504695, -111.440044 33.463462, -111.415326 33.374055, -111.514197 33.309809, -111.643279 33.222542, -111.893203 33.174278, -111.96461 33.250109, -112.123903 33.261593, -112.252985 33.35341, -112.406784 33.346527, -112.667694 33.316695, -112.637484 33.440546))", + 4326); + String labelPoint = Functions.asEWKT(Functions.labelPoint(geom)); + String expected = "SRID=4326;POINT (-112.04278737349767 33.46420809489905)"; + assertEquals(expected, labelPoint); + + geom = + Constructors.geomFromWKT( + 
"GEOMETRYCOLLECTION(POLYGON ((-112.840785 33.435962, -112.840785 33.708284, -112.409597 33.708284, -112.409597 33.435962, -112.840785 33.435962)), POLYGON ((-112.309264 33.398167, -112.309264 33.746007, -111.787444 33.746007, -111.787444 33.398167, -112.309264 33.398167)))", + 4326); + labelPoint = Functions.asEWKT(Functions.labelPoint(geom, 32)); + expected = "SRID=4326;POINT (-112.04835399999999 33.57208699999999)"; + assertEquals(expected, labelPoint); + + geom = + Constructors.geomFromWKT( + "POLYGON ((-112.654072 33.114485, -112.313516 33.653431, -111.63515 33.314399, -111.497829 33.874913, -111.692825 33.431378, -112.376684 33.788215, -112.654072 33.114485))", + 4326); + labelPoint = Functions.asEWKT(Functions.labelPoint(geom, 32, 0.01)); + expected = "SRID=4326;POINT (-112.0722602222832 33.53914975012836)"; + assertEquals(expected, labelPoint); + + geom = + Constructors.geomFromWKT( + "GEOMETRYCOLLECTION(GEOMETRYCOLLECTION(POLYGON ((-112.840785 33.435962, -112.840785 33.708284, -112.409597 33.708284, -112.409597 33.435962, -112.840785 33.435962)), POLYGON ((-112.309264 33.398167, -112.309264 33.746007, -111.787444 33.746007, -111.787444 33.398167, -112.309264 33.398167))), POLYGON ((-113.001222 33.223156, -112.991385 33.565242, -112.650316 33.452315, -113.001222 33.223156)))", + 4326); + labelPoint = Functions.asEWKT(Functions.labelPoint(geom)); + expected = "SRID=4326;POINT (-112.04835399999999 33.57208699999999)"; + assertEquals(expected, labelPoint); + } + @Test public void asEWKT() throws Exception { GeometryFactory geometryFactory = new GeometryFactory(new PrecisionModel(), 4236); @@ -1819,6 +1854,24 @@ public void removeRepeatedPointsMultiPoint() throws ParseException { actual = Functions.asWKT(Functions.removeRepeatedPoints(geom, 2000)); expected = "MULTIPOINT ((1 1))"; assertEquals(expected, actual); + + // The minimum number of coordinates in valid geometry shouldn't result in an empty geometry + geom = Constructors.geomFromWKT("POLYGON ((40 40, 70 70, 70 70, 40 40))", 0); + actual = Functions.asWKT(Functions.removeRepeatedPoints(geom)); + expected = "POLYGON ((40 40, 70 70, 70 70, 40 40))"; + assertEquals(expected, actual); + + geom = + Constructors.geomFromWKT( + "POLYGON ((40 40, 70 70, 70 70, 40 40), (40 40, 70 70, 50 50, 70 70, 40 40))", 0); + actual = Functions.asWKT(Functions.removeRepeatedPoints(geom)); + expected = "POLYGON ((40 40, 70 70, 70 70, 40 40), (40 40, 70 70, 50 50, 70 70, 40 40))"; + assertEquals(expected, actual); + + geom = Constructors.geomFromWKT("LINESTRING(0 0, 1 1)", 0); + actual = Functions.asWKT(Functions.removeRepeatedPoints(geom)); + expected = "LINESTRING (0 0, 1 1)"; + assertEquals(expected, actual); } @Test @@ -2378,6 +2431,45 @@ public void makeLineWithWrongType() { "ST_MakeLine only supports Point, MultiPoint and LineString geometries", e.getMessage()); } + @Test + public void lineSegments() throws ParseException { + Geometry geom = Constructors.geomFromWKT("LINESTRING (0 0, 1 1, 2 2, 3 3, 3 4)", 0); + Geometry[] actual = Functions.lineSegments(geom, false); + int actualSize = actual.length; + int expectedSize = 4; + assertEquals(expectedSize, actualSize); + + geom = Constructors.geomFromWKT("LINESTRING (0 0, 1 1)", 0); + actual = Functions.lineSegments(geom); + actualSize = actual.length; + expectedSize = 1; + assertEquals(expectedSize, actualSize); + + geom = Constructors.geomFromWKT("LINESTRING (0 0, 1 1, 2 2, 3 3, 3 4, 4 4)", 4326); + actual = Functions.lineSegments(geom); + actualSize = actual.length; + expectedSize = 5; + 
assertEquals(expectedSize, actualSize); + + // Check SRID + Geometry resultCheck = actual[0]; + assertEquals(4326, resultCheck.getSRID()); + + geom = GEOMETRY_FACTORY.createLineString(); + actual = Functions.lineSegments(geom); + String actualString = Arrays.toString(actual); + String expectedString = "[LINESTRING EMPTY]"; + assertEquals(expectedString, actualString); + + geom = + Constructors.geomFromWKT( + "POLYGON ((65.10498 18.625425, 62.182617 16.36231, 64.863281 16.40447, 62.006836 14.157882, 65.522461 14.008696, 65.10498 18.625425))", + 0); + actual = Functions.lineSegments(geom, true); + actualSize = actual.length; + assertEquals(0, actualSize); + } + @Test public void minimumBoundingRadius() { Point point = GEOMETRY_FACTORY.createPoint(new Coordinate(0, 0)); diff --git a/common/src/test/java/org/apache/sedona/common/raster/RasterConstructorsTest.java b/common/src/test/java/org/apache/sedona/common/raster/RasterConstructorsTest.java index deb4cce1b2..b32cd600a9 100644 --- a/common/src/test/java/org/apache/sedona/common/raster/RasterConstructorsTest.java +++ b/common/src/test/java/org/apache/sedona/common/raster/RasterConstructorsTest.java @@ -76,13 +76,6 @@ public void fromGeoTiff() throws IOException, FactoryException { assertEquals(4, gridCoverage2D.getNumSampleDimensions()); } - @Test - public void profileAsRaster() throws FactoryException, ParseException { - for (int i = 0; i < 1000000; i++) { - testAsRasterWithEmptyRaster(); - } - } - @Test public void testAsRasterWithEmptyRaster() throws FactoryException, ParseException { // Polygon diff --git a/docs-overrides/main.html b/docs-overrides/main.html index c9a301f2e9..ed0b9639af 100644 --- a/docs-overrides/main.html +++ b/docs-overrides/main.html @@ -5,20 +5,25 @@ {% block outdated %} You're not viewing the latest stable version. - + Click here to go to the latest stable version. {% endblock %} +{% block extrahead %} + {{ super() }} + +{% endblock %} + {% block content %} {% if page.is_homepage %}
@@ -41,7 +46,7 @@, @@ -56,35 +61,35 @@, and @@ -92,21 +97,21 @@: hunks that change only the HTML markup around the Apache Sedona Discord server link and the homepage "Sedona Ecosystem" panels; the visible text is unchanged. The "High Speed" panel keeps the claim "According to our benchmark and third-party research papers, Sedona runs 2X - 10X faster than other Spark-based geospatial data systems on computation-intensive query workloads." with the figure "Execution time of spatial join with polygons"; the "Low Memory Consumption" panel keeps "According to our benchmark and third-party research papers, Sedona has 50% less peak memory consumption than other Spark-based geospatial data systems for large-scale in-memory query processing." with the figure "Peak memory consumption of spatial join with polygons"; and the "Ease of Use" panel keeps "Sedona offers Scala, Java, Spatial SQL, Python, and R APIs and integrates them into underlying system kernels with care. You can simply create spatial analytics and data mining applications and run them in any cloud environments." with the Spatial SQL example SELECT superhero.name FROM city, superhero WHERE ST_Contains(city.geom, superhero.geom) AND city.name = 'Gotham'.
diff --git a/docs/api/flink/Function.md b/docs/api/flink/Function.md index ab5a3cdda5..8446bc5a69 100644 --- a/docs/api/flink/Function.md +++ b/docs/api/flink/Function.md @@ -183,6 +183,80 @@ Input: `POLYGON ((1 0 1, 1 1 1, 2 2 2, 1 0 1))` Output: `POLYGON Z((2 3 1, 4 5 1, 7 8 2, 2 3 1))` +## ST_LabelPoint + +Introduction: `ST_LabelPoint` computes and returns a label point for a given polygon or geometry collection. The label point is chosen to be sufficiently far from boundaries of the geometry. For a regular Polygon this will be the +centroid. + +The algorithm is derived from Tippecanoe’s `polygon_to_anchor`, an approximate solution for label point generation, designed to be faster than optimal algorithms like `polylabel`. It searches for a “good enough” label point within a limited number of iterations. For geometry collections, only the largest Polygon by area is considered. While `ST_Centroid` is a fast algorithm to calculate the center of mass of a (Multi)Polygon, it may place the point outside of the Polygon or near a boundary for concave shapes, polygons with holes, or MultiPolygons. + +`ST_LabelPoint` takes up to 3 arguments, + +- `geometry`: input geometry (e.g., a Polygon or GeometryCollection) for which the anchor point is to be calculated. +- `gridResolution` (Optional, default is 16): Controls the resolution of the search grid for refining the label point. A higher resolution increases the grid density, providing a higher chance of finding a good enough result at the cost of runtime. For example, a gridResolution of 16 divides the bounding box of the polygon into a 16x16 grid. +- `goodnessThreshold` (Optional, default is 0.2): Determines the minimum acceptable “goodness” value for the anchor point. Higher thresholds prioritize points farther from boundaries but may require more computation. + +!!!note + - `ST_LabelPoint` throws an `IllegalArgumentException` if the input geometry has an area of zero or less. + - Holes within polygons are respected. Points within a hole are given a goodness of 0. + - For GeometryCollections, only the largest polygon by area is considered. + +!!!tip + - Use `ST_LabelPoint` for tasks such as label placement, identifying representative points for polygons, or other spatial analyses where an internal reference point is preferred but not required. If intersection of the point and the original geometry is required, use of an algorithm like `polylabel` should be considered. + - `ST_LabelPoint` offers a faster, approximate solution for label point generation, making it ideal for large datasets or real-time applications. 
+ +Format: + +```sql +ST_LabelPoint(geometry: Geometry) +``` + +```sql +ST_LabelPoint(geometry: Geometry, gridResolution: Integer) +``` + +```sql +ST_LabelPoint(geometry: Geometry, gridResolution: Integer, goodnessThreshold: Double) +``` + +Since: `v1.7.1` + +SQL Example: + +``` +SELECT ST_LabelPoint(ST_GeomFromWKT('POLYGON((0 0, 4 0, 4 4, 0 4, 0 0))')) +``` + +Output: + +``` +POINT (2 2) +``` + +SQL Example: + +``` +SELECT ST_LabelPoint(ST_GeomFromWKT('GEOMETRYCOLLECTION(POLYGON ((-112.840785 33.435962, -112.840785 33.708284, -112.409597 33.708284, -112.409597 33.435962, -112.840785 33.435962)), POLYGON ((-112.309264 33.398167, -112.309264 33.746007, -111.787444 33.746007, -111.787444 33.398167, -112.309264 33.398167)))')) +``` + +Output: + +``` +POINT (-112.04835399999999 33.57208699999999) +``` + +SQL Example: + +``` +SELECT ST_LabelPoint(ST_GeomFromWKT('POLYGON ((-112.654072 33.114485, -112.313516 33.653431, -111.63515 33.314399, -111.497829 33.874913, -111.692825 33.431378, -112.376684 33.788215, -112.654072 33.114485))', 4326)) +``` + +Output: + +``` +SRID=4326;POINT (-112.0722602222832 33.53914975012836) +``` + ## ST_Angle Introduction: Compute and return the angle between two vectors represented by the provided points or linestrings. @@ -2517,6 +2591,47 @@ Output: LINESTRING (-29 -27, -30 -29.7, -45 -33, -46 -32) ``` +## ST_LineSegments + +Introduction: This function transforms a LineString containing multiple coordinates into an array of LineStrings, each with precisely two coordinates. The `lenient` argument, true by default, prevents an exception from being raised if the input geometry is not a LineString. + +Format: + +`ST_LineSegments(geom: Geometry, lenient: Boolean)` + +`ST_LineSegments(geom: Geometry)` + +Since: `v1.7.1` + +SQL Example: + +```sql +SELECT ST_LineSegments( + ST_GeomFromWKT('LINESTRING(0 0, 10 10, 20 20, 30 30, 40 40, 50 50)'), + false + ) +``` + +Output: + +``` +[LINESTRING (0 0, 10 10), LINESTRING (10 10, 20 20), LINESTRING (20 20, 30 30), LINESTRING (30 30, 40 40), LINESTRING (40 40, 50 50)] +``` + +SQL Example: + +```sql +SELECT ST_LineSegments( + ST_GeomFromWKT('POLYGON((0 0, 10 0, 10 10, 0 10, 0 0))') + ) +``` + +Output: + +``` +[] +``` + ## ST_LineSubstring Introduction: Return a linestring being a substring of the input one starting and ending at the given fractions of total 2d length. Second and third arguments are Double values between 0 and 1. This only works with LINESTRINGs. @@ -3117,6 +3232,54 @@ Output: 2216860.5497177234 ``` +## ST_Perimeter2D + +Introduction: This function calculates the 2D perimeter of a given geometry. It supports Polygon, MultiPolygon, and GeometryCollection geometries (as long as the GeometryCollection contains polygonal geometries). For other types, it returns 0. To measure lines, use [ST_Length](#st_length). + +To get the perimeter in meters, set `use_spheroid` to `true`. This calculates the geodesic perimeter using the WGS84 spheroid. When using `use_spheroid`, the `lenient` parameter defaults to true, assuming the geometry uses EPSG:4326. To throw an exception instead, set `lenient` to `false`. + +!!!Info + This function is an alias for [ST_Perimeter](#st_perimeter). 
+ +Format: + +`ST_Perimeter2D(geom: Geometry)` + +`ST_Perimeter2D(geom: Geometry, use_spheroid: Boolean)` + +`ST_Perimeter2D(geom: Geometry, use_spheroid: Boolean, lenient: Boolean = True)` + +Since: `v1.7.1` + +SQL Example: + +```sql +SELECT ST_Perimeter2D( + ST_GeomFromText('POLYGON((0 0, 0 5, 5 5, 5 0, 0 0))') +) +``` + +Output: + +``` +20.0 +``` + +SQL Example: + +```sql +SELECT ST_Perimeter2D( + ST_GeomFromText('POLYGON((0 0, 0 5, 5 5, 5 0, 0 0))', 4326), + true, false +) +``` + +Output: + +``` +2216860.5497177234 +``` + ## ST_PointN Introduction: Return the Nth point in a single linestring or circular linestring in the geometry. Negative values are counted backwards from the end of the LineString, so that -1 is the last point. Returns NULL if there is no linestring in the geometry. diff --git a/docs/api/snowflake/vector-data/Function.md b/docs/api/snowflake/vector-data/Function.md index bb32974120..f6b8c447f2 100644 --- a/docs/api/snowflake/vector-data/Function.md +++ b/docs/api/snowflake/vector-data/Function.md @@ -146,6 +146,78 @@ Input: `POLYGON ((1 0 1, 1 1 1, 2 2 2, 1 0 1))` Output: `POLYGON Z((2 3 1, 4 5 1, 7 8 2, 2 3 1))` +## ST_LabelPoint + +Introduction: `ST_LabelPoint` computes and returns a label point for a given polygon or geometry collection. The label point is chosen to be sufficiently far from boundaries of the geometry. For a regular Polygon this will be the +centroid. + +The algorithm is derived from Tippecanoe’s `polygon_to_anchor`, an approximate solution for label point generation, designed to be faster than optimal algorithms like `polylabel`. It searches for a “good enough” label point within a limited number of iterations. For geometry collections, only the largest Polygon by area is considered. While `ST_Centroid` is a fast algorithm to calculate the center of mass of a (Multi)Polygon, it may place the point outside of the Polygon or near a boundary for concave shapes, polygons with holes, or MultiPolygons. + +`ST_LabelPoint` takes up to 3 arguments, + +- `geometry`: input geometry (e.g., a Polygon or GeometryCollection) for which the anchor point is to be calculated. +- `gridResolution` (Optional, default is 16): Controls the resolution of the search grid for refining the label point. A higher resolution increases the grid density, providing a higher chance of finding a good enough result at the cost of runtime. For example, a gridResolution of 16 divides the bounding box of the polygon into a 16x16 grid. +- `goodnessThreshold` (Optional, default is 0.2): Determines the minimum acceptable “goodness” value for the anchor point. Higher thresholds prioritize points farther from boundaries but may require more computation. + +!!!note + - `ST_LabelPoint` throws an `IllegalArgumentException` if the input geometry has an area of zero or less. + - Holes within polygons are respected. Points within a hole are given a goodness of 0. + - For GeometryCollections, only the largest polygon by area is considered. + +!!!tip + - Use `ST_LabelPoint` for tasks such as label placement, identifying representative points for polygons, or other spatial analyses where an internal reference point is preferred but not required. If intersection of the point and the original geometry is required, use of an algorithm like `polylabel` should be considered. + - `ST_LabelPoint` offers a faster, approximate solution for label point generation, making it ideal for large datasets or real-time applications. 
+ +Format: + +```sql +ST_LabelPoint(geometry: Geometry) +``` + +```sql +ST_LabelPoint(geometry: Geometry, gridResolution: Integer) +``` + +```sql +ST_LabelPoint(geometry: Geometry, gridResolution: Integer, goodnessThreshold: Double) +``` + +SQL Example: + +``` +SELECT ST_LabelPoint(ST_GeomFromWKT('POLYGON((0 0, 4 0, 4 4, 0 4, 0 0))')) +``` + +Output: + +``` +POINT (2 2) +``` + +SQL Example: + +``` +SELECT ST_LabelPoint(ST_GeomFromWKT('GEOMETRYCOLLECTION(POLYGON ((-112.840785 33.435962, -112.840785 33.708284, -112.409597 33.708284, -112.409597 33.435962, -112.840785 33.435962)), POLYGON ((-112.309264 33.398167, -112.309264 33.746007, -111.787444 33.746007, -111.787444 33.398167, -112.309264 33.398167)))')) +``` + +Output: + +``` +POINT (-112.04835399999999 33.57208699999999) +``` + +SQL Example: + +``` +SELECT ST_LabelPoint(ST_GeomFromWKT('POLYGON ((-112.654072 33.114485, -112.313516 33.653431, -111.63515 33.314399, -111.497829 33.874913, -111.692825 33.431378, -112.376684 33.788215, -112.654072 33.114485))', 4326)) +``` + +Output: + +``` +SRID=4326;POINT (-112.0722602222832 33.53914975012836) +``` + ## ST_Angle Introduction: Computes and returns the angle between two vectors represented by the provided points or linestrings. @@ -2353,6 +2425,52 @@ Output: 2216860.5497177234 ``` +## ST_Perimeter2D + +Introduction: This function calculates the 2D perimeter of a given geometry. It supports Polygon, MultiPolygon, and GeometryCollection geometries (as long as the GeometryCollection contains polygonal geometries). For other types, it returns 0. To measure lines, use [ST_Length](#st_length). + +To get the perimeter in meters, set `use_spheroid` to `true`. This calculates the geodesic perimeter using the WGS84 spheroid. When using `use_spheroid`, the `lenient` parameter defaults to true, assuming the geometry uses EPSG:4326. To throw an exception instead, set `lenient` to `false`. + +!!!Info + This function is an alias for [ST_Perimeter](#st_perimeter). + +Format: + +`ST_Perimeter2D(geom: Geometry)` + +`ST_Perimeter2D(geom: Geometry, use_spheroid: Boolean)` + +`ST_Perimeter2D(geom: Geometry, use_spheroid: Boolean, lenient: Boolean = True)` + +SQL Example: + +```sql +SELECT ST_Perimeter2D( + ST_GeomFromText('POLYGON((0 0, 0 5, 5 5, 5 0, 0 0))') +) +``` + +Output: + +``` +20.0 +``` + +SQL Example: + +```sql +SELECT ST_Perimeter2D( + ST_GeomFromText('POLYGON((0 0, 0 5, 5 5, 5 0, 0 0))', 4326), + true, false +) +``` + +Output: + +``` +2216860.5497177234 +``` + ## ST_PointN Introduction: Return the Nth point in a single linestring or circular linestring in the geometry. Negative values are counted backwards from the end of the LineString, so that -1 is the last point. Returns NULL if there is no linestring in the geometry. diff --git a/docs/api/sql/Function.md b/docs/api/sql/Function.md index b9bc49c549..26d2ef9dbb 100644 --- a/docs/api/sql/Function.md +++ b/docs/api/sql/Function.md @@ -179,6 +179,80 @@ Input: `POLYGON ((1 0 1, 1 1 1, 2 2 2, 1 0 1))` Output: `POLYGON Z((2 3 1, 4 5 1, 7 8 2, 2 3 1))` +## ST_LabelPoint + +Introduction: `ST_LabelPoint` computes and returns a label point for a given polygon or geometry collection. The label point is chosen to be sufficiently far from boundaries of the geometry. For a regular Polygon this will be the +centroid. + +The algorithm is derived from Tippecanoe’s `polygon_to_anchor`, an approximate solution for label point generation, designed to be faster than optimal algorithms like `polylabel`. 
It searches for a “good enough” label point within a limited number of iterations. For geometry collections, only the largest Polygon by area is considered. While `ST_Centroid` is a fast algorithm to calculate the center of mass of a (Multi)Polygon, it may place the point outside of the Polygon or near a boundary for concave shapes, polygons with holes, or MultiPolygons. + +`ST_LabelPoint` takes up to 3 arguments, + +- `geometry`: input geometry (e.g., a Polygon or GeometryCollection) for which the anchor point is to be calculated. +- `gridResolution` (Optional, default is 16): Controls the resolution of the search grid for refining the label point. A higher resolution increases the grid density, providing a higher chance of finding a good enough result at the cost of runtime. For example, a gridResolution of 16 divides the bounding box of the polygon into a 16x16 grid. +- `goodnessThreshold` (Optional, default is 0.2): Determines the minimum acceptable “goodness” value for the anchor point. Higher thresholds prioritize points farther from boundaries but may require more computation. + +!!!note + - `ST_LabelPoint` throws an `IllegalArgumentException` if the input geometry has an area of zero or less. + - Holes within polygons are respected. Points within a hole are given a goodness of 0. + - For GeometryCollections, only the largest polygon by area is considered. + +!!!tip + - Use `ST_LabelPoint` for tasks such as label placement, identifying representative points for polygons, or other spatial analyses where an internal reference point is preferred but not required. If intersection of the point and the original geometry is required, use of an algorithm like `polylabel` should be considered. + - `ST_LabelPoint` offers a faster, approximate solution for label point generation, making it ideal for large datasets or real-time applications. + +Format: + +```sql +ST_LabelPoint(geometry: Geometry) +``` + +```sql +ST_LabelPoint(geometry: Geometry, gridResolution: Integer) +``` + +```sql +ST_LabelPoint(geometry: Geometry, gridResolution: Integer, goodnessThreshold: Double) +``` + +Since: `v1.7.1` + +SQL Example: + +``` +SELECT ST_LabelPoint(ST_GeomFromWKT('POLYGON((0 0, 4 0, 4 4, 0 4, 0 0))')) +``` + +Output: + +``` +POINT (2 2) +``` + +SQL Example: + +``` +SELECT ST_LabelPoint(ST_GeomFromWKT('GEOMETRYCOLLECTION(POLYGON ((-112.840785 33.435962, -112.840785 33.708284, -112.409597 33.708284, -112.409597 33.435962, -112.840785 33.435962)), POLYGON ((-112.309264 33.398167, -112.309264 33.746007, -111.787444 33.746007, -111.787444 33.398167, -112.309264 33.398167)))')) +``` + +Output: + +``` +POINT (-112.04835399999999 33.57208699999999) +``` + +SQL Example: + +``` +SELECT ST_LabelPoint(ST_GeomFromWKT('POLYGON ((-112.654072 33.114485, -112.313516 33.653431, -111.63515 33.314399, -111.497829 33.874913, -111.692825 33.431378, -112.376684 33.788215, -112.654072 33.114485))', 4326)) +``` + +Output: + +``` +SRID=4326;POINT (-112.0722602222832 33.53914975012836) +``` + ## ST_Angle Introduction: Computes and returns the angle between two vectors represented by the provided points or linestrings. @@ -634,6 +708,31 @@ Output: 32618 ``` +## ST_BinaryDistanceBandColumn + +Introduction: Returns a `weights` column containing every record in a dataframe within a specified `threshold` distance. + +The `weights` column is an array of structs containing the `attributes` from each neighbor and that neighbor's weight.
Since this is a binary distance band function, weights of neighbors within the threshold will always be +`1.0`. + +Format: `ST_BinaryDistanceBandColumn(geometry:Geometry, threshold: Double, includeZeroDistanceNeighbors: boolean, includeSelf: boolean, useSpheroid: boolean, attributes: Struct)` + +Since: `v1.7.1` + +SQL Example + +```sql +ST_BinaryDistanceBandColumn(geometry, 1.0, true, true, false, struct(id, geometry)) +``` + +Output: + +```sql +{% raw %} +[{{15, POINT (3 1.9)}, 1.0}, {{16, POINT (3 2)}, 1.0}, {{17, POINT (3 2.1)}, 1.0}, {{18, POINT (3 2.2)}, 1.0}] +{% endraw %} +``` + ## ST_Boundary Introduction: Returns the closure of the combinatorial boundary of this Geometry. @@ -1033,6 +1132,31 @@ true !!!Warning For geometries that span more than 180 degrees in longitude without actually crossing the Date Line, this function may still return true, indicating a crossing. +## ST_DBSCAN + +Introduction: Performs a DBSCAN clustering across the entire dataframe. + +Returns a struct containing the cluster ID and a boolean indicating if the record is a core point in the cluster. + +- `epsilon` is the maximum distance between two points for them to be considered as part of the same cluster. +- `minPoints` is the minimum number of neighbors a single record must have to form a cluster. + +Format: `ST_DBSCAN(geom: Geometry, epsilon: Double, minPoints: Integer)` + +Since: `v1.7.1` + +SQL Example + +```sql +SELECT ST_DBSCAN(geom, 1.0, 2) +``` + +Output: + +``` +{true, 85899345920} +``` + ## ST_Degrees Introduction: Convert an angle in radian to degrees. @@ -1800,6 +1924,31 @@ Output: ST_LINESTRING ``` +## ST_GLocal + +Introduction: Runs Getis and Ord's G Local (Gi or Gi*) statistic on the geometry given the `weights` and `level`. + +Getis and Ord's Gi and Gi* statistics are used to identify data points with locally high values (hot spots) and low +values (cold spots) in a spatial dataset. + +The `ST_WeightedDistanceBand` and `ST_BinaryDistanceBand` functions can be used to generate the `weights` column. + +Format: `ST_GLocal(geom: Geometry, weights: Struct, level: Int)` + +Since: `v1.7.1` + +SQL Example + +```sql +ST_GLocal(myVariable, ST_BinaryDistanceBandColumn(geometry, 1.0, true, true, false, struct(myVariable, geometry)), true) +``` + +Output: + +``` +{0.5238095238095238, 0.4444444444444444, 0.001049802637104223, 2.4494897427831814, 0.00715293921771476} +``` + ## ST_H3CellDistance Introduction: return result of h3 function [gridDistance(cel1, cell2)](https://h3geo.org/docs/api/traversal#griddistance). @@ -2561,6 +2710,47 @@ Output: LINESTRING (-29 -27, -30 -29.7, -45 -33, -46 -32) ``` +## ST_LineSegments + +Introduction: This function transforms a LineString containing multiple coordinates into an array of LineStrings, each with precisely two coordinates. The `lenient` argument, true by default, prevents an exception from being raised if the input geometry is not a LineString. 
+ +Format: + +`ST_LineSegments(geom: Geometry, lenient: Boolean)` + +`ST_LineSegments(geom: Geometry)` + +Since: `v1.7.1` + +SQL Example: + +```sql +SELECT ST_LineSegments( + ST_GeomFromWKT('LINESTRING(0 0, 10 10, 20 20, 30 30, 40 40, 50 50)'), + false + ) +``` + +Output: + +``` +[LINESTRING (0 0, 10 10), LINESTRING (10 10, 20 20), LINESTRING (20 20, 30 30), LINESTRING (30 30, 40 40), LINESTRING (40 40, 50 50)] +``` + +SQL Example: + +```sql +SELECT ST_LineSegments( + ST_GeomFromWKT('POLYGON((0 0, 10 0, 10 10, 0 10, 0 0))') + ) +``` + +Output: + +``` +[] +``` + ## ST_LineSubstring Introduction: Return a linestring being a substring of the input one starting and ending at the given fractions of total 2d length. Second and third arguments are Double values between 0 and 1. This only works with LINESTRINGs. @@ -2583,6 +2773,34 @@ Output: LINESTRING (69.28469348539744 94.28469348539744, 100 125, 111.70035626068274 140.21046313888758) ``` +## ST_LocalOutlierFactor + +Introduction: Computes the Local Outlier Factor (LOF) for each point in the input dataset. + +Local Outlier Factor is an algorithm for determining the degree to which a single record is an inlier or outlier. It is +based on how close a record is to its `k` nearest neighbors vs how close those neighbors are to their `k` nearest +neighbors. Values substantially less than `1` imply that the record is an inlier, while values greater than `1` imply that +the record is an outlier. + +!!!Note + ST_LocalOutlierFactor has a useSphere parameter rather than a useSpheroid parameter. This function thus uses a spherical model of the earth rather than an ellipsoidal model when calculating distance. + +Format: `ST_LocalOutlierFactor(geometry: Geometry, k: Int, useSphere: Boolean)` + +Since: `v1.7.1` + +SQL Example + +```sql +SELECT ST_LocalOutlierFactor(geometry, 5, true) +``` + +Output: + +``` +1.0009256283408587 +``` + ## ST_LocateAlong Introduction: This function computes Point or MultiPoint geometries representing locations along a measured input geometry (LineString or MultiLineString) corresponding to the provided measure value(s). Polygonal geometry inputs are not supported. The output points lie directly on the input line at the specified measure positions. @@ -3202,6 +3420,54 @@ Output: 2216860.5497177234 ``` +## ST_Perimeter2D + +Introduction: This function calculates the 2D perimeter of a given geometry. It supports Polygon, MultiPolygon, and GeometryCollection geometries (as long as the GeometryCollection contains polygonal geometries). For other types, it returns 0. To measure lines, use [ST_Length](#st_length). + +To get the perimeter in meters, set `use_spheroid` to `true`. This calculates the geodesic perimeter using the WGS84 spheroid. When using `use_spheroid`, the `lenient` parameter defaults to true, assuming the geometry uses EPSG:4326. To throw an exception instead, set `lenient` to `false`. + +!!!Info + This function is an alias for [ST_Perimeter](#st_perimeter). 
+ +Format: + +`ST_Perimeter2D(geom: Geometry)` + +`ST_Perimeter2D(geom: Geometry, use_spheroid: Boolean)` + +`ST_Perimeter2D(geom: Geometry, use_spheroid: Boolean, lenient: Boolean = True)` + +Since: `v1.7.1` + +SQL Example: + +```sql +SELECT ST_Perimeter2D( + ST_GeomFromText('POLYGON((0 0, 0 5, 5 5, 5 0, 0 0))') +) +``` + +Output: + +``` +20.0 +``` + +SQL Example: + +```sql +SELECT ST_Perimeter2D( + ST_GeomFromText('POLYGON((0 0, 0 5, 5 5, 5 0, 0 0))', 4326), + true, false +) +``` + +Output: + +``` +2216860.5497177234 +``` + ## ST_PointN Introduction: Return the Nth point in a single linestring or circular linestring in the geometry. Negative values are counted backwards from the end of the LineString, so that -1 is the last point. Returns NULL if there is no linestring in the geometry. @@ -4342,6 +4608,30 @@ Output: GEOMETRYCOLLECTION(POLYGON((-1 2,2 -1,-1 -1,-1 2)),POLYGON((-1 2,2 2,2 -1,-1 2))) ``` +## ST_WeightedDistanceBandColumn + +Introduction: Returns a `weights` column containing every record in a dataframe within a specified `threshold` distance. + +The `weights` column is an array of structs containing the `attributes` from each neighbor and that neighbor's weight. Since this is a distance-weighted distance band, weights will be distance^alpha. + +Format: `ST_WeightedDistanceBandColumn(geometry:Geometry, threshold: Double, alpha: Double, includeZeroDistanceNeighbors: boolean, includeSelf: boolean, selfWeight: Double, useSpheroid: boolean, attributes: Struct)` + +Since: `v1.7.1` + +SQL Example + +```sql +ST_WeightedDistanceBandColumn(geometry, 1.0, -1.0, true, true, 1.0, false, struct(id, geometry)) +``` + +Output: + +```sql +{% raw %} +[{{15, POINT (3 1.9)}, 1.0}, {{16, POINT (3 2)}, 9.999999999999991}, {{17, POINT (3 2.1)}, 4.999999999999996}, {{18, POINT (3 2.2)}, 3.3333333333333304}] +{% endraw %} +``` + ## ST_X Introduction: Returns X Coordinate of given Point null otherwise. diff --git a/docs/api/sql/Stac.md b/docs/api/sql/Stac.md new file mode 100644 index 0000000000..26ebd082b7 --- /dev/null +++ b/docs/api/sql/Stac.md @@ -0,0 +1,305 @@ + + +The STAC data source allows you to read data from a SpatioTemporal Asset Catalog (STAC) API. The data source supports reading STAC items and collections. + +## Usage + +To use the STAC data source, you can load a STAC catalog into a Sedona DataFrame using the stac format. The path can be either a local STAC collection JSON file or an HTTP/HTTPS endpoint to retrieve the collection JSON file.
You can load a STAC collection from a local collection file: + +```python +df = sedona.read.format("stac").load("/user/stac_collection.json") +df.printSchema() +df.show() +``` + +You can load a STAC collection from an S3 collection file object: + +```python +df = sedona.read.format("stac").load("s3a://example.com/stac_bucket/stac_collection.json") +df.printSchema() +df.show() +``` + +You can also load a STAC collection from an HTTP/HTTPS endpoint: + +```python +df = sedona.read.format("stac").load("https://earth-search.aws.element84.com/v1/collections/sentinel-2-pre-c1-l2a") +df.printSchema() +df.show() +``` + +Output: + +``` +root + |-- stac_version: string (nullable = false) + |-- stac_extensions: array (nullable = true) + | |-- element: string (containsNull = true) + |-- type: string (nullable = false) + |-- id: string (nullable = false) + |-- bbox: array (nullable = true) + | |-- element: double (containsNull = true) + |-- geometry: geometry (nullable = true) + |-- title: string (nullable = true) + |-- description: string (nullable = true) + |-- datetime: timestamp (nullable = true) + |-- start_datetime: timestamp (nullable = true) + |-- end_datetime: timestamp (nullable = true) + |-- created: timestamp (nullable = true) + |-- updated: timestamp (nullable = true) + |-- platform: string (nullable = true) + |-- instruments: array (nullable = true) + | |-- element: string (containsNull = true) + |-- constellation: string (nullable = true) + |-- mission: string (nullable = true) + |-- gsd: double (nullable = true) + |-- collection: string (nullable = true) + |-- links: array (nullable = true) + | |-- element: struct (containsNull = true) + | | |-- rel: string (nullable = true) + | | |-- href: string (nullable = true) + | | |-- type: string (nullable = true) + | | |-- title: string (nullable = true) + |-- assets: map (nullable = true) + | |-- key: string + | |-- value: struct (valueContainsNull = true) + | | |-- href: string (nullable = true) + | | |-- type: string (nullable = true) + | | |-- title: string (nullable = true) + ++------------+--------------------+-------+--------------------+--------------------+--------------------+-----+-----------+--------------------+--------------+------------+--------------------+--------------------+-----------+-----------+-------------+-------+----+--------------------+--------------------+--------------------+ +|stac_version| stac_extensions| type| id| bbox| geometry|title|description| datetime|start_datetime|end_datetime| created| updated| platform|instruments|constellation|mission| gsd| collection| links| assets| ++------------+--------------------+-------+--------------------+--------------------+--------------------+-----+-----------+--------------------+--------------+------------+--------------------+--------------------+-----------+-----------+-------------+-------+----+--------------------+--------------------+--------------------+ +| 1.0.0|[https://stac-ext...|Feature|S2B_T21NYC_202212...|[-55.202493, 1.71...|POLYGON ((-55.201...| NULL| NULL|2022-12-05 14:11:...| NULL| NULL|2024-05-01 21:13:...|2024-05-01 21:13:...|sentinel-2b| [msi]| sentinel-2| NULL|NULL|sentinel-2-pre-c1...|[{self, https://e...|{red -> {https://...| +| 1.0.0|[https://stac-ext...|Feature|S2B_T21NZC_202212...|[-54.30394, 1.719...|POLYGON ((-54.302...| NULL| NULL|2022-12-05 14:11:...| NULL| NULL|2024-05-03 00:39:...|2024-05-03 00:39:...|sentinel-2b| [msi]| sentinel-2|
NULL|NULL|sentinel-2-pre-c1...|[{self, https://e...|{red -> {https://...| +| 1.0.0|[https://stac-ext...|Feature|S2B_T22NBH_202212...|[-53.698196, 2.63...|POLYGON ((-53.698...| NULL| NULL|2022-12-05 14:11:...| NULL| NULL|2024-05-03 00:26:...|2024-05-03 00:26:...|sentinel-2b| [msi]| sentinel-2| NULL|NULL|sentinel-2-pre-c1...|[{self, https://e...|{red -> {https://...| +| 1.0.0|[https://stac-ext...|Feature|S2B_T21NYD_202212...|[-55.201423, 2.62...|POLYGON ((-55.199...| NULL| NULL|2022-12-05 14:11:...| NULL| NULL|2024-05-01 21:10:...|2024-05-01 21:10:...|sentinel-2b| [msi]| sentinel-2| NULL|NULL|sentinel-2-pre-c1...|[{self, https://e...|{red -> {https://...| +| 1.0.0|[https://stac-ext...|Feature|S2B_T21NZD_202212...|[-54.302336, 2.62...|POLYGON ((-54.299...| NULL| NULL|2022-12-05 14:11:...| NULL| NULL|2024-05-03 00:12:...|2024-05-03 00:12:...|sentinel-2b| [msi]| sentinel-2| NULL|NULL|sentinel-2-pre-c1...|[{self, https://e...|{red -> {https://...| +| 1.0.0|[https://stac-ext...|Feature|S2B_T22NBJ_202212...|[-53.700535, 2.63...|POLYGON ((-53.700...| NULL| NULL|2022-12-05 14:11:...| NULL| NULL|2024-05-03 00:30:...|2024-05-03 00:30:...|sentinel-2b| [msi]| sentinel-2| NULL|NULL|sentinel-2-pre-c1...|[{self, https://e...|{red -> {https://...| +| 1.0.0|[https://stac-ext...|Feature|S2B_T21NYE_202212...|[-55.199906, 3.52...|POLYGON ((-55.197...| NULL| NULL|2022-12-05 14:11:...| NULL| NULL|2024-05-01 21:24:...|2024-05-01 21:24:...|sentinel-2b| [msi]| sentinel-2| NULL|NULL|sentinel-2-pre-c1...|[{self, https://e...|{red -> {https://...| +| 1.0.0|[https://stac-ext...|Feature|S2B_T21NZE_202212...|[-54.300062, 3.52...|POLYGON ((-54.296...| NULL| NULL|2022-12-05 14:11:...| NULL| NULL|2024-05-03 00:14:...|2024-05-03 00:14:...|sentinel-2b| [msi]| sentinel-2| NULL|NULL|sentinel-2-pre-c1...|[{self, https://e...|{red -> {https://...| +| 1.0.0|[https://stac-ext...|Feature|S2B_T22NBK_202212...|[-53.703548, 3.52...|POLYGON ((-53.703...| NULL| NULL|2022-12-05 14:11:...| NULL| NULL|2024-05-03 00:32:...|2024-05-03 00:32:...|sentinel-2b| [msi]| sentinel-2| NULL|NULL|sentinel-2-pre-c1...|[{self, https://e...|{red -> {https://...| +| 1.0.0|[https://stac-ext...|Feature|S2B_T21NYF_202212...|[-55.197941, 4.42...|POLYGON ((-55.195...| NULL| NULL|2022-12-05 14:11:...| NULL| NULL|2024-05-01 21:43:...|2024-05-01 21:43:...|sentinel-2b| [msi]| sentinel-2| NULL|NULL|sentinel-2-pre-c1...|[{self, https://e...|{red -> {https://...| ++------------+--------------------+-------+--------------------+--------------------+--------------------+-----+-----------+--------------------+--------------+------------+--------------------+--------------------+-----------+-----------+-------------+-------+----+--------------------+--------------------+--------------------+ +``` + +# Filter Pushdown + +The STAC data source supports predicate pushdown for spatial and temporal filters. The data source can push down spatial and temporal filters to the underlying data source to reduce the amount of data that needs to be read. + +## Spatial Filter Pushdown + +Spatial filter pushdown allows the data source to apply spatial predicates (e.g., st_contains, st_intersects) directly at the data source level, reducing the amount of data transferred and processed. + +## Temporal Filter Pushdown + +Temporal filter pushdown allows the data source to apply temporal predicates (e.g., BETWEEN, >=, <=) directly at the data source level, similarly reducing the amount of data transferred and processed. 
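Both pushdowns apply when the loaded STAC DataFrame is queried through Spark SQL. As a minimal sketch (assuming an existing SedonaContext session named `sedona`, as in the usage examples above), you can register the DataFrame as the `STAC_TABLE` view used by the examples in the next section:

```python
# Load a STAC collection (the endpoint from the usage examples above) and
# register it as a temporary view so it can be queried with Spark SQL.
df = sedona.read.format("stac").load(
    "https://earth-search.aws.element84.com/v1/collections/sentinel-2-pre-c1-l2a"
)
df.createOrReplaceTempView("STAC_TABLE")
```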
+ +# Examples + +Here are some examples demonstrating how to query a STAC data source that is loaded into a table named `STAC_TABLE`. + +## SQL Select Without Filters + +```sql +SELECT id, datetime as dt, geometry, bbox FROM STAC_TABLE +``` + +## SQL Select With Temporal Filter + +```sql + SELECT id, datetime as dt, geometry, bbox + FROM STAC_TABLE + WHERE datetime BETWEEN '2020-01-01' AND '2020-12-13' +``` + +In this example, the data source will push down the temporal filter to the underlying data source. + +## SQL Select With Spatial Filter + +```sql + SELECT id, geometry + FROM STAC_TABLE + WHERE st_contains(ST_GeomFromText('POLYGON((17 10, 18 10, 18 11, 17 11, 17 10))'), geometry) +``` + +In this example, the data source will push down the spatial filter to the underlying data source. + +# Python API + +The Python API allows you to interact with a SpatioTemporal Asset Catalog (STAC) API using the Client class. This class provides methods to open a connection to a STAC API, retrieve collections, and search for items with various filters. + +## Client Class + +## Methods + +### `open(url: str) -> Client` + +Opens a connection to the specified STAC API URL. + +**Parameters:** + +- `url` (*str*): The URL of the STAC API to connect to. + **Example:** `"https://planetarycomputer.microsoft.com/api/stac/v1"` + +**Returns:** + +- `Client`: An instance of the `Client` class connected to the specified URL. + +--- + +### `get_collection(collection_id: str) -> CollectionClient` + +Retrieves a collection client for the specified collection ID. + +**Parameters:** + +- `collection_id` (*str*): The ID of the collection to retrieve. + **Example:** `"aster-l1t"` + +**Returns:** + +- `CollectionClient`: An instance of the `CollectionClient` class for the specified collection. + +--- + +### `search(*ids: Union[str, list], collection_id: str, bbox: Optional[list] = None, datetime: Optional[Union[str, datetime.datetime, list]] = None, max_items: Optional[int] = None, return_dataframe: bool = True) -> Union[Iterator[PyStacItem], DataFrame]` + +Searches for items in the specified collection with optional filters. + +**Parameters:** + +- `ids` (*Union[str, list]*): A variable number of item IDs to filter the items. + **Example:** `"item_id1"` or `["item_id1", "item_id2"]` +- `collection_id` (*str*): The ID of the collection to search in. + **Example:** `"aster-l1t"` +- `bbox` (*Optional[list]*): A list of bounding boxes for filtering the items. Each bounding box is represented as a list of four float values: `[min_lon, min_lat, max_lon, max_lat]`. + **Example:** `[[ -180.0, -90.0, 180.0, 90.0 ]]` +- `datetime` (*Optional[Union[str, datetime.datetime, list]]*): A single datetime, RFC 3339-compliant timestamp, or a list of date-time ranges for filtering the items. + **Example:** + - `"2020-01-01T00:00:00Z"` + - `datetime.datetime(2020, 1, 1)` + - `[["2020-01-01T00:00:00Z", "2021-01-01T00:00:00Z"]]` +- `max_items` (*Optional[int]*): The maximum number of items to return from the search, even if there are more matching results. + **Example:** `100` +- `return_dataframe` (*bool*): If `True` (default), return the result as a Spark DataFrame instead of an iterator of `PyStacItem` objects. + **Example:** `True` + +**Returns:** + +- *Union[Iterator[PyStacItem], DataFrame]*: An iterator of `PyStacItem` objects or a Spark DataFrame that matches the specified filters. 
+
+## Sample Code
+
+### Initialize the Client
+
+```python
+from sedona.stac.client import Client
+
+# Initialize the client
+client = Client.open("https://planetarycomputer.microsoft.com/api/stac/v1")
+```
+
+### Search Items on a Collection Within a Year
+
+```python
+items = client.search(
+    collection_id="aster-l1t",
+    datetime="2020",
+    return_dataframe=False
+)
+```
+
+### Search Items on a Collection Within a Month with a Max Items Limit
+
+```python
+items = client.search(
+    collection_id="aster-l1t",
+    datetime="2020-05",
+    return_dataframe=False,
+    max_items=5
+)
+```
+
+### Search Items with Bounding Box and Interval
+
+```python
+items = client.search(
+    collection_id="aster-l1t",
+    ids=["AST_L1T_00312272006020322_20150518201805"],
+    bbox=[-180.0, -90.0, 180.0, 90.0],
+    datetime=["2006-01-01T00:00:00Z", "2007-01-01T00:00:00Z"],
+    return_dataframe=False
+)
+```
+
+### Search Multiple Items with Multiple Bounding Boxes
+
+```python
+bbox_list = [
+    [-180.0, -90.0, 180.0, 90.0],
+    [-100.0, -50.0, 100.0, 50.0]
+]
+items = client.search(
+    collection_id="aster-l1t",
+    bbox=bbox_list,
+    return_dataframe=False
+)
+```
+
+### Search Items and Get DataFrame as Return with Multiple Intervals
+
+```python
+interval_list = [
+    ["2020-01-01T00:00:00Z", "2020-06-01T00:00:00Z"],
+    ["2020-07-01T00:00:00Z", "2021-01-01T00:00:00Z"]
+]
+df = client.search(
+    collection_id="aster-l1t",
+    datetime=interval_list,
+    return_dataframe=True
+)
+df.show()
+```
+
+### Save Items in DataFrame to GeoParquet with Both Bounding Boxes and Intervals
+
+```python
+client.get_collection("aster-l1t").save_to_geoparquet(
+    output_path="/path/to/output",
+    bbox=bbox_list,
+    datetime="2020-05"
+)
+```
+
+These examples demonstrate how to use the `Client` class to search for items in a STAC collection with various filters and return the results as either an iterator of `PyStacItem` objects or a Spark DataFrame.
+
+# References
+
+- STAC Specification: https://stacspec.org/
+
+- STAC Browser: https://github.com/radiantearth/stac-browser
+
+- STAC YouTube Video: https://www.youtube.com/watch?v=stac-video
diff --git a/docs/community/develop.md b/docs/community/develop.md
index d9aefa3b14..264d349c85 100644
--- a/docs/community/develop.md
+++ b/docs/community/develop.md
@@ -23,7 +23,7 @@
 
 ### IDE
 
-We recommend [Intellij IDEA](https://www.jetbrains.com/idea/) with Scala plugin installed. Please make sure that the IDE has JDK 1.8 set as project default.
+We recommend [IntelliJ IDEA](https://www.jetbrains.com/idea/) with the Scala plugin installed. Please make sure that the project SDK is set to a JDK 1.8.
 
 ### Import the project
 
@@ -120,6 +120,18 @@ You can fix this issue by disabling `Use '--release' option for cross-compilatio
 
 ![Disable "Use '--release' option for cross-compilation" when using Java 11](../image/ide-java-13.png)
 
+### Run Tests with Different Spark/Scala Versions
+
+If you want to test changes with different Spark/Scala versions, you can select the Spark and Scala profile in the Maven panel. Once you have selected the desired versions, reload the sedona-parent project. See the picture below.
+
+!!!Note
+    The profile change won't update the module names in the IDE. Don't be misled if a module still has a `-3.3-2.12` suffix in the name.
+
+!!!Note
+    Not all combinations of Spark and Scala versions are supported; unsupported combinations will fail to compile.
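+
+If you prefer the command line over the IDE, the same Spark/Scala selection can be made with Maven properties. The invocation below is a sketch based on the project's compile instructions (the `-Dspark`/`-Dscala` properties); the exact supported values depend on the release you have checked out:
+
+```
+mvn clean install -Dspark=3.5 -Dscala=2.12
+```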
+ +![Select Spark and Scala Profiles](../image/ide-java-14.png) + ## Python developers ### IDE diff --git a/docs/image/ide-java-14.png b/docs/image/ide-java-14.png new file mode 100644 index 0000000000..b7b4858834 Binary files /dev/null and b/docs/image/ide-java-14.png differ diff --git a/docs/image/tutorial/concepts/dbscan-clustering.png b/docs/image/tutorial/concepts/dbscan-clustering.png new file mode 100644 index 0000000000..8a839ddc01 Binary files /dev/null and b/docs/image/tutorial/concepts/dbscan-clustering.png differ diff --git a/docs/image/tutorial/concepts/dbscan-scatterplot-points.png b/docs/image/tutorial/concepts/dbscan-scatterplot-points.png new file mode 100644 index 0000000000..f5d45f90f4 Binary files /dev/null and b/docs/image/tutorial/concepts/dbscan-scatterplot-points.png differ diff --git a/docs/setup/databricks.md b/docs/setup/databricks.md index 4a970307e0..434a43f012 100644 --- a/docs/setup/databricks.md +++ b/docs/setup/databricks.md @@ -36,8 +36,9 @@ org.datasyslab:geotools-wrapper:{{ sedona.current_geotools }} ``` apache-sedona=={{ sedona.current_version }} -keplergl==0.3.2 -pydeck==0.8.0 +geopandas==1.0.1 +keplergl==0.3.7 +pydeck==0.9.1 ``` ### Initialize @@ -138,9 +139,9 @@ For enabling python support, from the Libraries tab install from PyPI ``` apache-sedona=={{ sedona.current_version }} -geopandas==0.11.1 -keplergl==0.3.2 -pydeck==0.8.0 +geopandas==1.0.1 +keplergl==0.3.7 +pydeck==0.9.1 ``` !!!tips diff --git a/docs/tutorial/concepts/clustering-algorithms.md b/docs/tutorial/concepts/clustering-algorithms.md new file mode 100644 index 0000000000..830b0667ee --- /dev/null +++ b/docs/tutorial/concepts/clustering-algorithms.md @@ -0,0 +1,133 @@ + + +# Apache Sedona Clustering Algorithms + +Clustering algorithms group similar data points into “clusters.” Apache Sedona can run clustering algorithms on large geometric datasets. + +Note that the term cluster is overloaded here: + +* A computation cluster is a network of computers that work together to execute the algorithm +* A clustering algorithm divides data points into different “clusters” + +This page uses “cluster” to refer to the output of a clustering algorithm. + +## Clustering with DBSCAN + +This page explains how to use Apache Sedona to perform density-based spatial clustering of applications with noise (“DBSCAN”). + +This algorithm groups geometric objects in high-density areas as clusters and marks points in low-density areas as outliers. + +Let’s look at a scatter plot of points to visualize a data set that can be clustered. + +![scatter plot of points](../../image/tutorial/concepts/dbscan-scatterplot-points.png) + +Here’s how the DBSCAN algorithm clusters the points: + +![scatter point with cluster groupings](../../image/tutorial/concepts/dbscan-clustering.png) + +* 5 points are in cluster 0 +* 4 points are in cluster 1 +* 4 points are outliers + +Let’s create a Spark DataFrame with this data and then run the clustering with Sedona. 
Here’s how to construct the DataFrame:
+
+```python
+from pyspark.sql.functions import col
+
+from sedona.sql.st_constructors import ST_Point
+
+df = (
+    sedona.createDataFrame([
+        (1, 8.0, 2.0),
+        (2, 2.6, 4.0),
+        (3, 2.5, 4.0),
+        (4, 8.5, 2.5),
+        (5, 2.8, 4.3),
+        (6, 12.8, 4.5),
+        (7, 2.5, 4.2),
+        (8, 8.2, 2.5),
+        (9, 8.0, 3.0),
+        (10, 1.0, 5.0),
+        (11, 8.0, 2.5),
+        (12, 5.0, 6.0),
+        (13, 4.0, 3.0),
+    ], ["id", "x", "y"])
+).withColumn("point", ST_Point(col("x"), col("y")))
+```
+
+Here are the contents of the DataFrame:
+
+```
++---+----+---+----------------+
+| id|   x|  y|           point|
++---+----+---+----------------+
+|  1| 8.0|2.0|     POINT (8 2)|
+|  2| 2.6|4.0|   POINT (2.6 4)|
+|  3| 2.5|4.0|   POINT (2.5 4)|
+|  4| 8.5|2.5| POINT (8.5 2.5)|
+|  5| 2.8|4.3| POINT (2.8 4.3)|
+|  6|12.8|4.5|POINT (12.8 4.5)|
+|  7| 2.5|4.2| POINT (2.5 4.2)|
+|  8| 8.2|2.5| POINT (8.2 2.5)|
+|  9| 8.0|3.0|     POINT (8 3)|
+| 10| 1.0|5.0|     POINT (1 5)|
+| 11| 8.0|2.5|   POINT (8 2.5)|
+| 12| 5.0|6.0|     POINT (5 6)|
+| 13| 4.0|3.0|     POINT (4 3)|
++---+----+---+----------------+
+```
+
+Here’s how to run the DBSCAN algorithm:
+
+```python
+from sedona.stats.clustering.dbscan import dbscan
+
+# epsilon=1.0 sets the neighborhood radius; min_pts=3 sets the minimum
+# number of neighbors a point needs to be considered a core point.
+dbscan(df, 1.0, 3).orderBy("id").show()
+```
+
+Here are the results of the computation:
+
+```
++---+----+---+----------------+------+-------+
+| id|   x|  y|           point|isCore|cluster|
++---+----+---+----------------+------+-------+
+|  1| 8.0|2.0|     POINT (8 2)|  true|      0|
+|  2| 2.6|4.0|   POINT (2.6 4)|  true|      1|
+|  3| 2.5|4.0|   POINT (2.5 4)|  true|      1|
+|  4| 8.5|2.5| POINT (8.5 2.5)|  true|      0|
+|  5| 2.8|4.3| POINT (2.8 4.3)|  true|      1|
+|  6|12.8|4.5|POINT (12.8 4.5)| false|     -1|
+|  7| 2.5|4.2| POINT (2.5 4.2)|  true|      1|
+|  8| 8.2|2.5| POINT (8.2 2.5)|  true|      0|
+|  9| 8.0|3.0|     POINT (8 3)|  true|      0|
+| 10| 1.0|5.0|     POINT (1 5)| false|     -1|
+| 11| 8.0|2.5|   POINT (8 2.5)|  true|      0|
+| 12| 5.0|6.0|     POINT (5 6)| false|     -1|
+| 13| 4.0|3.0|     POINT (4 3)| false|     -1|
++---+----+---+----------------+------+-------+
+```
+
+The `cluster` column indicates the cluster each geometry was assigned to; outliers are marked with `-1`, and the `isCore` column flags whether a point is a core point of its cluster.
+
+To run this operation, you must set the Spark checkpoint directory. The checkpoint directory is a temporary cache in durable storage where the query's intermediate results are written.
+
+Here is how you can set the checkpoint directory:
+
+```python
+sedona.sparkContext.setCheckpointDir(myPath)
+```
+
+`myPath` needs to be accessible to all executors. A local path is a good option on a local machine; when available, HDFS is likely the best choice. Some runtime environments may allow or require block storage paths (e.g., Amazon S3, Google Cloud Storage), and some may already set the Spark checkpoint directory, in which case this step is not necessary.
diff --git a/docs/tutorial/geopandas-shapely.md b/docs/tutorial/geopandas-shapely.md
index 2e78442876..bc286229b8 100644
--- a/docs/tutorial/geopandas-shapely.md
+++ b/docs/tutorial/geopandas-shapely.md
@@ -115,6 +115,26 @@ gdf.plot(

+
+You may also wish to try converting to GeoPandas via GeoArrow, which can be
+significantly faster for large results (requires geopandas >= 1.0).
+
+```python
+import geopandas as gpd
+
+from sedona.spark import SedonaContext, dataframe_to_arrow
+
+config = SedonaContext.builder().getOrCreate()
+
+sedona = SedonaContext.create(config)
+
+test_wkt = ["POINT (0 1)", "LINESTRING (0 1, 2 3)"]
+df = sedona.createDataFrame(zip(test_wkt), ["wkt"]).selectExpr(
+    "ST_GeomFromText(wkt) as geom"
+)
+
+gpd.GeoDataFrame.from_arrow(dataframe_to_arrow(df))
+```
+
 ## Interoperate with shapely objects
 
 ### Supported Shapely objects
diff --git a/docs/tutorial/rdd.md b/docs/tutorial/rdd.md
index b36dde62a4..fd5a1fcfde 100644
--- a/docs/tutorial/rdd.md
+++ b/docs/tutorial/rdd.md
@@ -32,197 +32,31 @@ Please refer to [Create Sedona config](sql.md#create-sedona-config) to create a
 
 Please refer to [Initiate SedonaContext](sql.md#initiate-sedonacontext) to initiate a SedonaContext.
 
-## Create a SpatialRDD
+## Create a SpatialRDD from SedonaSQL DataFrame
 
-### Create a typed SpatialRDD
-
-Sedona-core provides three special SpatialRDDs: PointRDD, PolygonRDD, and LineStringRDD.
-
-!!!warning
-    Typed SpatialRDD has been deprecated for a long time. We do NOT recommend it anymore.
-
-### Create a generic SpatialRDD
-
-A generic SpatialRDD is not typed to a certain geometry type and open to more scenarios. It allows an input data file contains mixed types of geometries. For instance, a WKT file contains three types geometries ==LineString==, ==Polygon== and ==MultiPolygon==.
-
-#### From WKT/WKB
-
-Geometries in a WKT and WKB file always occupy a single column no matter how many coordinates they have. Sedona provides `WktReader` and `WkbReader` to create generic SpatialRDD.
-
-Suppose we have a `checkin.tsv` WKT TSV file at Path `/Download/checkin.tsv` as follows:
-
-```
-POINT (-88.331492 32.324142) hotel
-POINT (-88.175933 32.360763) gas
-POINT (-88.388954 32.357073) bar
-POINT (-88.221102 32.35078) restaurant
-```
-
-This file has two columns and corresponding ==offsets==(Column IDs) are 0, 1. Column 0 is the WKT string and Column 1 is the checkin business type.
-
-Use the following code to create a SpatialRDD
+Please refer to [Create a Geometry type column](sql.md#create-a-geometry-type-column) to create a Geometry type column. Then you can create a SpatialRDD from the DataFrame.
=== "Scala" ```scala - val inputLocation = "/Download/checkin.tsv" - val wktColumn = 0 // The WKT string starts from Column 0 - val allowTopologyInvalidGeometries = true // Optional - val skipSyntaxInvalidGeometries = false // Optional - val spatialRDD = WktReader.readToGeometryRDD(sedona.sparkContext, inputLocation, wktColumn, allowTopologyInvalidGeometries, skipSyntaxInvalidGeometries) + var spatialRDD = StructuredAdapter.toSpatialRdd(spatialDf, "usacounty") ``` === "Java" ```java - String inputLocation = "/Download/checkin.tsv" - int wktColumn = 0 // The WKT string starts from Column 0 - boolean allowTopologyInvalidGeometries = true // Optional - boolean skipSyntaxInvalidGeometries = false // Optional - SpatialRDD spatialRDD = WktReader.readToGeometryRDD(sedona.sparkContext, inputLocation, wktColumn, allowTopologyInvalidGeometries, skipSyntaxInvalidGeometries) + SpatialRDD spatialRDD = StructuredAdapter.toSpatialRdd(spatialDf, "usacounty") ``` === "Python" ```python - from sedona.core.formatMapper import WktReader - from sedona.core.formatMapper import WkbReader - - WktReader.readToGeometryRDD(sc, wkt_geometries_location, 0, True, False) - - WkbReader.readToGeometryRDD(sc, wkb_geometries_location, 0, True, False) - ``` - -#### From GeoJSON - -!!!note - Reading GeoJSON using SpatialRDD is not recommended. Please use [Sedona SQL and DataFrame API](sql.md#load-geojson-data) to read GeoJSON files. - -Geometries in GeoJSON is similar to WKT/WKB. However, a GeoJSON file must be beaked into multiple lines. - -Suppose we have a `polygon.json` GeoJSON file at Path `/Download/polygon.json` as follows: - -``` -{ "type": "Feature", "properties": { "STATEFP": "01", "COUNTYFP": "077", "TRACTCE": "011501", "BLKGRPCE": "5", "AFFGEOID": "1500000US010770115015", "GEOID": "010770115015", "NAME": "5", "LSAD": "BG", "ALAND": 6844991, "AWATER": 32636 }, "geometry": { "type": "Polygon", "coordinates": [ [ [ -87.621765, 34.873444 ], [ -87.617535, 34.873369 ], [ -87.6123, 34.873337 ], [ -87.604049, 34.873303 ], [ -87.604033, 34.872316 ], [ -87.60415, 34.867502 ], [ -87.604218, 34.865687 ], [ -87.604409, 34.858537 ], [ -87.604018, 34.851336 ], [ -87.603716, 34.844829 ], [ -87.603696, 34.844307 ], [ -87.603673, 34.841884 ], [ -87.60372, 34.841003 ], [ -87.603879, 34.838423 ], [ -87.603888, 34.837682 ], [ -87.603889, 34.83763 ], [ -87.613127, 34.833938 ], [ -87.616451, 34.832699 ], [ -87.621041, 34.831431 ], [ -87.621056, 34.831526 ], [ -87.62112, 34.831925 ], [ -87.621603, 34.8352 ], [ -87.62158, 34.836087 ], [ -87.621383, 34.84329 ], [ -87.621359, 34.844438 ], [ -87.62129, 34.846387 ], [ -87.62119, 34.85053 ], [ -87.62144, 34.865379 ], [ -87.621765, 34.873444 ] ] ] } }, -{ "type": "Feature", "properties": { "STATEFP": "01", "COUNTYFP": "045", "TRACTCE": "021102", "BLKGRPCE": "4", "AFFGEOID": "1500000US010450211024", "GEOID": "010450211024", "NAME": "4", "LSAD": "BG", "ALAND": 11360854, "AWATER": 0 }, "geometry": { "type": "Polygon", "coordinates": [ [ [ -85.719017, 31.297901 ], [ -85.715626, 31.305203 ], [ -85.714271, 31.307096 ], [ -85.69999, 31.307552 ], [ -85.697419, 31.307951 ], [ -85.675603, 31.31218 ], [ -85.672733, 31.312876 ], [ -85.672275, 31.311977 ], [ -85.67145, 31.310988 ], [ -85.670622, 31.309524 ], [ -85.670729, 31.307622 ], [ -85.669876, 31.30666 ], [ -85.669796, 31.306224 ], [ -85.670356, 31.306178 ], [ -85.671664, 31.305583 ], [ -85.67177, 31.305299 ], [ -85.671878, 31.302764 ], [ -85.671344, 31.302123 ], [ -85.668276, 31.302076 ], [ -85.66566, 31.30093 ], [ -85.665687, 31.30022 ], [ 
-85.669183, 31.297677 ], [ -85.668703, 31.295638 ], [ -85.671985, 31.29314 ], [ -85.677177, 31.288211 ], [ -85.678452, 31.286376 ], [ -85.679236, 31.28285 ], [ -85.679195, 31.281426 ], [ -85.676865, 31.281049 ], [ -85.674661, 31.28008 ], [ -85.674377, 31.27935 ], [ -85.675714, 31.276882 ], [ -85.677938, 31.275168 ], [ -85.680348, 31.276814 ], [ -85.684032, 31.278848 ], [ -85.684387, 31.279082 ], [ -85.692398, 31.283499 ], [ -85.705032, 31.289718 ], [ -85.706755, 31.290476 ], [ -85.718102, 31.295204 ], [ -85.719132, 31.29689 ], [ -85.719017, 31.297901 ] ] ] } }, -{ "type": "Feature", "properties": { "STATEFP": "01", "COUNTYFP": "055", "TRACTCE": "001300", "BLKGRPCE": "3", "AFFGEOID": "1500000US010550013003", "GEOID": "010550013003", "NAME": "3", "LSAD": "BG", "ALAND": 1378742, "AWATER": 247387 }, "geometry": { "type": "Polygon", "coordinates": [ [ [ -86.000685, 34.00537 ], [ -85.998837, 34.009768 ], [ -85.998012, 34.010398 ], [ -85.987865, 34.005426 ], [ -85.986656, 34.004552 ], [ -85.985, 34.002659 ], [ -85.98851, 34.001502 ], [ -85.987567, 33.999488 ], [ -85.988666, 33.99913 ], [ -85.992568, 33.999131 ], [ -85.993144, 33.999714 ], [ -85.994876, 33.995153 ], [ -85.998823, 33.989548 ], [ -85.999925, 33.994237 ], [ -86.000616, 34.000028 ], [ -86.000685, 34.00537 ] ] ] } }, -{ "type": "Feature", "properties": { "STATEFP": "01", "COUNTYFP": "089", "TRACTCE": "001700", "BLKGRPCE": "2", "AFFGEOID": "1500000US010890017002", "GEOID": "010890017002", "NAME": "2", "LSAD": "BG", "ALAND": 1040641, "AWATER": 0 }, "geometry": { "type": "Polygon", "coordinates": [ [ [ -86.574172, 34.727375 ], [ -86.562684, 34.727131 ], [ -86.562797, 34.723865 ], [ -86.562957, 34.723168 ], [ -86.562336, 34.719766 ], [ -86.557381, 34.719143 ], [ -86.557352, 34.718322 ], [ -86.559921, 34.717363 ], [ -86.564827, 34.718513 ], [ -86.567582, 34.718565 ], [ -86.570572, 34.718577 ], [ -86.573618, 34.719377 ], [ -86.574172, 34.727375 ] ] ] } }, - -``` - -Use the following code to create a generic SpatialRDD: - -=== "Scala" - - ```scala - val inputLocation = "/Download/polygon.json" - val allowTopologyInvalidGeometries = true // Optional - val skipSyntaxInvalidGeometries = false // Optional - val spatialRDD = GeoJsonReader.readToGeometryRDD(sedona.sparkContext, inputLocation, allowTopologyInvalidGeometries, skipSyntaxInvalidGeometries) - ``` - -=== "Java" - - ```java - String inputLocation = "/Download/polygon.json" - boolean allowTopologyInvalidGeometries = true // Optional - boolean skipSyntaxInvalidGeometries = false // Optional - SpatialRDD spatialRDD = GeoJsonReader.readToGeometryRDD(sedona.sparkContext, inputLocation, allowTopologyInvalidGeometries, skipSyntaxInvalidGeometries) - ``` - -=== "Python" - - ```python - from sedona.core.formatMapper import GeoJsonReader - - GeoJsonReader.readToGeometryRDD(sc, geo_json_file_location) - ``` - -!!!warning - The way that Sedona reads JSON file is different from SparkSQL - -#### From Shapefile - -=== "Scala" - - ```scala - val shapefileInputLocation="/Download/myshapefile" - val spatialRDD = ShapefileReader.readToGeometryRDD(sedona.sparkContext, shapefileInputLocation) - ``` - -=== "Java" - - ```java - String shapefileInputLocation="/Download/myshapefile" - SpatialRDD spatialRDD = ShapefileReader.readToGeometryRDD(sedona.sparkContext, shapefileInputLocation) - ``` - -=== "Python" - - ```python - from sedona.core.formatMapper.shapefileParser import ShapefileReader - - ShapefileReader.readToGeometryRDD(sc, shape_file_location) - ``` - -!!!note - The path to the shapefile is the path to 
the folder that contains the .shp file, not the path to the .shp file itself. The file extensions of .shp, .shx, .dbf must be in lowercase. Assume you have a shape file called ==myShapefile==, the path should be `XXX/myShapefile`. The file structure should be like this: + from sedona.utils.structured_adapter import StructuredAdapter + spatialRDD = StructuredAdapter.toSpatialRdd(spatialDf, "usacounty") ``` - - shapefile1 - - shapefile2 - - myshapefile - - myshapefile.shp - - myshapefile.shx - - myshapefile.dbf - - myshapefile... - - ... - ``` - -If the file you are reading contains non-ASCII characters you'll need to explicitly set the Spark config before initializing the SparkSession, then you can use `ShapefileReader.readToGeometryRDD`. - -Example: - -```scala -spark.driver.extraJavaOptions -Dsedona.global.charset=utf8 -spark.executor.extraJavaOptions -Dsedona.global.charset=utf8 -``` - -#### From SedonaSQL DataFrame - -!!!note - More details about SedonaSQL, please read the SedonaSQL tutorial. - -To create a generic SpatialRDD from CSV, TSV, WKT, WKB and GeoJSON input formats, you can use SedonaSQL. - -We use checkin.csv CSV file as the example. You can create a generic SpatialRDD using the following steps: - -1. Load data in SedonaSQL. - -```scala -var df = sedona.read.format("csv").option("header", "false").load(csvPointInputLocation) -df.createOrReplaceTempView("inputtable") -``` - -2. Create a Geometry type column in SedonaSQL - -```scala -var spatialDf = sedona.sql( - """ - |SELECT ST_Point(CAST(inputtable._c0 AS Decimal(24,20)),CAST(inputtable._c1 AS Decimal(24,20))) AS checkin - |FROM inputtable - """.stripMargin) -``` -3. Use SedonaSQL DataFrame-RDD Adapter to convert a DataFrame to an SpatialRDD - -```scala -var spatialRDD = Adapter.toSpatialRdd(spatialDf, "checkin") -``` - -"checkin" is the name of the geometry column - -For WKT/WKB data, please use ==ST_GeomFromWKT / ST_GeomFromWKB == instead. +"usacounty" is the name of the geometry column. It is an optional parameter. If you don't provide it, the first geometry column will be used. ## Transform the Coordinate Reference System @@ -284,33 +118,6 @@ To convert Coordinate Reference System of an SpatialRDD, use the following code: The details CRS information can be found on [EPSG.io](https://epsg.io/) -## Read other attributes in an SpatialRDD - -Each SpatialRDD can carry non-spatial attributes such as price, age and name. - -The other attributes are combined together to a string and stored in ==UserData== field of each geometry. - -To retrieve the UserData field, use the following code: - -=== "Scala" - - ```scala - val rddWithOtherAttributes = objectRDD.rawSpatialRDD.rdd.map[String](f=>f.getUserData.asInstanceOf[String]) - ``` - -=== "Java" - - ```java - SpatialRDD spatialRDD = Adapter.toSpatialRdd(spatialDf, "arealandmark"); - spatialRDD.rawSpatialRDD.map(obj -> {return obj.getUserData();}); - ``` - -=== "Python" - - ```python - rdd_with_other_attributes = object_rdd.rawSpatialRDD.map(lambda x: x.getUserData()) - ``` - ## Write a Spatial Range Query A spatial range query takes as input a range query window and an SpatialRDD and returns all geometries that have specified relationship with the query window. @@ -380,7 +187,7 @@ Assume you now have a SpatialRDD (typed or generic). 
You can use the following c consider_boundary_intersection = False ## Only return gemeotries fully covered by the window using_index = False query_result = RangeQueryRaw.SpatialRangeQuery(spatial_rdd, range_query_window, consider_boundary_intersection, using_index) - gdf = Adapter.toDf(query_result, spark, ["col1", ..., "coln"]) + gdf = StructuredAdapter.toDf(query_result, spark, ["col1", ..., "coln"]) ``` ### Range query window @@ -872,6 +679,7 @@ The index should be built on either one of two SpatialRDDs. In general, you shou from sedona.core.SpatialRDD import CircleRDD from sedona.core.enums import GridType from sedona.core.spatialOperator import JoinQueryRaw + from sedona.utils.structured_adapter import StructuredAdapter object_rdd.analyze() @@ -886,7 +694,7 @@ The index should be built on either one of two SpatialRDDs. In general, you shou result = JoinQueryRaw.DistanceJoinQueryFlat(spatial_rdd, circle_rdd, using_index, consider_boundary_intersection) - gdf = Adapter.toDf(result, ["left_col1", ..., "lefcoln"], ["rightcol1", ..., "rightcol2"], spark) + gdf = StructuredAdapter.toDf(result, ["left_col1", ..., "lefcoln"], ["rightcol1", ..., "rightcol2"], spark) ``` ## Write a Distance Join Query @@ -972,45 +780,10 @@ The output format of the distance join query is [here](#output-format-2). ## Save to permanent storage -You can always save an SpatialRDD back to some permanent storage such as HDFS and Amazon S3. You can save distributed SpatialRDD to WKT, GeoJSON and object files. - -!!!note - Non-spatial attributes such as price, age and name will also be stored to permanent storage. +You can always save an SpatialRDD back to some permanent storage such as HDFS and Amazon S3. ### Save an SpatialRDD (not indexed) -Typed SpatialRDD and generic SpatialRDD can be saved to permanent storage. - -#### Save to distributed WKT text file - -Use the following code to save an SpatialRDD as a distributed WKT text file: - -```scala -objectRDD.rawSpatialRDD.saveAsTextFile("hdfs://PATH") -objectRDD.saveAsWKT("hdfs://PATH") -``` - -#### Save to distributed WKB text file - -Use the following code to save an SpatialRDD as a distributed WKB text file: - -```scala -objectRDD.saveAsWKB("hdfs://PATH") -``` - -#### Save to distributed GeoJSON text file - -!!!note - Saving GeoJSON using SpatialRDD is not recommended. Please use [Sedona SQL and DataFrame API](sql.md#save-as-geojson) to write GeoJSON files. - -Use the following code to save an SpatialRDD as a distributed GeoJSON text file: - -```scala -objectRDD.saveAsGeoJSON("hdfs://PATH") -``` - -#### Save to distributed object file - Use the following code to save an SpatialRDD as a distributed object file: === "Scala/Java" @@ -1032,8 +805,6 @@ Use the following code to save an SpatialRDD as a distributed object file: Indexed typed SpatialRDD and generic SpatialRDD can be saved to permanent storage. However, the indexed SpatialRDD has to be stored as a distributed object file. -#### Save to distributed object file - Use the following code to save an SpatialRDD as a distributed object file: ``` @@ -1046,16 +817,7 @@ A spatial partitioned RDD can be saved to permanent storage but Spark is not abl ### Reload a saved SpatialRDD -You can easily reload an SpatialRDD that has been saved to ==a distributed object file==. - -#### Load to a typed SpatialRDD - -!!!warning - Typed SpatialRDD has been deprecated for a long time. We do NOT recommend it anymore. 
- -#### Load to a generic SpatialRDD - -Use the following code to reload the SpatialRDD: +You can easily reload an SpatialRDD that has been saved to ==a distributed object file==. Use the following code to reload the SpatialRDD: === "Scala" diff --git a/docs/tutorial/sql.md b/docs/tutorial/sql.md index da2a0f52bb..009a21c87e 100644 --- a/docs/tutorial/sql.md +++ b/docs/tutorial/sql.md @@ -565,10 +565,6 @@ The character encoding of string attributes are inferred from the `.cpg` file. I df = sedona.read.format("shapefile").option("charset", "UTF-8").load("/path/to/shapefile") ``` -### (Deprecated) Loading Shapefile using SpatialRDD - -If you are using Sedona earlier than v`1.7.0`, you can load shapefiles as SpatialRDD and converted to DataFrame using Adapter. Please read [Load SpatialRDD](rdd.md#create-a-generic-spatialrdd) and [DataFrame <-> RDD](#convert-between-dataframe-and-spatialrdd). - ## Load GeoParquet Since v`1.3.0`, Sedona natively supports loading GeoParquet file. Sedona will infer geometry fields using the "geo" metadata in GeoParquet files. @@ -904,6 +900,8 @@ The output will look like this: +----------------+---+------+-------+ ``` +See [this page](../concepts/clustering-algorithms) for more information on the DBSCAN algorithm. + ## Calculate the Local Outlier Factor (LOF) Sedona provides an implementation of the [Local Outlier Factor](https://en.wikipedia.org/wiki/Local_outlier_factor) algorithm to identify anomalous data. @@ -1091,7 +1089,7 @@ SedonaPyDeck exposes APIs to create interactive map visualizations using [pydeck !!!Note To use SedonaPyDeck, install sedona with the `pydeck-map` extra: ``` - pip install sedona[pydeck-map] + pip install apache-sedona[pydeck-map] ``` The following tutorial showcases the various maps that can be created using SedonaPyDeck, the datasets used to create these maps are publicly available. @@ -1168,7 +1166,7 @@ SedonaKepler exposes APIs to create interactive and customizable map visualizati !!!Note To use SedonaKepler, install sedona with the `kepler-map` extra: ``` - pip install sedona[kepler-map] + pip install apache-sedona[kepler-map] ``` This tutorial showcases how simple it is to instantly visualize geospatial data using SedonaKepler. @@ -1592,32 +1590,29 @@ my_postgis_db# alter table my_table alter column geom type geometry; ### DataFrame to SpatialRDD -Use SedonaSQL DataFrame-RDD Adapter to convert a DataFrame to an SpatialRDD. Please read [Adapter Scaladoc](../api/scaladoc/spark/org/apache/sedona/sql/utils/index.html) +Use SedonaSQL DataFrame-RDD Adapter to convert a DataFrame to an SpatialRDD. === "Scala" ```scala - var spatialRDD = Adapter.toSpatialRdd(spatialDf, "usacounty") + var spatialRDD = StructuredAdapter.toSpatialRdd(spatialDf, "usacounty") ``` === "Java" ```java - SpatialRDD spatialRDD = Adapter.toSpatialRdd(spatialDf, "usacounty") + SpatialRDD spatialRDD = StructuredAdapter.toSpatialRdd(spatialDf, "usacounty") ``` === "Python" ```python - from sedona.utils.adapter import Adapter + from sedona.utils.structured_adapter import StructuredAdapter - spatialRDD = Adapter.toSpatialRdd(spatialDf, "usacounty") + spatialRDD = StructuredAdapter.toSpatialRdd(spatialDf, "usacounty") ``` -"usacounty" is the name of the geometry column - -!!!warning - Only one Geometry type column is allowed per DataFrame. +"usacounty" is the name of the geometry column. It is an optional parameter. If you don't provide it, the first geometry column will be used. 
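+
+For a minimal end-to-end sketch in Python (assuming an existing `sedona` session; the inline single-row DataFrame stands in for data you would normally load from storage):
+
+```python
+from sedona.utils.structured_adapter import StructuredAdapter
+
+# Build a DataFrame with a geometry column named "geom"
+spatialDf = sedona.sql("SELECT ST_GeomFromText('POINT (1 2)') AS geom, 'a' AS name")
+
+# Convert it to a SpatialRDD; the geometry column name is optional
+spatialRDD = StructuredAdapter.toSpatialRdd(spatialDf, "geom")
+```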
 
 ### SpatialRDD to DataFrame
 
@@ -1626,109 +1621,81 @@
 
 === "Scala"
 
     ```scala
-    var spatialDf = Adapter.toDf(spatialRDD, sedona)
+    var spatialDf = StructuredAdapter.toDf(spatialRDD, sedona)
     ```
 
 === "Java"
 
     ```java
-    Dataset spatialDf = Adapter.toDf(spatialRDD, sedona)
+    Dataset spatialDf = StructuredAdapter.toDf(spatialRDD, sedona)
     ```
 
 === "Python"
 
     ```python
-    from sedona.utils.adapter import Adapter
+    from sedona.utils.structured_adapter import StructuredAdapter
 
-    spatialDf = Adapter.toDf(spatialRDD, sedona)
+    spatialDf = StructuredAdapter.toDf(spatialRDD, sedona)
     ```
 
-All other attributes such as price and age will be also brought to the DataFrame as long as you specify ==carryOtherAttributes== (see [Read other attributes in an SpatialRDD](rdd.md#read-other-attributes-in-an-spatialrdd)).
-
-You may also manually specify a schema for the resulting DataFrame in case you require different column names or data
-types. Note that string schemas and not all data types are supported—please check the
-[Adapter Scaladoc](../api/javadoc/sql/org/apache/sedona/sql/utils/index.html) to confirm what is supported for your use
-case. At least one column for the user data must be provided.
+### SpatialRDD to DataFrame with Spatial Partitioning
 
-=== "Scala"
-
-    ```scala
-    val schema = StructType(Array(
-      StructField("county", GeometryUDT, nullable = true),
-      StructField("name", StringType, nullable = true),
-      StructField("price", DoubleType, nullable = true),
-      StructField("age", IntegerType, nullable = true)
-    ))
-    val spatialDf = Adapter.toDf(spatialRDD, schema, sedona)
-    ```
-
-### SpatialPairRDD to DataFrame
+By default, `StructuredAdapter.toDf()` does not preserve spatial partitions because doing so
+may introduce duplicate features for most types of spatial data. These duplicates
+are introduced on purpose to ensure correctness when performing a spatial join;
+however, when using Sedona to prepare a dataset for distribution this is not typically
+desired.
 
-PairRDD is the result of a spatial join query or distance join query. SedonaSQL DataFrame-RDD Adapter can convert the result to a DataFrame. But you need to provide the name of other attributes.
+You can use `StructuredAdapter` and the `spatialRDD.spatialPartitioningWithoutDuplicates` function to obtain a Sedona DataFrame that is spatially partitioned without duplicates. This is especially useful for generating balanced GeoParquet files that preserve spatial proximity within files, which is crucial for optimizing filter pushdown performance when the files are read back.
=== "Scala" ```scala - var joinResultDf = Adapter.toDf(joinResultPairRDD, Seq("left_attribute1", "left_attribute2"), Seq("right_attribute1", "right_attribute2"), sedona) + spatialRDD.spatialParitioningWithoutDuplicates(GridType.KDBTREE) + // Specify the desired number of partitions as 10, though the actual number may vary + // spatialRDD.spatialParitioningWithoutDuplicates(GridType.KDBTREE, 10) + var spatialDf = StructuredAdapter.toSpatialPartitionedDf(spatialRDD, sedona) ``` === "Java" ```java - import scala.collection.JavaConverters; - - List leftFields = new ArrayList<>(Arrays.asList("c1", "c2", "c3")); - List rightFields = new ArrayList<>(Arrays.asList("c4", "c5", "c6")); - Dataset joinResultDf = Adapter.toDf(joinResultPairRDD, JavaConverters.asScalaBuffer(leftFields).toSeq(), JavaConverters.asScalaBuffer(rightFields).toSeq(), sedona); + spatialRDD.spatialParitioningWithoutDuplicates(GridType.KDBTREE) + // Specify the desired number of partitions as 10, though the actual number may vary + // spatialRDD.spatialParitioningWithoutDuplicates(GridType.KDBTREE, 10) + Dataset spatialDf = StructuredAdapter.toSpatialPartitionedDf(spatialRDD, sedona) ``` === "Python" ```python - from sedona.utils.adapter import Adapter + from sedona.utils.structured_adapter import StructuredAdapter - joinResultDf = Adapter.toDf(jvm_sedona_rdd, ["poi_from_id", "poi_from_name"], ["poi_to_id", "poi_to_name"], spark)) + spatialRDD.spatialPartitioningWithoutDuplicates(GridType.KDBTREE) + # Specify the desired number of partitions as 10, though the actual number may vary + # spatialRDD.spatialParitioningWithoutDuplicates(GridType.KDBTREE, 10) + spatialDf = StructuredAdapter.toSpatialPartitionedDf(spatialRDD, sedona) ``` -or you can use the attribute names directly from the input RDD + +### SpatialPairRDD to DataFrame + +PairRDD is the result of a spatial join query or distance join query. SedonaSQL DataFrame-RDD Adapter can convert the result to a DataFrame. But you need to provide the schema of the left and right RDDs. === "Scala" ```scala - import scala.collection.JavaConversions._ - var joinResultDf = Adapter.toDf(joinResultPairRDD, leftRdd.fieldNames, rightRdd.fieldNames, sedona) + var joinResultDf = StructuredAdapter.toDf(joinResultPairRDD, leftDf.schema, rightDf.schema, sedona) ``` === "Java" ```java - import scala.collection.JavaConverters; - Dataset joinResultDf = Adapter.toDf(joinResultPairRDD, JavaConverters.asScalaBuffer(leftRdd.fieldNames).toSeq(), JavaConverters.asScalaBuffer(rightRdd.fieldNames).toSeq(), sedona); + Dataset joinResultDf = StructuredAdapter.toDf(joinResultPairRDD, leftDf.schema, rightDf.schema, sedona); ``` === "Python" ```python - from sedona.utils.adapter import Adapter - - joinResultDf = Adapter.toDf(result_pair_rdd, leftRdd.fieldNames, rightRdd.fieldNames, spark) - ``` + from sedona.utils.adapter import StructuredAdapter -All other attributes such as price and age will be also brought to the DataFrame as long as you specify ==carryOtherAttributes== (see [Read other attributes in an SpatialRDD](rdd.md#read-other-attributes-in-an-spatialrdd)). - -You may also manually specify a schema for the resulting DataFrame in case you require different column names or data -types. Note that string schemas and not all data types are supported—please check the -[Adapter Scaladoc](../api/javadoc/sql/org/apache/sedona/sql/utils/index.html) to confirm what is supported for your use -case. Columns for the left and right user data must be provided. 
- -=== "Scala" - - ```scala - val schema = StructType(Array( - StructField("leftGeometry", GeometryUDT, nullable = true), - StructField("name", StringType, nullable = true), - StructField("price", DoubleType, nullable = true), - StructField("age", IntegerType, nullable = true), - StructField("rightGeometry", GeometryUDT, nullable = true), - StructField("category", StringType, nullable = true) - )) - val joinResultDf = Adapter.toDf(joinResultPairRDD, schema, sedona) + joinResultDf = StructuredAdapter.pairRddToDf(result_pair_rdd, leftDf.schema, rightDf.schema, spark) ``` diff --git a/flink/src/main/java/org/apache/sedona/flink/Catalog.java b/flink/src/main/java/org/apache/sedona/flink/Catalog.java index 396ad16cf0..497aaa5fe6 100644 --- a/flink/src/main/java/org/apache/sedona/flink/Catalog.java +++ b/flink/src/main/java/org/apache/sedona/flink/Catalog.java @@ -90,6 +90,7 @@ public static UserDefinedFunction[] getFuncs() { new Functions.ST_GeometryType(), new Functions.ST_InterpolatePoint(), new Functions.ST_Intersection(), + new Functions.ST_LabelPoint(), new Functions.ST_Length(), new Functions.ST_Length2D(), new Functions.ST_LengthSpheroid(), @@ -101,6 +102,7 @@ public static UserDefinedFunction[] getFuncs() { new Functions.ST_FlipCoordinates(), new Functions.ST_GeoHash(), new Functions.ST_Perimeter(), + new Functions.ST_Perimeter2D(), new Functions.ST_PointOnSurface(), new Functions.ST_Scale(), new Functions.ST_ScaleGeom(), @@ -155,6 +157,7 @@ public static UserDefinedFunction[] getFuncs() { new Functions.ST_SetPoint(), new Functions.ST_LineFromMultiPoint(), new Functions.ST_LineMerge(), + new Functions.ST_LineSegments(), new Functions.ST_LineSubstring(), new Functions.ST_HasZ(), new Functions.ST_HasM(), diff --git a/flink/src/main/java/org/apache/sedona/flink/expressions/Functions.java b/flink/src/main/java/org/apache/sedona/flink/expressions/Functions.java index 15789ad25a..86562860fe 100644 --- a/flink/src/main/java/org/apache/sedona/flink/expressions/Functions.java +++ b/flink/src/main/java/org/apache/sedona/flink/expressions/Functions.java @@ -40,6 +40,33 @@ public String eval( } } + public static class ST_LabelPoint extends ScalarFunction { + @DataTypeHint(value = "RAW", bridgedTo = org.locationtech.jts.geom.Geometry.class) + public Geometry eval( + @DataTypeHint(value = "RAW", bridgedTo = org.locationtech.jts.geom.Geometry.class) + Object o) { + Geometry geom = (Geometry) o; + return org.apache.sedona.common.Functions.labelPoint(geom); + } + + @DataTypeHint(value = "RAW", bridgedTo = org.locationtech.jts.geom.Geometry.class) + public Geometry eval( + @DataTypeHint(value = "RAW", bridgedTo = org.locationtech.jts.geom.Geometry.class) Object o, + @DataTypeHint("Integer") Integer gridResolution) { + Geometry geom = (Geometry) o; + return org.apache.sedona.common.Functions.labelPoint(geom, gridResolution); + } + + @DataTypeHint(value = "RAW", bridgedTo = org.locationtech.jts.geom.Geometry.class) + public Geometry eval( + @DataTypeHint(value = "RAW", bridgedTo = org.locationtech.jts.geom.Geometry.class) Object o, + @DataTypeHint("Integer") Integer gridResolution, + @DataTypeHint("Double") Double goodnessThreshold) { + Geometry geom = (Geometry) o; + return org.apache.sedona.common.Functions.labelPoint(geom, gridResolution, goodnessThreshold); + } + } + public static class ST_Area extends ScalarFunction { @DataTypeHint("Double") public Double eval( @@ -595,6 +622,33 @@ public Double eval( } } + public static class ST_Perimeter2D extends ScalarFunction { + @DataTypeHint(value = "Double") + 
public Double eval( + @DataTypeHint(value = "RAW", bridgedTo = org.locationtech.jts.geom.Geometry.class) + Object o) { + Geometry geom = (Geometry) o; + return org.apache.sedona.common.Functions.perimeter(geom); + } + + @DataTypeHint(value = "Double") + public Double eval( + @DataTypeHint(value = "RAW", bridgedTo = org.locationtech.jts.geom.Geometry.class) Object o, + Boolean use_spheroid) { + Geometry geom = (Geometry) o; + return org.apache.sedona.common.Functions.perimeter(geom, use_spheroid); + } + + @DataTypeHint(value = "Double") + public Double eval( + @DataTypeHint(value = "RAW", bridgedTo = org.locationtech.jts.geom.Geometry.class) Object o, + Boolean use_spheroid, + boolean lenient) { + Geometry geom = (Geometry) o; + return org.apache.sedona.common.Functions.perimeter(geom, use_spheroid, lenient); + } + } + public static class ST_PointOnSurface extends ScalarFunction { @DataTypeHint(value = "RAW", bridgedTo = org.locationtech.jts.geom.Geometry.class) public Geometry eval( @@ -1076,6 +1130,24 @@ public Geometry eval( } } + public static class ST_LineSegments extends ScalarFunction { + @DataTypeHint(value = "RAW", bridgedTo = org.locationtech.jts.geom.Geometry[].class) + public Geometry[] eval( + @DataTypeHint(value = "RAW", bridgedTo = org.locationtech.jts.geom.Geometry.class) + Object o) { + Geometry geometry = (Geometry) o; + return org.apache.sedona.common.Functions.lineSegments(geometry); + } + + @DataTypeHint(value = "RAW", bridgedTo = org.locationtech.jts.geom.Geometry[].class) + public Geometry[] eval( + @DataTypeHint(value = "RAW", bridgedTo = org.locationtech.jts.geom.Geometry.class) Object o, + @DataTypeHint(value = "Boolean") Boolean lenient) { + Geometry geometry = (Geometry) o; + return org.apache.sedona.common.Functions.lineSegments(geometry, lenient); + } + } + public static class ST_LineMerge extends ScalarFunction { @DataTypeHint(value = "RAW", bridgedTo = org.locationtech.jts.geom.Geometry.class) public Geometry eval( diff --git a/flink/src/test/java/org/apache/sedona/flink/FunctionTest.java b/flink/src/test/java/org/apache/sedona/flink/FunctionTest.java index ee1f5a9194..838f39db26 100644 --- a/flink/src/test/java/org/apache/sedona/flink/FunctionTest.java +++ b/flink/src/test/java/org/apache/sedona/flink/FunctionTest.java @@ -48,6 +48,36 @@ public static void onceExecutedBeforeAll() { initialize(); } + @Test + public void testAnchor() { + String actual = + (String) + first( + tableEnv.sqlQuery( + "SELECT ST_AsText(ST_ReducePrecision(ST_LabelPoint(ST_GeomFromWKT('POLYGON ((-112.637484 33.440546, -112.546852 33.477209, -112.489177 33.550488, -112.41777 33.751684, -111.956371 33.719707, -111.766868 33.616843, -111.775107 33.527595, -111.640533 33.504695, -111.440044 33.463462, -111.415326 33.374055, -111.514197 33.309809, -111.643279 33.222542, -111.893203 33.174278, -111.96461 33.250109, -112.123903 33.261593, -112.252985 33.35341, -112.406784 33.346527, -112.667694 33.316695, -112.637484 33.440546))'), 2, 0.2), 4))")) + .getField(0); + String expected = "POINT (-112.0428 33.4642)"; + assertEquals(expected, actual); + + actual = + (String) + first( + tableEnv.sqlQuery( + "SELECT ST_AsText(ST_ReducePrecision(ST_LabelPoint(ST_GeomFromWKT('GEOMETRYCOLLECTION(POLYGON ((-112.840785 33.435962, -112.840785 33.708284, -112.409597 33.708284, -112.409597 33.435962, -112.840785 33.435962)), POLYGON ((-112.309264 33.398167, -112.309264 33.746007, -111.787444 33.746007, -111.787444 33.398167, -112.309264 33.398167)))'), 4), 4))")) + .getField(0); + expected = "POINT 
(-112.0484 33.5721)"; + assertEquals(expected, actual); + + actual = + (String) + first( + tableEnv.sqlQuery( + "SELECT ST_AsText(ST_ReducePrecision(ST_LabelPoint(ST_GeomFromWKT('POLYGON ((-112.654072 33.114485, -112.313516 33.653431, -111.63515 33.314399, -111.497829 33.874913, -111.692825 33.431378, -112.376684 33.788215, -112.654072 33.114485))')), 4))")) + .getField(0); + expected = "POINT (-112.0723 33.5391)"; + assertEquals(expected, actual); + } + @Test public void testArea() { Table polygonTable = createPolygonTable(1); @@ -791,6 +821,32 @@ public void testPerimeter() { assertEquals(443770.91724830196, perimeter, FP_TOLERANCE); } + @Test + public void testPerimeter2D() { + Table polygonTable = createPolygonTable(testDataSize); + Table perimeterTable = + polygonTable.select( + call(Functions.ST_Perimeter2D.class.getSimpleName(), $(polygonColNames[0]))); + Double perimeter = (Double) first(perimeterTable).getField(0); + assertEquals(4.0, perimeter, FP_TOLERANCE); + + polygonTable = + tableEnv.sqlQuery( + "SELECT ST_GeomFromWKT('POLYGON ((0 0, 0 1, 1 1, 1 0, 0 0))', 4326) AS geom"); + perimeterTable = + polygonTable.select( + call(Functions.ST_Perimeter2D.class.getSimpleName(), $("geom"), true, false)); + perimeter = (Double) first(perimeterTable).getField(0); + assertEquals(443770.91724830196, perimeter, FP_TOLERANCE); + + polygonTable = + tableEnv.sqlQuery("SELECT ST_GeomFromWKT('POLYGON ((0 0, 0 1, 1 1, 1 0, 0 0))') AS geom"); + perimeterTable = + polygonTable.select(call(Functions.ST_Perimeter2D.class.getSimpleName(), $("geom"), true)); + perimeter = (Double) first(perimeterTable).getField(0); + assertEquals(443770.91724830196, perimeter, FP_TOLERANCE); + } + @Test public void testPointOnSurface() { Table pointTable = createPointTable_real(testDataSize); @@ -1554,6 +1610,30 @@ public void testLineMerge() { assertEquals("LINESTRING (10 160, 60 120, 120 140, 180 120)", result.toString()); } + @Test + public void testLineSegments() { + Table baseTable = + tableEnv.sqlQuery( + "SELECT ST_GeomFromWKT('LINESTRING(120 140, 60 120, 30 20)') AS line, ST_GeomFromWKT('POLYGON ((0 0, 0 1, 1 0, 0 0))') AS poly"); + Geometry[] result = + (Geometry[]) + first( + baseTable.select( + call(Functions.ST_LineSegments.class.getSimpleName(), $("line")))) + .getField(0); + int actualSize = result.length; + int expectedSize = 2; + assertEquals(expectedSize, actualSize); + + result = + (Geometry[]) + first( + baseTable.select( + call(Functions.ST_LineSegments.class.getSimpleName(), $("poly"), true))) + .getField(0); + assertEquals(0, result.length); + } + @Test public void testLineSubString() { Table table = tableEnv.sqlQuery("SELECT ST_GeomFromWKT('LINESTRING (0 0, 2 0)') AS line"); diff --git a/mkdocs.yml b/mkdocs.yml index f04833ee88..eed5a32ab5 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -16,6 +16,7 @@ # under the License. site_name: Apache Sedona™ +site_url: https://sedona.apache.org site_description: Apache Sedona™ is a cluster computing system for processing large-scale spatial data. Sedona extends existing cluster computing systems, such as Apache Spark, Apache Flink, and Snowflake, with a set of out-of-the-box distributed Spatial Datasets and Spatial SQL that efficiently load, process, and analyze large-scale spatial data across machines. 
nav: - Home: index.md @@ -74,6 +75,8 @@ nav: - Examples: - Scala/Java: tutorial/demo.md - Python: tutorial/jupyter-notebook.md + - Concepts: + - Clustering Algorithms: tutorial/concepts/clustering-algorithms.md - API Docs: - Sedona with Apache Spark: - SQL: @@ -87,6 +90,7 @@ nav: - Query optimization: api/sql/Optimizer.md - Nearest-Neighbour searching: api/sql/NearestNeighbourSearching.md - "Spider:Spatial Data Generator": api/sql/Spider.md + - Reading STAC Data Source: api/sql/Stac.md - Reading Legacy Parquet Files: api/sql/Reading-legacy-parquet.md - Visualization: - SedonaPyDeck: api/sql/Visualization_SedonaPyDeck.md @@ -150,6 +154,7 @@ nav: - Telemetry: asf/telemetry.md repo_url: https://github.com/apache/sedona repo_name: apache/sedona +edit_uri: edit/master/docs/ theme: font: false name: 'material' @@ -164,16 +169,20 @@ theme: repo: fontawesome/brands/github features: - content.code.copy + - content.action.edit - search.highlight - search.share - search.suggest - navigation.footer - navigation.instant - navigation.tabs + - navigation.tabs.sticky - navigation.top extra: version: provider: mike + default: + - latest social: - icon: fontawesome/brands/github-alt link: 'https://github.com/apache/sedona' @@ -225,3 +234,5 @@ plugins: type: datetime - mkdocs-jupyter: include_source: True + - mike: + canonical_version: "latest" diff --git a/python/Pipfile b/python/Pipfile index 8c899b263e..389c56c0ac 100644 --- a/python/Pipfile +++ b/python/Pipfile @@ -29,6 +29,7 @@ attrs="*" pyarrow="*" keplergl = "==0.3.2" pydeck = "===0.8.0" +pystac = "===1.5.0" rasterio = ">=1.2.10" [requires] diff --git a/python/sedona/core/SpatialRDD/spatial_rdd.py b/python/sedona/core/SpatialRDD/spatial_rdd.py index a373309d8d..266676c0d5 100644 --- a/python/sedona/core/SpatialRDD/spatial_rdd.py +++ b/python/sedona/core/SpatialRDD/spatial_rdd.py @@ -19,7 +19,7 @@ from typing import List, Optional, Union import attr -from py4j.java_gateway import get_field +from py4j.java_gateway import get_field, get_method from pyspark import RDD, SparkContext, StorageLevel from pyspark.sql import SparkSession @@ -51,6 +51,15 @@ def from_java_class_name(cls, jvm_partitioner) -> "SpatialPartitioner": return cls(partitioner, jvm_partitioner) + def getGrids(self) -> List[Envelope]: + jvm_grids = get_method(self.jvm_partitioner, "getGrids")() + number_of_grids = jvm_grids.size() + envelopes = [ + Envelope.from_jvm_instance(jvm_grids[index]) + for index in range(number_of_grids) + ] + return envelopes + @attr.s class JvmSpatialRDD: @@ -422,11 +431,49 @@ def spatialPartitioning( num_partitions: Optional[int] = None, ) -> bool: """ + Calculate partitions and assign items in this RDD to a partition. - :param partitioning: partitioning type - :param num_partitions: number of partitions - :return: + :param partitioning: Partitioning type or existing SpatialPartitioner + (e.g., one obtained from another SpatialRDD to align partitions among + input data) + :param num_partitions: If partitioning is a GridType, the target + number of partitions into which the RDD should be split. + :return: True on success + """ + return self._spatial_partitioning_impl( + partitioning, num_partitions, self._srdd.spatialPartitioning + ) + + def spatialPartitioningWithoutDuplicates( + self, + partitioning: Union[str, GridType, SpatialPartitioner, List[Envelope]], + num_partitions: Optional[int] = None, + ) -> bool: """ + Calculate partitions and assign items in this RDD to a partition without + introducing duplicates. 
This is not the desired behaviour for + executing joins but is the correct option when partitioning in + preparation for a distributed write. + + :param partitioning: Partitioning type or existing SpatialPartitioner + (e.g., one obtained from another SpatialRDD to align partitions among + input data) + :param num_partitions: If partitioning is a GridType, the target + number of partitions into which the RDD should be split. + :return: True on success + """ + return self._spatial_partitioning_impl( + partitioning, + num_partitions, + self._srdd.spatialPartitioningWithoutDuplicates, + ) + + def _spatial_partitioning_impl( + self, + partitioning: Union[str, GridType, SpatialPartitioner, List[Envelope]], + num_partitions: Optional[int], + java_method, + ) -> bool: if type(partitioning) == str: grid = GridTypeJvm(self._jvm, GridType.from_str(partitioning)).jvm_instance elif type(partitioning) == GridType: @@ -446,9 +493,9 @@ def spatialPartitioning( self._spatial_partitioned = True if num_partitions: - return self._srdd.spatialPartitioning(grid, num_partitions) + return java_method(grid, num_partitions) else: - return self._srdd.spatialPartitioning(grid) + return java_method(grid) def set_srdd(self, srdd): self._srdd = srdd diff --git a/python/sedona/maps/SedonaKepler.py b/python/sedona/maps/SedonaKepler.py index 0b3530826e..6822cb28ad 100644 --- a/python/sedona/maps/SedonaKepler.py +++ b/python/sedona/maps/SedonaKepler.py @@ -34,7 +34,7 @@ def create_map(cls, df=None, name="unnamed", config=None): try: from keplergl import KeplerGl except ImportError: - msg = "Install sedona[kepler-map] to convert sedona dataframes to kepler maps." + msg = "Install apache-sedona[kepler-map] to convert sedona dataframes to kepler maps." raise ImportError(msg) from None kepler_map = KeplerGl() diff --git a/python/sedona/maps/SedonaMapUtils.py b/python/sedona/maps/SedonaMapUtils.py index 02ef909212..ddca721c45 100644 --- a/python/sedona/maps/SedonaMapUtils.py +++ b/python/sedona/maps/SedonaMapUtils.py @@ -18,6 +18,8 @@ import json from sedona.sql.types import GeometryType +from sedona.utils.geoarrow import dataframe_to_arrow +from packaging.version import parse class SedonaMapUtils: @@ -34,17 +36,25 @@ def __convert_to_gdf_or_pdf__(cls, df, rename=True, geometry_col=None): """ if geometry_col is None: geometry_col = SedonaMapUtils.__get_geometry_col__(df) - pandas_df = df.toPandas() + + # Convert the dataframe to arrow format, then to geopandas dataframe + # This is faster than converting directly to geopandas dataframe via toPandas if ( geometry_col is None ): # No geometry column found even after searching schema, return Pandas Dataframe - return pandas_df + data_pyarrow = dataframe_to_arrow(df) + return data_pyarrow.to_pandas() try: import geopandas as gpd except ImportError: - msg = "GeoPandas is missing. You can install it manually or via sedona[kepler-map] or sedona[pydeck-map]." + msg = "GeoPandas is missing. You can install it manually or via apache-sedona[kepler-map] or apache-sedona[pydeck-map]." 
raise ImportError(msg) from None - geo_df = gpd.GeoDataFrame(pandas_df, geometry=geometry_col) + # From GeoPandas 1.0.0 onwards, the from_arrow method is available + if parse(gpd.__version__) >= parse("1.0.0"): + data_pyarrow = dataframe_to_arrow(df) + geo_df = gpd.GeoDataFrame.from_arrow(data_pyarrow) + else: + geo_df = gpd.GeoDataFrame(df.toPandas(), geometry=geometry_col) if geometry_col != "geometry" and rename is True: geo_df.rename_geometry("geometry", inplace=True) return geo_df diff --git a/python/sedona/maps/SedonaPyDeck.py b/python/sedona/maps/SedonaPyDeck.py index a15cafca59..343389841b 100644 --- a/python/sedona/maps/SedonaPyDeck.py +++ b/python/sedona/maps/SedonaPyDeck.py @@ -385,7 +385,7 @@ def _try_import_pydeck() -> ModuleType: import pydeck as pdk except ImportError: - msg = "Install sedona[pydeck-map] to convert sedona dataframes to pydeck maps." + msg = "Install apache-sedona[pydeck-map] to convert sedona dataframes to pydeck maps." raise ImportError(msg) from None return pdk diff --git a/python/sedona/raster_utils/SedonaUtils.py b/python/sedona/raster_utils/SedonaUtils.py index 5f7304f3ff..d35fcd6210 100644 --- a/python/sedona/raster_utils/SedonaUtils.py +++ b/python/sedona/raster_utils/SedonaUtils.py @@ -15,10 +15,13 @@ # specific language governing permissions and limitations # under the License. +from sedona.maps.SedonaMapUtils import SedonaMapUtils + class SedonaUtils: @classmethod def display_image(cls, df): from IPython.display import HTML, display - display(HTML(df.toPandas().to_html(escape=False))) + pdf = SedonaMapUtils.__convert_to_gdf_or_pdf__(df, rename=False) + display(HTML(pdf.to_html(escape=False))) diff --git a/python/sedona/register/java_libs.py b/python/sedona/register/java_libs.py index 931488d917..8d1681d15e 100644 --- a/python/sedona/register/java_libs.py +++ b/python/sedona/register/java_libs.py @@ -21,6 +21,7 @@ class SedonaJvmLib(Enum): JoinParams = "org.apache.sedona.python.wrapper.adapters.JoinParamsAdapter" Adapter = "org.apache.sedona.sql.utils.Adapter" + StructuredAdapter = "org.apache.spark.sql.sedona_sql.adapters.StructuredAdapter" JoinQuery = "org.apache.sedona.core.spatialOperator.JoinQuery" KNNQuery = "org.apache.sedona.core.spatialOperator.KNNQuery" RangeQuery = "org.apache.sedona.core.spatialOperator.RangeQuery" diff --git a/python/sedona/spark/__init__.py b/python/sedona/spark/__init__.py index 6657579538..50d1d1131e 100644 --- a/python/sedona/spark/__init__.py +++ b/python/sedona/spark/__init__.py @@ -45,3 +45,4 @@ from sedona.sql.types import GeometryType, RasterType from sedona.utils import KryoSerializer, SedonaKryoRegistrator from sedona.utils.adapter import Adapter +from sedona.utils.geoarrow import dataframe_to_arrow diff --git a/python/sedona/sql/dataframe_api.py b/python/sedona/sql/dataframe_api.py index 2f56dfffa5..b1639a97bf 100644 --- a/python/sedona/sql/dataframe_api.py +++ b/python/sedona/sql/dataframe_api.py @@ -21,6 +21,7 @@ import typing from typing import Any, Callable, Iterable, List, Mapping, Tuple, Type, Union +from pyspark import SparkContext from pyspark.sql import Column, SparkSession from pyspark.sql import functions as f @@ -57,12 +58,6 @@ def _convert_argument_to_java_column(arg: Any) -> Column: def call_sedona_function( object_name: str, function_name: str, args: Union[Any, Tuple[Any]] ) -> Column: - spark = SparkSession.getActiveSession() - if spark is None: - raise ValueError( - "No active spark session was detected. Unable to call sedona function." 
-        )
-
     # apparently a Column is an Iterable so we need to check for it explicitly
     if (not isinstance(args, Iterable)) or isinstance(
        args, (str, Column, ConnectColumn)
@@ -75,7 +70,12 @@ def call_sedona_function(

     args = map(_convert_argument_to_java_column, args)

-    jobject = getattr(spark._jvm, object_name)
+    jvm = SparkContext._jvm
+    if jvm is None:
+        raise ValueError(
+            "No active spark context was detected. Unable to call sedona function."
+        )
+    jobject = getattr(jvm, object_name)
     jfunc = getattr(jobject, function_name)
     jc = jfunc(*args)
diff --git a/python/sedona/sql/st_functions.py b/python/sedona/sql/st_functions.py
index 9e77909c15..684ffa8e93 100644
--- a/python/sedona/sql/st_functions.py
+++ b/python/sedona/sql/st_functions.py
@@ -20,6 +20,7 @@ from typing import Optional, Union

 from pyspark.sql import Column
+from pyspark.sql.functions import lit

 from sedona.sql.dataframe_api import (
     ColumnOrName,
@@ -100,6 +101,36 @@ def ST_AddPoint(
     return _call_st_function("ST_AddPoint", args)


+@validate_argument_types
+def ST_LabelPoint(
+    geometry: ColumnOrName,
+    gridResolution: Optional[Union[ColumnOrNameOrNumber, int]] = None,
+    goodnessThreshold: Optional[Union[ColumnOrNameOrNumber, float]] = None,
+) -> Column:
+    """Calculate an anchor point for a given geometry column.
+
+    :param geometry: Input geometry column to calculate the anchor for.
+    :type geometry: ColumnOrName
+    :param gridResolution: Optional step size for the grid search when determining the best anchor point.
+        Defaults to 2 if not provided.
+    :type gridResolution: Optional[Union[ColumnOrNameOrNumber, int]], optional
+    :param goodnessThreshold: Optional threshold for the minimum "goodness" value.
+        Determines when to stop refining the anchor search.
+        Defaults to 0.2 if not provided.
+    :type goodnessThreshold: Optional[Union[ColumnOrNameOrNumber, float]], optional
+    :return: Anchor point as a geometry column.
+    :rtype: Column
+    """
+    if gridResolution is None and goodnessThreshold is None:
+        args = (geometry,)
+    elif goodnessThreshold is None:
+        args = (geometry, gridResolution)
+    else:
+        args = (geometry, gridResolution, goodnessThreshold)
+
+    return _call_st_function("ST_LabelPoint", args)
+
+
 @validate_argument_types
 def ST_Area(geometry: ColumnOrName) -> Column:
     """Calculate the area of a geometry.
@@ -1038,6 +1069,24 @@ def ST_LineMerge(multi_line_string: ColumnOrName) -> Column:
     return _call_st_function("ST_LineMerge", multi_line_string)


+@validate_argument_types
+def ST_LineSegments(
+    geom: ColumnOrName, lenient: Optional[Union[ColumnOrName, bool]] = None
+) -> Column:
+    """
+    Convert a multi-coordinate LineString into an array of LineStrings that each contain exactly 2 points.
+
+    @param geom: input LineString geometry column.
+    @param lenient: suppresses the exception
+    @return: array of LineStrings
+    """
+    args = (geom, lenient)
+    if lenient is None:
+        args = (geom,)
+
+    return _call_st_function("ST_LineSegments", args)
+
+
 @validate_argument_types
 def ST_LineSubstring(
     line_string: ColumnOrName,
@@ -1194,6 +1243,30 @@ def ST_Perimeter(
     return _call_st_function("ST_Perimeter", args)


+@validate_argument_types
+def ST_Perimeter2D(
+    geom: ColumnOrName,
+    use_spheroid: Optional[Union[ColumnOrName, bool]] = None,
+    lenient: Optional[Union[ColumnOrName, bool]] = None,
+) -> Column:
+    """Returns the perimeter of Polygon/MultiPolygon geometries; returns 0 for other geometry types.
+
+    @param geom: Polygonal geometry column
+    @param use_spheroid: whether to use the spheroid-based perimeter calculation
+    @param lenient: suppresses the exception
+    @return: Perimeter of the Polygon/MultiPolygon geometry
+    """
+
+    args = (geom, use_spheroid, lenient)
+
+    if lenient is None:
+        if use_spheroid is None:
+            args = (geom,)
+        else:
+            args = (geom, use_spheroid)
+    return _call_st_function("ST_Perimeter2D", args)
+
+
 @validate_argument_types
 def ST_Points(geometry: ColumnOrName) -> Column:
     """Creates a MultiPoint geometry consisting of all the coordinates of the input geometry
@@ -2432,6 +2505,188 @@ def ST_InterpolatePoint(geom1: ColumnOrName, geom2: ColumnOrName) -> Column:
     return _call_st_function("ST_InterpolatePoint", args)


+@validate_argument_types
+def ST_DBSCAN(
+    geometry: ColumnOrName,
+    epsilon: Union[ColumnOrName, float],
+    min_pts: Union[ColumnOrName, int],
+    use_spheroid: Optional[Union[ColumnOrName, bool]] = False,
+) -> Column:
+    """Perform DBSCAN clustering on the given geometry column.
+
+    @param geometry: Geometry column or name
+    :type geometry: ColumnOrName
+    @param epsilon: the maximum distance between two points for them to be considered neighbors
+    :type epsilon: ColumnOrName
+    @param min_pts: the number of neighbors a point should have to form a cluster
+    :type min_pts: ColumnOrName
+    @param use_spheroid: whether to use spheroid for distance calculation
+    :type use_spheroid: ColumnOrName
+    @return: A struct indicating the cluster to which the point belongs and whether it is a core point
+    """
+
+    if isinstance(epsilon, float):
+        epsilon = lit(epsilon)
+
+    if isinstance(min_pts, int):
+        min_pts = lit(min_pts)
+
+    if isinstance(use_spheroid, bool):
+        use_spheroid = lit(use_spheroid)
+
+    return _call_st_function("ST_DBSCAN", (geometry, epsilon, min_pts, use_spheroid))
+
+
+@validate_argument_types
+def ST_LocalOutlierFactor(
+    geometry: ColumnOrName,
+    k: Union[ColumnOrName, int],
+    use_spheroid: Optional[Union[ColumnOrName, bool]] = False,
+) -> Column:
+    """Calculate the local outlier factor on the given geometry column.
+
+    @param geometry: Geometry column or name
+    :type geometry: ColumnOrName
+    @param k: the number of neighbors to use for the LOF calculation
+    :type k: ColumnOrName
+    @param use_spheroid: whether to use spheroid for distance calculation
+    :type use_spheroid: ColumnOrName
+    @return: A Double indicating the local outlier factor of the point
+    """
+
+    if isinstance(k, int):
+        k = lit(k)
+
+    if isinstance(use_spheroid, bool):
+        use_spheroid = lit(use_spheroid)
+
+    return _call_st_function("ST_LocalOutlierFactor", (geometry, k, use_spheroid))
+
+
+@validate_argument_types
+def ST_GLocal(
+    x: ColumnOrName,
+    weights: ColumnOrName,
+    star: Optional[Union[ColumnOrName, bool]] = False,
+) -> Column:
+    """Calculate Getis-Ord Gi(*) statistics on the given column.
+ + @param x: The variable we want to compute Gi statistics for + :type x: ColumnOrName + @param weights: the weights array containing the neighbors, their weights, and their values of x + :type weights: ColumnOrName + @param star: whether to use the focal observation in the calculations + :type star: ColumnOrName + @return: A struct containing the Gi statistics including a p value + """ + + if isinstance(star, bool): + star = lit(star) + + return _call_st_function("ST_GLocal", (x, weights, star)) + + +@validate_argument_types +def ST_BinaryDistanceBandColumn( + geometry: ColumnOrName, + threshold: ColumnOrName, + include_zero_distance_neighbors: Union[ColumnOrName, bool] = True, + include_self: Union[ColumnOrName, bool] = False, + use_spheroid: Union[ColumnOrName, bool] = False, + attributes: ColumnOrName = None, +) -> Column: + """Creates a weights column containing the other records within the threshold and their weight. + + Weights will always be 1.0. + + + @param geometry: name of the geometry column + @param threshold: Distance threshold for considering neighbors + @param include_zero_distance_neighbors: whether to include neighbors that are 0 distance. + @param include_self: whether to include self in the list of neighbors + @param use_spheroid: whether to use a cartesian or spheroidal distance calculation. Default is false + @param attributes: the attributes to save in the neighbor column. + + """ + if isinstance(include_zero_distance_neighbors, bool): + include_zero_distance_neighbors = lit(include_zero_distance_neighbors) + + if isinstance(include_self, bool): + include_self = lit(include_self) + + if isinstance(use_spheroid, bool): + use_spheroid = lit(use_spheroid) + + return _call_st_function( + "ST_BinaryDistanceBandColumn", + ( + geometry, + threshold, + include_zero_distance_neighbors, + include_self, + use_spheroid, + attributes, + ), + ) + + +@validate_argument_types +def ST_WeightedDistanceBandColumn( + geometry: ColumnOrName, + threshold: ColumnOrName, + alpha: Union[ColumnOrName, float], + include_zero_distance_neighbors: Union[ColumnOrName, bool] = True, + include_self: Union[ColumnOrName, bool] = False, + self_weight: Union[ColumnOrName, float] = 1.0, + use_spheroid: Union[ColumnOrName, bool] = False, + attributes: ColumnOrName = None, +) -> Column: + """Creates a weights column containing the other records within the threshold and their weight. + + Weights will be distance^alpha. + + + @param geometry: name of the geometry column + @param threshold: Distance threshold for considering neighbors + @param alpha: alpha to use for inverse distance weights. Computation is dist^alpha. Default is -1.0 + @param include_zero_distance_neighbors: whether to include neighbors that are 0 distance. If 0 distance neighbors are + included, values are infinity as per the floating point spec (divide by 0) + @param include_self: whether to include self in the list of neighbors + @param self_weight: the value to use for the self weight. Default is 1.0 + @param use_spheroid: whether to use a cartesian or spheroidal distance calculation. Default is false + @param attributes: the attributes to save in the neighbor column. 
+
+    """
+    if isinstance(alpha, float):
+        alpha = lit(alpha)
+
+    if isinstance(include_zero_distance_neighbors, bool):
+        include_zero_distance_neighbors = lit(include_zero_distance_neighbors)
+
+    if isinstance(include_self, bool):
+        include_self = lit(include_self)
+
+    if isinstance(self_weight, float):
+        self_weight = lit(self_weight)
+
+    if isinstance(use_spheroid, bool):
+        use_spheroid = lit(use_spheroid)
+
+    return _call_st_function(
+        "ST_WeightedDistanceBandColumn",
+        (
+            geometry,
+            threshold,
+            alpha,
+            include_zero_distance_neighbors,
+            include_self,
+            self_weight,
+            use_spheroid,
+            attributes,
+        ),
+    )
+
+
 # Automatically populate __all__
 __all__ = [
     name
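# Example (editor's sketch, not from the patch): the new clustering and outlier
# functions, mirroring the tests added later in this patch. Assumes an active
# Sedona SparkSession named `spark` outside Spark Connect (these functions need
# a checkpoint directory, per the tests' skip conditions).
from pyspark.sql import functions as f
from sedona.sql.st_functions import ST_DBSCAN, ST_LocalOutlierFactor

df = spark.createDataFrame([{"id": 1, "x": 2, "y": 3}]).withColumn(
    "geometry", f.expr("ST_Point(x, y)")
)

# eps=1.0, minPts=2; yields a struct with the cluster id and an isCore flag.
df.withColumn("dbscan", ST_DBSCAN("geometry", 1.0, 2, False)).show()

# Local outlier factor using k=2 neighbors.
df.withColumn("lof", ST_LocalOutlierFactor("geometry", 2, False)).show()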
diff --git a/python/sedona/stac/__init__.py b/python/sedona/stac/__init__.py
new file mode 100644
index 0000000000..a67d5ea255
--- /dev/null
+++ b/python/sedona/stac/__init__.py
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
diff --git a/python/sedona/stac/client.py b/python/sedona/stac/client.py
new file mode 100644
index 0000000000..3e8eeacefa
--- /dev/null
+++ b/python/sedona/stac/client.py
@@ -0,0 +1,112 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+from typing import Union, Optional, Iterator
+
+from sedona.stac.collection_client import CollectionClient
+
+import datetime as python_datetime
+from pystac import Item as PyStacItem
+
+from pyspark.sql import DataFrame
+
+
+class Client:
+    def __init__(self, url: str):
+        self.url = url
+
+    @classmethod
+    def open(cls, url: str):
+        """
+        Opens a connection to the specified STAC API URL.
+
+        This class method creates an instance of the Client class with the given URL.
+
+        Parameters:
+        - url (str): The URL of the STAC API to connect to.
+          Example: "https://planetarycomputer.microsoft.com/api/stac/v1"
+
+        Returns:
+        - Client: An instance of the Client class connected to the specified URL.
+        """
+        return cls(url)
+
+    def get_collection(self, collection_id: str):
+        """
+        Retrieves a collection client for the specified collection ID.
+
+        This method creates an instance of the CollectionClient class for the given collection ID,
+        allowing interaction with the specified collection in the STAC API.
+
+        Parameters:
+        - collection_id (str): The ID of the collection to retrieve.
+          Example: "aster-l1t"
+
+        Returns:
+        - CollectionClient: An instance of the CollectionClient class for the specified collection.
+        """
+        return CollectionClient(self.url, collection_id)
+
+    def search(
+        self,
+        *ids: Union[str, list],
+        collection_id: str,
+        bbox: Optional[list] = None,
+        datetime: Optional[Union[str, python_datetime.datetime, list]] = None,
+        max_items: Optional[int] = None,
+        return_dataframe: bool = True,
+    ) -> Union[Iterator[PyStacItem], DataFrame]:
+        """
+        Searches for items in the specified collection with optional filters.
+
+        Parameters:
+        - ids (Union[str, list]): A variable number of item IDs to filter the items.
+          Example: "item_id1" or ["item_id1", "item_id2"]
+
+        - collection_id (str): The ID of the collection to search in.
+          Example: "aster-l1t"
+
+        - bbox (Optional[list]): A list of bounding boxes for filtering the items.
+          Each bounding box is represented as a list of four float values: [min_lon, min_lat, max_lon, max_lat].
+          Example: [[-180.0, -90.0, 180.0, 90.0]]  # This bounding box covers the entire world.
+
+        - datetime (Optional[Union[str, python_datetime.datetime, list]]): A single datetime, RFC 3339-compliant timestamp,
+          or a list of date-time ranges for filtering the items. The datetime can be specified in various formats:
+          - "YYYY" expands to ["YYYY-01-01T00:00:00Z", "YYYY-12-31T23:59:59Z"]
+          - "YYYY-mm" expands to ["YYYY-mm-01T00:00:00Z", "YYYY-mm-<last day of month>T23:59:59Z"]
+          - "YYYY-mm-dd" expands to ["YYYY-mm-ddT00:00:00Z", "YYYY-mm-ddT23:59:59Z"]
+          - "YYYY-mm-ddTHH:MM:SSZ" remains as ["YYYY-mm-ddTHH:MM:SSZ", "YYYY-mm-ddTHH:MM:SSZ"]
+          - A list of date-time ranges can be provided for multiple intervals.
+          Example: "2020-01-01T00:00:00Z" or python_datetime.datetime(2020, 1, 1) or [["2020-01-01T00:00:00Z", "2021-01-01T00:00:00Z"]]
+
+        - max_items (Optional[int]): The maximum number of items to return from the search, even if there are more matching results.
+          Example: 100
+
+        - return_dataframe (bool): If True, return the result as a Spark DataFrame instead of an iterator of PyStacItem objects.
+          Example: True
+
+        Returns:
+        - Union[Iterator[PyStacItem], DataFrame]: An iterator of PyStacItem objects or a Spark DataFrame that matches the specified filters.
+        """
+        client = self.get_collection(collection_id)
+        if return_dataframe:
+            return client.get_dataframe(
+                *ids, bbox=bbox, datetime=datetime, max_items=max_items
+            )
+        else:
+            return client.get_items(
+                *ids, bbox=bbox, datetime=datetime, max_items=max_items
+            )
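# Example (editor's sketch, not from the patch): a minimal search against the new
# STAC client defined above. Assumes an active Sedona SparkSession (the "stac"
# data source must be on the classpath); the URL and collection id mirror the
# docstring examples.
from sedona.stac.client import Client

client = Client.open("https://planetarycomputer.microsoft.com/api/stac/v1")

# Returns a Spark DataFrame by default (return_dataframe=True)
df = client.search(
    collection_id="aster-l1t",
    bbox=[-180.0, -90.0, 180.0, 90.0],
    datetime="2006",  # expands to the whole year, per the docstring above
    max_items=5,
)
df.show()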
diff --git a/python/sedona/stac/collection_client.py b/python/sedona/stac/collection_client.py
new file mode 100644
index 0000000000..b1cae6df39
--- /dev/null
+++ b/python/sedona/stac/collection_client.py
@@ -0,0 +1,398 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import logging
+from typing import Iterator, Union
+from typing import Optional
+
+import datetime as python_datetime
+from pyspark.sql import DataFrame, SparkSession
+from pystac import Item as PyStacItem
+
+
+def get_collection_url(url: str, collection_id: Optional[str] = None) -> str:
+    """
+    Constructs the collection URL based on the provided base URL and optional collection ID.
+
+    If no collection ID is provided, the base URL is returned unchanged. If a
+    collection ID is provided and the URL starts with 'http' or 'https', the
+    collection ID is appended to the URL; otherwise an exception is raised.
+
+    Parameters:
+    - url (str): The base URL of the STAC collection.
+    - collection_id (Optional[str]): The optional collection ID to append to the URL.
+
+    Returns:
+    - str: The constructed collection URL.
+
+    Raises:
+    - ValueError: If the URL does not start with 'http' or 'https' and a collection ID is provided.
+    """
+    if not collection_id:
+        return url
+    elif url.startswith("http") or url.startswith("https"):
+        return f"{url}/collections/{collection_id}"
+    else:
+        raise ValueError(
+            "Collection ID is not used because the URL does not start with http or https"
+        )
+
+
+class CollectionClient:
+    def __init__(self, url: str, collection_id: Optional[str] = None):
+        self.url = url
+        self.collection_id = collection_id
+        self.collection_url = get_collection_url(url, collection_id)
+        self.spark = SparkSession.getActiveSession()
+
+    @staticmethod
+    def _move_attributes_to_properties(item_dict: dict) -> dict:
+        """
+        Moves specified attributes from the item dictionary to the 'properties' field.
+
+        This method ensures that certain attributes are nested under the 'properties' key
+        in the item dictionary. If the 'properties' key does not exist, it is initialized.
+
+        Parameters:
+        - item_dict (dict): The dictionary representation of a STAC item.
+
+        Returns:
+        - dict: The updated item dictionary with specified attributes moved to 'properties'.
+        """
+        # List of attributes to move to 'properties'
+        attributes_to_move = [
+            "title",
+            "description",
+            "keywords",
+            "datetime",
+            "start_datetime",
+            "end_datetime",
+            "created",
+            "instruments",
+            "statistics",
+            "platform",
+            "gsd",
+        ]
+
+        # Initialize 'properties' if it doesn't exist
+        if "properties" not in item_dict:
+            item_dict["properties"] = {}
+
+        # Move the specified attributes to 'properties'
+        for attr in attributes_to_move:
+            if attr in item_dict:
+                item_dict["properties"][attr] = str(item_dict.pop(attr))
+
+        return item_dict
+
+    @staticmethod
+    def _apply_spatial_temporal_filters(
+        df: DataFrame, bbox=None, datetime=None
+    ) -> DataFrame:
+        """
+        This function applies spatial and temporal filters to a Spark DataFrame.
+
+        Parameters:
+        - df (DataFrame): The input Spark DataFrame to be filtered.
+        - bbox (Optional[list]): A list of bounding boxes for filtering the items.
+          Each bounding box is represented as a list of four float values: [min_lon, min_lat, max_lon, max_lat].
+          Example: [[-180.0, -90.0, 180.0, 90.0]]  # This bounding box covers the entire world.
+        - datetime (Optional[list]): A list of date-time ranges for filtering the items.
+          Each date-time range is represented as a list of two strings in ISO 8601 format: [start_datetime, end_datetime].
+          Example: [["2020-01-01T00:00:00Z", "2021-01-01T00:00:00Z"]]  # This interval covers the entire year of 2020.
+
+        Returns:
+        - DataFrame: The filtered Spark DataFrame.
+
+        The function constructs SQL conditions for spatial and temporal filters and applies them to the DataFrame.
+        If bbox is provided, it constructs spatial conditions using st_contains and ST_GeomFromText.
+        If datetime is provided, it constructs temporal conditions using the datetime column.
+        The conditions are combined using OR logic.
+        """
+        if bbox:
+            bbox_conditions = []
+            # Use a distinct loop variable so the bbox parameter is not shadowed
+            for box in bbox:
+                polygon_wkt = (
+                    f"POLYGON(({box[0]} {box[1]}, {box[2]} {box[1]}, "
+                    f"{box[2]} {box[3]}, {box[0]} {box[3]}, {box[0]} {box[1]}))"
+                )
+                bbox_conditions.append(
+                    f"st_contains(ST_GeomFromText('{polygon_wkt}'), geometry)"
+                )
+            bbox_sql_condition = " OR ".join(bbox_conditions)
+            df = df.filter(bbox_sql_condition)
+
+        if datetime:
+            interval_conditions = []
+            for interval in datetime:
+                interval_conditions.append(
+                    f"datetime BETWEEN '{interval[0]}' AND '{interval[1]}'"
+                )
+            interval_sql_condition = " OR ".join(interval_conditions)
+            df = df.filter(interval_sql_condition)
+
+        return df
+
+    @staticmethod
+    def _expand_date(date_str):
+        """
+        Expands a simple date string to include the entire time period.
+
+        This function takes a date string in one of the following formats:
+        - YYYY
+        - YYYY-mm
+        - YYYY-mm-dd
+        - YYYY-mm-ddTHH:MM:SSZ
+
+        It then expands the date string to cover the entire time period for that date.
+
+        Parameters:
+        - date_str (str): The date string to expand.
+
+        Returns:
+        - list: A list containing the start and end datetime strings in ISO 8601 format.
+
+        Raises:
+        - ValueError: If the date string format is invalid.
+
+        Examples:
+        - "2017" expands to ["2017-01-01T00:00:00Z", "2017-12-31T23:59:59Z"]
+        - "2017-06" expands to ["2017-06-01T00:00:00Z", "2017-06-30T23:59:59Z"]
+        - "2017-06-10" expands to ["2017-06-10T00:00:00Z", "2017-06-10T23:59:59Z"]
+        - "2017-06-01T00:00:00Z" remains as ["2017-06-01T00:00:00Z", "2017-06-01T00:00:00Z"]
+        """
+        if len(date_str) == 4:  # YYYY
+            return [f"{date_str}-01-01T00:00:00Z", f"{date_str}-12-31T23:59:59Z"]
+        elif len(date_str) == 7:  # YYYY-mm
+            year, month = map(int, date_str.split("-"))
+            # Find the last day of the month by rolling over to the first day of
+            # the next month (bumping the year for December) and stepping back one day.
+            first_of_next_month = python_datetime.date(
+                year + (month == 12), month % 12 + 1, 1
+            )
+            last_day = (first_of_next_month - python_datetime.timedelta(days=1)).day
+            return [f"{date_str}-01T00:00:00Z", f"{date_str}-{last_day}T23:59:59Z"]
+        elif len(date_str) == 10:  # YYYY-mm-dd
+            return [f"{date_str}T00:00:00Z", f"{date_str}T23:59:59Z"]
+        elif len(date_str) == 19:  # YYYY-mm-ddTHH:MM:SS
+            return [date_str, date_str]
+        elif len(date_str) == 20:  # YYYY-mm-ddTHH:MM:SSZ
+            return [date_str, date_str]
+        else:
+            raise ValueError("Invalid date format")
+
+    def get_items(
+        self,
+        *ids: Union[str, list],
+        bbox: Optional[list] = None,
+        datetime: Optional[Union[str, python_datetime.datetime, list]] = None,
+        max_items: Optional[int] = None,
+    ) -> Iterator[PyStacItem]:
+        """
+        Returns an iterator of items that match the supplied item IDs and/or the optional spatial and temporal extents.
+
+        This method loads the collection data from the specified collection URL and applies
+        optional filters to the data. The filters include:
+        - IDs: A list of item IDs to filter the items. If not provided, no ID filtering is applied.
+        - bbox (Optional[list]): A list of bounding boxes for filtering the items.
+ - datetime (Optional[Union[str, python_datetime.datetime, list]]): A single datetime, RFC 3339-compliant timestamp, + or a list of date-time ranges for filtering the items. + - max_items (Optional[int]): The maximum number of items to return from the search, even if there are more matching results. + + Returns: + - Iterator[PyStacItem]: An iterator of PyStacItem objects that match the specified filters. + If no filters are provided, the iterator contains all items in the collection. + + Raises: + - RuntimeError: If there is an error loading the data or applying the filters, a RuntimeError + is raised with a message indicating the failure. + """ + try: + # Load the collection data from the specified collection URL + df = self.spark.read.format("stac").load(self.collection_url) + + # Apply ID filters if provided + if ids: + if isinstance(ids, tuple): + ids = list(ids) + if isinstance(ids, str): + ids = [ids] + df = df.filter(df.id.isin(ids)) + + # Ensure bbox is a list of lists + if bbox and isinstance(bbox[0], float): + bbox = [bbox] + + # Handle datetime parameter + if datetime: + if isinstance(datetime, (str, python_datetime.datetime)): + datetime = [self._expand_date(str(datetime))] + elif isinstance(datetime, list) and isinstance(datetime[0], str): + datetime = [datetime] + + # Apply spatial and temporal filters + df = self._apply_spatial_temporal_filters(df, bbox, datetime) + + # Limit the number of items if max_items is specified + if max_items is not None: + df = df.limit(max_items) + + # Collect the filtered rows and convert them to PyStacItem objects + items = [] + for row in df.collect(): + row_dict = row.asDict(True) + row_dict = self._move_attributes_to_properties(row_dict) + items.append(PyStacItem.from_dict(row_dict)) + + # Return an iterator of the items + return iter(items) + except Exception as e: + # Log the error and raise a RuntimeError + logging.error(f"Error getting items: {e}") + raise RuntimeError("Failed to get items") from e + + def get_dataframe( + self, + *ids: Union[str, list], + bbox: Optional[list] = None, + datetime: Optional[Union[str, python_datetime.datetime, list]] = None, + max_items: Optional[int] = None, + ) -> DataFrame: + """ + Returns a Spark DataFrame of items with optional spatial and temporal extents. + + This method loads the collection data from the specified collection URL and applies + optional spatial and temporal filters to the data. The spatial filter is applied using + a bounding box, and the temporal filter is applied using a date-time range. + + Parameters: + - bbox (Optional[list]): A list of bounding boxes for filtering the items. + Each bounding box is represented as a list of four float values: [min_lon, min_lat, max_lon, max_lat]. + Example: [[-180.0, -90.0, 180.0, 90.0]] # This bounding box covers the entire world. + - datetime (Optional[Union[str, python_datetime.datetime, list]]): A single datetime, RFC 3339-compliant timestamp, + or a list of date-time ranges for filtering the items. + Example: "2020-01-01T00:00:00Z" or python_datetime.datetime(2020, 1, 1) or [["2020-01-01T00:00:00Z", "2021-01-01T00:00:00Z"]] + + Returns: + - DataFrame: A Spark DataFrame containing the filtered items. If no filters are provided, + the DataFrame contains all items in the collection. + + Raises: + - RuntimeError: If there is an error loading the data or applying the filters, a RuntimeError + is raised with a message indicating the failure. 
+ """ + try: + df = self.spark.read.format("stac").load(self.collection_url) + + # Apply ID filters if provided + if ids: + if isinstance(ids, tuple): + ids = list(ids) + if isinstance(ids, str): + ids = [ids] + df = df.filter(df.id.isin(ids)) + + # Ensure bbox is a list of lists + if bbox and isinstance(bbox[0], float): + bbox = [bbox] + + # Handle datetime parameter + if datetime: + if isinstance(datetime, (str, python_datetime.datetime)): + datetime = [[str(datetime), str(datetime)]] + elif isinstance(datetime, list) and isinstance(datetime[0], str): + datetime = [datetime] + + df = self._apply_spatial_temporal_filters(df, bbox, datetime) + + # Limit the number of items if max_items is specified + if max_items is not None: + df = df.limit(max_items) + + return df + except Exception as e: + logging.error(f"Error getting filtered dataframe: {e}") + raise RuntimeError("Failed to get filtered dataframe") from e + + def save_to_geoparquet( + self, + *ids: Union[str, list], + output_path: str, + bbox: Optional[list] = None, + datetime: Optional[list] = None, + ) -> None: + """ + Loads the STAC DataFrame and saves it to Parquet format at the given output path. + + This method loads the collection data from the specified collection URL and applies + optional spatial and temporal filters to the data. The filtered data is then saved + to the specified output path in Parquet format. + + Parameters: + - output_path (str): The path where the Parquet file will be saved. + - spatial_extent (Optional[SpatialExtent]): A spatial extent object that defines the + bounding box for filtering the items. If not provided, no spatial filtering is applied. + - temporal_extent (Optional[TemporalExtent]): A temporal extent object that defines the + date-time range for filtering the items. If not provided, no temporal filtering is applied. + To match a single datetime, you can set the start and end datetime to the same value in the datetime. + Here is an example: [["2020-01-01T00:00:00Z", "2020-01-01T00:00:00Z"]] + + Raises: + - RuntimeError: If there is an error loading the data, applying the filters, or saving the + DataFrame to Parquet format, a RuntimeError is raised with a message indicating the failure. + """ + try: + df = self.get_dataframe(*ids, bbox=bbox, datetime=datetime) + df_geoparquet = self._convert_assets_schema(df) + df_geoparquet.write.format("geoparquet").save(output_path) + logging.info(f"DataFrame successfully saved to {output_path}") + except Exception as e: + logging.error(f"Error saving DataFrame to GeoParquet: {e}") + raise RuntimeError("Failed to save DataFrame to GeoParquet") from e + + @staticmethod + def _convert_assets_schema(df: DataFrame) -> DataFrame: + """ + Converts the schema of the assets column in the DataFrame to have a consistent structure. + + This function first identifies all unique keys in the assets column and then ensures that + each row in the DataFrame has these keys with appropriate values. + + The expected input schema of the loaded dataframe (df) can be found here: + https://sedona.apache.org/latest-snapshot/api/sql/Stac/#usage + + Parameters: + - df (DataFrame): The input DataFrame with an assets column. + + Returns: + - DataFrame: The DataFrame with a consistent schema for the assets column. 
+ """ + from pyspark.sql.functions import col, explode, struct + + # Explode the assets column to get all unique keys and their corresponding value struct + exploded_df = df.select(explode("assets").alias("key", "value")) + unique_keys = [ + row["key"] for row in exploded_df.select("key").distinct().collect() + ] + + # Create a new schema with all unique keys and their value struct + new_schema = struct( + [struct(col(f"assets.{key}.*")).alias(key) for key in unique_keys] + ) + + # Apply the new schema to the assets column + df = df.withColumn("assets", new_schema) + + return df + + def __str__(self): + return f"" diff --git a/python/sedona/stats/clustering/dbscan.py b/python/sedona/stats/clustering/dbscan.py index f1501963db..28b37d8bdc 100644 --- a/python/sedona/stats/clustering/dbscan.py +++ b/python/sedona/stats/clustering/dbscan.py @@ -34,6 +34,8 @@ def dbscan( geometry: Optional[str] = None, include_outliers: bool = True, use_spheroid=False, + is_core_column_name="isCore", + cluster_column_name="cluster", ): """Annotates a dataframe with a cluster label for each data record using the DBSCAN algorithm. @@ -49,6 +51,8 @@ def dbscan( include_outliers: whether to return outlier points. If True, outliers are returned with a cluster value of -1. Default is False use_spheroid: whether to use a cartesian or spheroidal distance calculation. Default is false + is_core_column_name: what the name of the column indicating if this is a core point should be. Default is "isCore" + cluster_column_name: what the name of the column indicating the cluster id should be. Default is "cluster" Returns: A PySpark DataFrame containing the cluster label for each row @@ -62,6 +66,8 @@ def dbscan( geometry, include_outliers, use_spheroid, + is_core_column_name, + cluster_column_name, ) return DataFrame(result_df, sedona) diff --git a/python/sedona/stats/outlier_detection/local_outlier_factor.py b/python/sedona/stats/outlier_detection/local_outlier_factor.py index 3050d216b7..7a29f5c508 100644 --- a/python/sedona/stats/outlier_detection/local_outlier_factor.py +++ b/python/sedona/stats/outlier_detection/local_outlier_factor.py @@ -30,6 +30,7 @@ def local_outlier_factor( geometry: Optional[str] = None, handle_ties: bool = False, use_spheroid=False, + result_column_name: str = "lof", ): """Annotates a dataframe with a column containing the local outlier factor for each data record. @@ -43,6 +44,7 @@ def local_outlier_factor( geometry: name of the geometry column handle_ties: whether to handle ties in the k-distance calculation. Default is false use_spheroid: whether to use a cartesian or spheroidal distance calculation. Default is false + result_column_name: the name of the column containing the lof for each row. 

     Returns:
         A PySpark DataFrame containing the lof for each row
@@ -55,6 +57,7 @@
         geometry,
         handle_ties,
         use_spheroid,
+        result_column_name,
     )

     return DataFrame(result_df, sedona)
diff --git a/python/sedona/stats/weighting.py b/python/sedona/stats/weighting.py
index 8a5fc7e07a..7b5eb9be9d 100644
--- a/python/sedona/stats/weighting.py
+++ b/python/sedona/stats/weighting.py
@@ -17,7 +17,7 @@

 """Weighting functions for spatial data."""

-from typing import Optional
+from typing import Optional, List

 from pyspark.sql import DataFrame, SparkSession

@@ -32,6 +32,8 @@ def add_distance_band_column(
     self_weight: float = 1.0,
     geometry: Optional[str] = None,
     use_spheroid: bool = False,
+    saved_attributes: Optional[List[str]] = None,
+    result_name: str = "weights",
 ) -> DataFrame:
     """Annotates a dataframe with a weights column containing the other records within the threshold and their weight.

@@ -51,7 +53,8 @@
         self_weight: the value to use for the self weight
         geometry: name of the geometry column
         use_spheroid: whether to use a cartesian or spheroidal distance calculation. Default is false
-
+        saved_attributes: the attributes to save in the neighbor column. Default is all columns.
+        result_name: the name of the resulting column. Default is 'weights'.
     Returns:
         The input DataFrame with a weight column added containing neighbors and their weights added to each
         row.
@@ -67,6 +70,8 @@
         float(self_weight),
         geometry,
         use_spheroid,
+        saved_attributes,
+        result_name,
     )

@@ -77,6 +82,8 @@ def add_binary_distance_band_column(
     include_self: bool = False,
     geometry: Optional[str] = None,
     use_spheroid: bool = False,
+    saved_attributes: Optional[List[str]] = None,
+    result_name: str = "weights",
 ) -> DataFrame:
     """Annotates a dataframe with a weights column containing the other records within the threshold and their weight.

@@ -93,6 +100,59 @@
         include_self: whether to include self in the list of neighbors
         geometry: name of the geometry column
         use_spheroid: whether to use a cartesian or spheroidal distance calculation. Default is false
+        saved_attributes: the attributes to save in the neighbor column. Default is all columns.
+        result_name: the name of the resulting column. Default is 'weights'.
+
+    Returns:
+        The input DataFrame with a weight column added containing neighbors and their weights (always 1) added to each
+        row.
+
+    """
+    sedona = SparkSession.getActiveSession()
+
+    return sedona._jvm.org.apache.sedona.stats.Weighting.addBinaryDistanceBandColumn(
+        dataframe._jdf,
+        float(threshold),
+        include_zero_distance_neighbors,
+        include_self,
+        geometry,
+        use_spheroid,
+        saved_attributes,
+        result_name,
+    )
+
+
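# Example (editor's sketch, not from the patch): using the new saved_attributes /
# result_name parameters of add_binary_distance_band_column. Assumes an active
# Sedona SparkSession named `spark`; note the function returns the result of the
# JVM call as shown in the patch, so treat the return value accordingly.
from sedona.stats.weighting import add_binary_distance_band_column

df = spark.createDataFrame(
    [(0, 1.0, 1.0), (1, 1.5, 1.5)], ["id", "x", "y"]
).selectExpr("id", "ST_Point(x, y) AS geometry")

# Keep only "id" in each neighbor struct and name the output column "nbrs".
weighted = add_binary_distance_band_column(
    df, 2.0, saved_attributes=["id"], result_name="nbrs"
)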
+def add_weighted_distance_band_column(
+    dataframe: DataFrame,
+    threshold: float,
+    alpha: float,
+    include_zero_distance_neighbors: bool = True,
+    include_self: bool = False,
+    self_weight: float = 1.0,
+    geometry: Optional[str] = None,
+    use_spheroid: bool = False,
+    saved_attributes: Optional[List[str]] = None,
+    result_name: str = "weights",
+) -> DataFrame:
+    """Annotates a dataframe with a weights column containing the other records within the threshold and their weight.
+
+    Weights will be distance^alpha. The dataframe should contain at least one GeometryType column. Rows must be unique. If
+    one geometry column is present it will be used automatically. If two are present, the one named 'geometry' will be
+    used. If more than one are present and neither is named 'geometry', the column name must be provided. The new column
+    will be named according to result_name (default 'weights').
+
+    Args:
+        dataframe: DataFrame with geometry column
+        threshold: Distance threshold for considering neighbors
+        alpha: alpha to use for inverse distance weights. Computation is dist^alpha. Default is -1.0
+        include_zero_distance_neighbors: whether to include neighbors that are 0 distance. If 0 distance neighbors are
+            included, their weights are infinity as per the floating point spec (divide by 0)
+        include_self: whether to include self in the list of neighbors
+        self_weight: the value to use for the self weight. Default is 1.0
+        geometry: name of the geometry column
+        use_spheroid: whether to use a cartesian or spheroidal distance calculation. Default is false
+        saved_attributes: the attributes to save in the neighbor column. Default is all columns.
+        result_name: the name of the resulting column. Default is 'weights'.

     Returns:
-        The input DataFrame with a weight column added containing neighbors and their weights (always 1) added to each
+        The input DataFrame with a weight column added containing neighbors and their weights (distance^alpha) added to each

@@ -100,11 +160,16 @@ def add_binary_distance_band_column(
     """
     sedona = SparkSession.getActiveSession()
+
-    return sedona._jvm.org.apache.sedona.stats.Weighting.addBinaryDistanceBandColumn(
+    return sedona._jvm.org.apache.sedona.stats.Weighting.addWeightedDistanceBandColumn(
         dataframe._jdf,
         float(threshold),
+        float(alpha),
         include_zero_distance_neighbors,
         include_self,
+        float(self_weight),
         geometry,
         use_spheroid,
+        saved_attributes,
+        result_name,
     )
diff --git a/python/sedona/utils/adapter.py b/python/sedona/utils/adapter.py
index b692fd2fdb..7b978e7212 100644
--- a/python/sedona/utils/adapter.py
+++ b/python/sedona/utils/adapter.py
@@ -29,16 +29,14 @@ class Adapter(metaclass=MultipleMeta):
     """
     Class which allow to convert between Spark DataFrame and SpatialRDD and reverse.
+    This class is used to convert between PySpark DataFrame and SpatialRDD. Schema
+    is lost during the conversion. This should be used if your data starts as a
+    SpatialRDD and you want to convert it to a DataFrame for further processing.
     """

     @staticmethod
     def _create_dataframe(jdf, sparkSession: SparkSession) -> DataFrame:
-        if hasattr(sparkSession, "_wrapped"):
-            # In Spark < 3.3, use the _wrapped SQLContext
-            return DataFrame(jdf, sparkSession._wrapped)
-        else:
-            # In Spark >= 3.3, use the session directly
-            return DataFrame(jdf, sparkSession)
+        return DataFrame(jdf, sparkSession)

     @classmethod
     def toRdd(cls, dataFrame: DataFrame) -> "JvmSpatialRDD":
diff --git a/python/sedona/utils/geoarrow.py b/python/sedona/utils/geoarrow.py
new file mode 100644
index 0000000000..b8ade8528b
--- /dev/null
+++ b/python/sedona/utils/geoarrow.py
@@ -0,0 +1,188 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# We may be able to achieve streaming rather than complete materialization by using
+# the ArrowStreamSerializer (instead of the ArrowCollectSerializer)
+
+
+from sedona.sql.types import GeometryType
+from sedona.sql.st_functions import ST_AsEWKB
+
+
+def dataframe_to_arrow(df, crs=None):
+    """
+    Collect a DataFrame as a PyArrow Table
+
+    In the output Table, geometry will be encoded as a GeoArrow extension type.
+    The resulting output is compatible with `lonboard.viz()`,
+    `geopandas.GeoDataFrame.from_arrow()`, or any library compatible with
+    GeoArrow extension types.
+
+    :param df: A Spark DataFrame
+    :param crs: A CRS-like object (e.g., `pyproj.CRS` or string interpretable by
+        `pyproj.CRS`). If provided, this will override any CRS present in the output
+        geometries. If omitted, the CRS will be inferred from the values present in
+        the output if exactly one CRS is present in the output.
+    :return: A PyArrow Table whose geometry columns carry GeoArrow WKB extension
+        types (or equivalent field metadata when geoarrow-types is unavailable).
+    """
+    import pyarrow as pa
+
+    col_is_geometry = [isinstance(f.dataType, GeometryType) for f in df.schema.fields]
+
+    if not any(col_is_geometry):
+        return dataframe_to_arrow_raw(df)
+
+    df_columns = list(df)
+    df_column_names = df.schema.fieldNames()
+    for i, is_geom in enumerate(col_is_geometry):
+        if is_geom:
+            df_columns[i] = ST_AsEWKB(df_columns[i]).alias(df_column_names[i])
+
+    df_projected = df.select(*df_columns)
+    table = dataframe_to_arrow_raw(df_projected)
+
+    try:
+        # Using geoarrow-types is the preferred mechanism for Arrow output.
+        # Using the extension type ensures that the type and its metadata will
+        # propagate through all pyarrow transformations.
+        import geoarrow.types as gat
+        from geoarrow.types.type_pyarrow import register_extension_types
+
+        register_extension_types()
+        spec = gat.wkb()
+
+        new_cols = [
+            wrap_geoarrow_extension(col, spec, crs) if is_geom else col
+            for is_geom, col in zip(col_is_geometry, table.columns)
+        ]
+
+        return pa.table(new_cols, table.column_names)
+    except ImportError:
+        # In the event that we don't have access to GeoArrow extension types,
+        # we can still add field metadata that will propagate through some types
+        # of operations (e.g., writing this table to a file or passing it to
+        # DuckDB as long as no intermediate transformations were applied).
+        new_fields = [
+            (
+                wrap_geoarrow_field(table.schema.field(i), table[i], crs)
+                if is_geom
+                else table.schema.field(i)
+            )
+            for i, is_geom in enumerate(col_is_geometry)
+        ]
+
+        return table.from_arrays(table.columns, schema=pa.schema(new_fields))
+
+
+def dataframe_to_arrow_raw(df):
+    """Backport of toArrow() (available in Spark 4.0)"""
+    from pyspark.sql.dataframe import DataFrame
+
+    assert isinstance(df, DataFrame)
+
+    jconf = df.sparkSession._jconf
+
+    from pyspark.sql.pandas.types import to_arrow_schema
+    from pyspark.sql.pandas.utils import require_minimum_pyarrow_version
+
+    require_minimum_pyarrow_version()
+    schema = to_arrow_schema(df.schema)
+
+    import pyarrow as pa
+
+    self_destruct = jconf.arrowPySparkSelfDestructEnabled()
+    batches = df._collect_as_arrow(split_batches=self_destruct)
+    table = pa.Table.from_batches(batches).cast(schema)
+    # Ensure only the table has a reference to the batches, so that
+    # self_destruct (if enabled) is effective
+    del batches
+    return table
+
+
+def wrap_geoarrow_extension(col, spec, crs):
+    if crs is None:
+        crs = unique_srid_from_ewkb(col)
+    elif not hasattr(crs, "to_json"):
+        import pyproj
+
+        crs = pyproj.CRS(crs)
+
+    return spec.override(crs=crs).to_pyarrow().wrap_array(col)
+
+
+def wrap_geoarrow_field(field, col, crs):
+    if crs is None:
+        crs = unique_srid_from_ewkb(col)
+
+    if crs is not None:
+        metadata = f'"crs": {crs_to_json(crs)}'
+    else:
+        metadata = ""
+
+    return field.with_metadata(
+        {
+            "ARROW:extension:name": "geoarrow.wkb",
+            "ARROW:extension:metadata": "{" + metadata + "}",
+        }
+    )
+
+
+def crs_to_json(crs):
+    if hasattr(crs, "to_json"):
+        return crs.to_json()
+    else:
+        import pyproj
+
+        return pyproj.CRS(crs).to_json()
+
+
+def unique_srid_from_ewkb(obj):
+    import pyarrow as pa
+    import pyarrow.compute as pc
+
+    if len(obj) == 0:
+        return None
+
+    # Output shouldn't have mixed endian here
+    endian = pc.binary_slice(obj, 0, 1).unique()
+    if len(endian) != 1:
+        return None
+
+    # WKB Z high byte is 0x80
+    # WKB M high byte is 0x40
+    # EWKB SRID high byte is 0x20
+    # High bytes where the SRID is set would be
+    # [0x20, 0x20 | 0x40, 0x20 | 0x80, 0x20 | 0x40 | 0x80]
+    # == [0x20, 0x60, 0xa0, 0xe0]
+    is_little_endian = endian[0].as_py() == b"\x01"
+    high_byte = (
+        pc.binary_slice(obj, 4, 5) if is_little_endian else pc.binary_slice(obj, 1, 2)
+    )
+    has_srid = pc.is_in(high_byte, pa.array([b"\x20", b"\x60", b"\xa0", b"\xe0"]))
+    unique_srids = (
+        pc.if_else(has_srid, pc.binary_slice(obj, 5, 9), None).unique().drop_null()
+    )
+    if len(unique_srids) != 1:
+        return None
+
+    srid_bytes = unique_srids[0].as_py()
+    endian = "little" if is_little_endian else "big"
+    epsg_code = int.from_bytes(srid_bytes, endian)
+
+    import pyproj
+
+    return pyproj.CRS(f"EPSG:{epsg_code}")
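# Example (editor's sketch, not from the patch): collecting a Sedona DataFrame as
# a GeoArrow-annotated PyArrow Table with the helper above. Assumes an active
# Sedona SparkSession named `spark` and that pyarrow is installed; geoarrow-types
# is optional (field metadata is used as the fallback, as the code notes).
from sedona.utils.geoarrow import dataframe_to_arrow

df = spark.sql("SELECT ST_Point(1.0, 2.0) AS geom, 'a' AS label")
table = dataframe_to_arrow(df)  # geometry columns become GeoArrow WKB

# With GeoPandas >= 1.0 the table can round-trip without pandas conversion:
# gdf = geopandas.GeoDataFrame.from_arrow(table)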
diff --git a/python/sedona/utils/structured_adapter.py b/python/sedona/utils/structured_adapter.py
new file mode 100644
index 0000000000..bc3b959363
--- /dev/null
+++ b/python/sedona/utils/structured_adapter.py
@@ -0,0 +1,126 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from pyspark.sql import DataFrame, SparkSession
+from pyspark.sql.types import StructType
+
+from sedona.core.SpatialRDD.spatial_rdd import SpatialRDD
+from sedona.core.spatialOperator.rdd import SedonaPairRDD
+
+
+class StructuredAdapter:
+    """
+    Class which allows conversion between Spark DataFrames and SpatialRDDs, in
+    both directions. Unlike Adapter, the schema is preserved during the
+    conversion. Use this class if your data starts as a SpatialRDD and you want
+    to convert it to a DataFrame without losing the schema.
+    """
+
+    @staticmethod
+    def _create_dataframe(jdf, sparkSession: SparkSession) -> DataFrame:
+        return DataFrame(jdf, sparkSession)
+
+    @classmethod
+    def toSpatialRdd(
+        cls, dataFrame: DataFrame, geometryFieldName: str = None
+    ) -> SpatialRDD:
+        """
+        Convert a DataFrame to a SpatialRDD
+        :param dataFrame: the DataFrame to convert
+        :param geometryFieldName: name of the geometry column; inferred when omitted
+        :return: a SpatialRDD
+        """
+        sc = dataFrame._sc
+        jvm = sc._jvm
+        if geometryFieldName is None:
+            srdd = jvm.StructuredAdapter.toSpatialRdd(dataFrame._jdf)
+        else:
+            srdd = jvm.StructuredAdapter.toSpatialRdd(dataFrame._jdf, geometryFieldName)
+
+        spatial_rdd = SpatialRDD(sc)
+        spatial_rdd.set_srdd(srdd)
+
+        return spatial_rdd
+
+    @classmethod
+    def toDf(cls, spatialRDD: SpatialRDD, sparkSession: SparkSession) -> DataFrame:
+        """
+        Convert a SpatialRDD to a DataFrame
+        :param spatialRDD: the SpatialRDD to convert
+        :param sparkSession: the active SparkSession
+        :return: a DataFrame
+        """
+        sc = spatialRDD._sc
+        jvm = sc._jvm
+
+        jdf = jvm.StructuredAdapter.toDf(spatialRDD._srdd, sparkSession._jsparkSession)
+
+        df = StructuredAdapter._create_dataframe(jdf, sparkSession)
+
+        return df
+
+    @classmethod
+    def toSpatialPartitionedDf(
+        cls, spatialRDD: SpatialRDD, sparkSession: SparkSession
+    ) -> DataFrame:
+        """
+        Convert a SpatialRDD to a DataFrame. This DataFrame will be spatially partitioned
+        :param spatialRDD: the SpatialRDD to convert
+        :param sparkSession: the active SparkSession
+        :return: a spatially partitioned DataFrame
+        """
+        sc = spatialRDD._sc
+        jvm = sc._jvm
+
+        jdf = jvm.StructuredAdapter.toSpatialPartitionedDf(
+            spatialRDD._srdd, sparkSession._jsparkSession
+        )
+
+        df = StructuredAdapter._create_dataframe(jdf, sparkSession)
+
+        return df
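# Example (editor's sketch, not from the patch): schema-preserving round trip with
# the new StructuredAdapter, mirroring the tests added later in this patch.
# Assumes an active Sedona SparkSession named `spark`.
from sedona.core.enums import GridType
from sedona.utils.structured_adapter import StructuredAdapter

df = spark.sql("SELECT 1 AS id, ST_MakePoint(1, 1) AS geom")
srdd = StructuredAdapter.toSpatialRdd(df, "geom")
srdd.analyze()
srdd.spatialPartitioning(GridType.KDBTREE, 1)

# The id/geom schema survives the round trip, unlike the legacy Adapter.
partitioned_df = StructuredAdapter.toSpatialPartitionedDf(srdd, spark)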
+    @classmethod
+    def pairRddToDf(
+        cls,
+        rawPairRDD: SedonaPairRDD,
+        left_schema: StructType,
+        right_schema: StructType,
+        sparkSession: SparkSession,
+    ) -> DataFrame:
+        """
+        Convert a raw pair RDD to a DataFrame. This is useful when you have a spatial join result.
+        Args:
+            rawPairRDD: the raw pair RDD produced by a spatial join
+            left_schema: schema of the records on the left side of the join
+            right_schema: schema of the records on the right side of the join
+            sparkSession: the active SparkSession
+
+        Returns:
+            A DataFrame containing the joined records with both schemas
+        """
+        jvm = sparkSession._jvm
+        left_schema_json = left_schema.json()
+        right_schema_json = right_schema.json()
+        jdf = jvm.StructuredAdapter.toDf(
+            rawPairRDD.jsrdd,
+            left_schema_json,
+            right_schema_json,
+            sparkSession._jsparkSession,
+        )
+        df = StructuredAdapter._create_dataframe(jdf, sparkSession)
+        return df
diff --git a/python/tests/spatial_rdd/test_spatial_rdd.py b/python/tests/spatial_rdd/test_spatial_rdd.py
index ae1dd2f628..4230b87c53 100644
--- a/python/tests/spatial_rdd/test_spatial_rdd.py
+++ b/python/tests/spatial_rdd/test_spatial_rdd.py
@@ -26,6 +26,7 @@

 from sedona.core.enums import FileDataSplitter, GridType, IndexType
 from sedona.core.formatMapper.geo_json_reader import GeoJsonReader
+from sedona.utils.adapter import Adapter
 from sedona.core.geom.envelope import Envelope
 from sedona.core.SpatialRDD import PointRDD
@@ -126,6 +127,10 @@ def test_get_partitioner(self):
         else:
             assert spatial_rdd.getPartitioner().name == "FlatGridPartitioner"

+        grids = spatial_rdd.getPartitioner().getGrids()
+        assert len(grids) > 0
+        assert all(isinstance(grid, Envelope) for grid in grids)
+
     def test_get_raw_spatial_rdd(self):
         spatial_rdd = self.create_spatial_rdd()
         assert isinstance(spatial_rdd.getRawSpatialRDD(), RDD)
@@ -154,3 +159,24 @@ def test_partition_tree(self):

         spatial_rdd.spatialPartitioning(GridType.QUADTREE)
         print(spatial_rdd.getPartitioner())
+
+    def test_partition_unique(self):
+        grids = [
+            Envelope(0.0, 10.0, 0.0, 10.0),
+            Envelope(10.0, 20.0, 0.0, 10.0),
+            Envelope(0.0, 10.0, 10.0, 20.0),
+            Envelope(10.0, 20.0, 10.0, 20.0),
+        ]
+
+        df = self.spark.createDataFrame(
+            [("POLYGON ((5 5, 15 5, 15 15, 5 15, 5 5))",)], ["wkt"]
+        ).selectExpr("ST_GeomFromText(wkt) as geometry")
+        spatial_rdd = Adapter.toSpatialRdd(df, "geometry")
+
+        spatial_rdd.spatialPartitioning(grids)
+        assert spatial_rdd.spatialPartitionedRDD.count() == 5
+        assert spatial_rdd.getPartitioner().getGrids() == grids
+
+        spatial_rdd.spatialPartitioningWithoutDuplicates(grids)
+        assert spatial_rdd.spatialPartitionedRDD.count() == 1
+        assert spatial_rdd.getPartitioner().getGrids() == grids
diff --git a/python/tests/sql/test_dataframe_api.py b/python/tests/sql/test_dataframe_api.py
index 5683d6d29c..de65f6f0f4 100644
--- a/python/tests/sql/test_dataframe_api.py
+++ b/python/tests/sql/test_dataframe_api.py
@@ -15,6 +15,9 @@
 # specific language governing permissions and limitations
 # under the License.
from math import radians +import os +import threading +import concurrent.futures from typing import Callable, Tuple import pytest @@ -267,6 +270,27 @@ "", "LINESTRING (0 0, 1 1, 1 0, 2 0, 3 0, 4 0, 5 0)", ), + ( + stf.ST_LabelPoint, + ("geom",), + "triangle_geom", + "", + "POINT (0.6666666666666666 0.3333333333333333)", + ), + ( + stf.ST_LabelPoint, + ("geom", 1), + "triangle_geom", + "", + "POINT (0.6666666666666666 0.3333333333333333)", + ), + ( + stf.ST_LabelPoint, + ("geom", 1, 0.5), + "triangle_geom", + "", + "POINT (0.6666666666666666 0.3333333333333333)", + ), ( stf.ST_Angle, ( @@ -662,6 +686,14 @@ "", "LINESTRING (0 0, 1 0, 1 1, 0 0)", ), + (stf.ST_LineSegments, ("line",), "linestring_geom", "array_size(geom)", 5), + ( + stf.ST_LineSegments, + ("geom", True), + "polygon_unsimplified", + "array_size(geom)", + 0, + ), ( stf.ST_LineSubstring, ("line", 0.5, 1.0), @@ -740,6 +772,15 @@ "ceil(geom)", 378794, ), + (stf.ST_Perimeter2D, ("geom",), "triangle_geom", "", 3.414213562373095), + (stf.ST_Perimeter2D, ("geom", True), "triangle_geom", "ceil(geom)", 378794), + ( + stf.ST_Perimeter2D, + (lambda: stf.ST_SetSRID("geom", 4326), True), + "triangle_geom", + "ceil(geom)", + 378794, + ), ( stf.ST_Points, ("line",), @@ -1287,6 +1328,7 @@ (stf.ST_IsValidDetail, (None,)), (stf.ST_IsValid, (None,)), (stf.ST_IsValidReason, (None,)), + (stf.ST_LabelPoint, (None,)), (stf.ST_Length, (None,)), (stf.ST_Length2D, (None,)), (stf.ST_LineFromMultiPoint, (None,)), @@ -1692,3 +1734,43 @@ def test_call_function_with_wrong_type(self, func, args): match=f"Incorrect argument type: [A-Za-z_0-9]+ for {func.__name__} should be [A-Za-z0-9\\[\\]_, ]+ but received [A-Za-z0-9_]+.", ): func(*args) + + def test_multi_thread(self): + df = self.spark.range(0, 100) + + def run_spatial_query(): + result = df.select( + stf.ST_Buffer(stc.ST_Point("id", f.col("id") + 1), 1.0).alias("geom") + ).collect() + assert len(result) == 100 + + # Create and run 4 threads + with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: + futures = [executor.submit(run_spatial_query) for _ in range(4)] + concurrent.futures.wait(futures) + for future in futures: + future.result() + + @pytest.mark.skipif( + os.getenv("SPARK_REMOTE") is not None, + reason="Checkpoint dir is not available in Spark Connect", + ) + def test_dbscan(self): + df = self.spark.createDataFrame([{"id": 1, "x": 2, "y": 3}]).withColumn( + "geometry", f.expr("ST_Point(x, y)") + ) + + df.withColumn("dbscan", ST_DBSCAN("geometry", 1.0, 2, False)).collect() + + @pytest.mark.skipif( + os.getenv("SPARK_REMOTE") is not None, + reason="Checkpoint dir is not available in Spark Connect", + ) + def test_lof(self): + df = self.spark.createDataFrame([{"id": 1, "x": 2, "y": 3}]).withColumn( + "geometry", f.expr("ST_Point(x, y)") + ) + + df.withColumn( + "localOutlierFactor", ST_LocalOutlierFactor("geometry", 2, False) + ).collect() diff --git a/python/tests/sql/test_function.py b/python/tests/sql/test_function.py index 6ea0d944bd..96f31e4d94 100644 --- a/python/tests/sql/test_function.py +++ b/python/tests/sql/test_function.py @@ -54,6 +54,25 @@ class TestPredicateJoin(TestBase): ] ) + def test_ST_LabelPoint(self): + geom_expr = "ST_GeomFromWKT('POLYGON ((-112.637484 33.440546, -112.546852 33.477209, -112.489177 33.550488, -112.41777 33.751684, -111.956371 33.719707, -111.766868 33.616843, -111.775107 33.527595, -111.640533 33.504695, -111.440044 33.463462, -111.415326 33.374055, -111.514197 33.309809, -111.643279 33.222542, -111.893203 33.174278, -111.96461 33.250109, 
-112.123903 33.261593, -112.252985 33.35341, -112.406784 33.346527, -112.667694 33.316695, -112.637484 33.440546))')" + function_df = self.spark.sql(f"select ST_LabelPoint({geom_expr})") + actual = function_df.take(1)[0][0] + expected = "POINT (-112.04278737349767 33.46420809489905)" + self.assert_geometry_almost_equal(expected, actual, 0.1) + + geom_expr = "ST_GeomFromWKT('GEOMETRYCOLLECTION(POLYGON ((-112.840785 33.435962, -112.840785 33.708284, -112.409597 33.708284, -112.409597 33.435962, -112.840785 33.435962)), POLYGON ((-112.309264 33.398167, -112.309264 33.746007, -111.787444 33.746007, -111.787444 33.398167, -112.309264 33.398167)))')" + function_df = self.spark.sql(f"select ST_LabelPoint({geom_expr}, 1)") + actual = function_df.take(1)[0][0] + expected = "POINT (-112.04835399999999 33.57208699999999)" + self.assert_geometry_almost_equal(expected, actual, 0.1) + + geom_expr = "ST_GeomFromWKT('POLYGON ((-112.654072 33.114485, -112.313516 33.653431, -111.63515 33.314399, -111.497829 33.874913, -111.692825 33.431378, -112.376684 33.788215, -112.654072 33.114485))')" + function_df = self.spark.sql(f"select ST_LabelPoint({geom_expr}, 1, 0.1)") + actual = function_df.take(1)[0][0] + expected = "POINT (-112.0722602222832 33.53914975012836)" + self.assert_geometry_almost_equal(expected, actual, 0.1) + def test_st_concave_hull(self): polygon_wkt_df = ( self.spark.read.format("csv") @@ -1666,6 +1685,25 @@ def test_st_perimeter(self): expected = 443770.91724830196 assert expected == actual + def test_st_perimeter2D(self): + baseDf = self.spark.sql( + "SELECT ST_GeomFromWKT('POLYGON((743238 2967416,743238 2967450,743265 2967450,743265.625 2967416,743238 2967416))') AS geom" + ) + actual = baseDf.selectExpr("ST_Perimeter2D(geom)").take(1)[0][0] + expected = 122.63074400009504 + assert actual == expected + + baseDf = self.spark.sql( + "SELECT ST_GeomFromWKT('POLYGON ((0 0, 0 1, 1 1, 1 0, 0 0))', 4326) AS geom" + ) + actual = baseDf.selectExpr("ST_Perimeter2D(geom, true)").first()[0] + expected = 443770.91724830196 + assert expected == actual + + actual = baseDf.selectExpr("ST_Perimeter2D(geom, true, false)").first()[0] + expected = 443770.91724830196 + assert expected == actual + def test_st_points(self): # Given geometry_df = self.spark.createDataFrame( @@ -2124,6 +2162,19 @@ def test_st_buildarea(self): ) assert areal_geom.take(1)[0][0] == expected_geom + def test_st_line_segments(self): + baseDf = self.spark.sql( + "SELECT ST_GeomFromWKT('LINESTRING(120 140, 60 120, 30 20)') AS line, ST_GeomFromWKT('POLYGON ((0 0, 0 1, 1 0, 0 0))') AS poly" + ) + resultSize = baseDf.selectExpr( + "array_size(ST_LineSegments(line, false))" + ).first()[0] + expected = 2 + assert expected == resultSize + + resultSize = baseDf.selectExpr("array_size(ST_LineSegments(poly))").first()[0] + assert 0 == resultSize + def test_st_line_from_multi_point(self): test_cases = { "'POLYGON((-1 0 0, 1 0 0, 0 0 1, 0 1 0, -1 0 0))'": None, diff --git a/python/tests/sql/test_structured_adapter.py b/python/tests/sql/test_structured_adapter.py new file mode 100644 index 0000000000..f14a48f33e --- /dev/null +++ b/python/tests/sql/test_structured_adapter.py @@ -0,0 +1,82 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import glob +import tempfile + +from pyspark.sql import DataFrame + +from sedona.core.SpatialRDD import CircleRDD +from sedona.core.enums import GridType +from sedona.core.spatialOperator import JoinQueryRaw +from sedona.utils.structured_adapter import StructuredAdapter +from tests.test_base import TestBase + + +class TestStructuredAdapter(TestBase): + + def test_df_rdd(self): + spatial_df: DataFrame = self.spark.sql("select ST_MakePoint(1, 1) as geom") + srdd = StructuredAdapter.toSpatialRdd(spatial_df, "geom") + spatial_df = StructuredAdapter.toDf(srdd, self.spark) + assert spatial_df.count() == 1 + + def test_spatial_partitioned_df(self): + spatial_df: DataFrame = self.spark.sql("select ST_MakePoint(1, 1) as geom") + srdd = StructuredAdapter.toSpatialRdd(spatial_df, "geom") + srdd.analyze() + srdd.spatialPartitioning(GridType.KDBTREE, 1) + spatial_df = StructuredAdapter.toSpatialPartitionedDf(srdd, self.spark) + assert spatial_df.count() == 1 + + def test_distance_join_result_to_dataframe(self): + spatial_df: DataFrame = self.spark.sql("select ST_MakePoint(1, 1) as geom") + schema = spatial_df.schema + srdd = StructuredAdapter.toSpatialRdd(spatial_df, "geom") + srdd.analyze() + + circle_rdd = CircleRDD(srdd, 0.001) + + srdd.spatialPartitioning(GridType.QUADTREE) + circle_rdd.spatialPartitioning(srdd.getPartitioner()) + + join_result_pair_rdd = JoinQueryRaw.DistanceJoinQueryFlat( + srdd, circle_rdd, False, True + ) + + join_result_df = StructuredAdapter.pairRddToDf( + join_result_pair_rdd, schema, schema, self.spark + ) + assert join_result_df.count() == 1 + + def test_spatial_partitioned_write(self): + xys = [(i, i // 100, i % 100) for i in range(1_000)] + df = self.spark.createDataFrame(xys, ["id", "x", "y"]).selectExpr( + "id", "ST_Point(x, y) AS geom" + ) + + rdd = StructuredAdapter.toSpatialRdd(df, "geom") + rdd.analyze() + rdd.spatialPartitioningWithoutDuplicates(GridType.KDBTREE, num_partitions=16) + n_spatial_partitions = rdd.spatialPartitionedRDD.getNumPartitions() + assert n_spatial_partitions >= 16 + + partitioned_df = StructuredAdapter.toSpatialPartitionedDf(rdd, self.spark) + + with tempfile.TemporaryDirectory() as td: + out = td + "/out" + partitioned_df.write.format("geoparquet").save(out) + assert len(glob.glob(out + "/*.parquet")) == n_spatial_partitions diff --git a/python/tests/stac/__init__.py b/python/tests/stac/__init__.py new file mode 100644 index 0000000000..a67d5ea255 --- /dev/null +++ b/python/tests/stac/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/python/tests/stac/test_client.py b/python/tests/stac/test_client.py new file mode 100644 index 0000000000..5c6192258a --- /dev/null +++ b/python/tests/stac/test_client.py @@ -0,0 +1,146 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from sedona.stac.client import Client +from pyspark.sql import DataFrame + +from tests.test_base import TestBase + +STAC_URLS = { + "PLANETARY-COMPUTER": "https://planetarycomputer.microsoft.com/api/stac/v1" +} + + +class TestStacClient(TestBase): + def test_collection_client(self) -> None: + client = Client.open(STAC_URLS["PLANETARY-COMPUTER"]) + items = client.search( + collection_id="aster-l1t", + bbox=[-100.0, -72.0, 105.0, -69.0], + datetime=["2006-01-01T00:00:00Z", "2007-01-01T00:00:00Z"], + return_dataframe=False, + ) + assert items is not None + assert len(list(items)) == 2 + + def test_search_with_ids(self) -> None: + client = Client.open(STAC_URLS["PLANETARY-COMPUTER"]) + items = client.search( + *["AST_L1T_00312272006020322_20150518201805", "item2"], + collection_id="aster-l1t", + return_dataframe=False, + ) + assert items is not None + assert len(list(items)) == 1 + + def test_search_with_single_id(self) -> None: + client = Client.open(STAC_URLS["PLANETARY-COMPUTER"]) + items = client.search( + "AST_L1T_00312272006020322_20150518201805", + collection_id="aster-l1t", + return_dataframe=False, + ) + assert items is not None + assert len(list(items)) == 1 + + def test_search_with_bbox_and_datetime(self) -> None: + client = Client.open(STAC_URLS["PLANETARY-COMPUTER"]) + items = client.search( + collection_id="aster-l1t", + bbox=[-180.0, -90.0, 180.0, 90.0], + datetime=["2006-01-01T00:00:00Z", "2007-01-01T00:00:00Z"], + return_dataframe=False, + ) + assert items is not None + assert len(list(items)) > 0 + + def test_search_with_multiple_bboxes_and_intervals(self) -> None: + client = Client.open(STAC_URLS["PLANETARY-COMPUTER"]) + items = client.search( + collection_id="aster-l1t", + bbox=[ + [90, -73, 105, -69], + [-180.0, -90.0, -170.0, -80.0], + [-100.0, -72.0, -90.0, -62.0], + ], + datetime=[["2006-12-01T00:00:00Z", "2006-12-27T03:00:00Z"]], + return_dataframe=False, + ) + assert items is not None + assert len(list(items)) == 4 + + def test_search_with_bbox_and_non_overlapping_intervals(self) -> None: + client = Client.open(STAC_URLS["PLANETARY-COMPUTER"]) + items = client.search( + 
collection_id="aster-l1t", + bbox=[-180.0, -90.0, 180.0, 90.0], + datetime=[ + ["2006-01-01T00:00:00Z", "2006-06-01T00:00:00Z"], + ["2006-07-01T00:00:00Z", "2007-01-01T00:00:00Z"], + ], + return_dataframe=False, + ) + assert items is not None + assert len(list(items)) == 10 + + def test_search_with_max_items(self) -> None: + client = Client.open(STAC_URLS["PLANETARY-COMPUTER"]) + items = client.search( + collection_id="aster-l1t", + bbox=[-180.0, -90.0, 180.0, 90.0], + datetime=["2006-01-01T00:00:00Z", "2007-01-01T00:00:00Z"], + max_items=5, + return_dataframe=False, + ) + assert items is not None + assert len(list(items)) == 5 + + def test_search_with_single_datetime(self) -> None: + from datetime import datetime + + client = Client.open(STAC_URLS["PLANETARY-COMPUTER"]) + items = client.search( + collection_id="aster-l1t", + bbox=[-180.0, -90.0, 180.0, 90.0], + datetime=datetime(2006, 12, 26, 18, 3, 22), + return_dataframe=False, + ) + assert items is not None + assert len(list(items)) == 0 + + def test_search_with_YYYY(self) -> None: + from datetime import datetime + + client = Client.open(STAC_URLS["PLANETARY-COMPUTER"]) + items = client.search( + collection_id="aster-l1t", + bbox=[-180.0, -90.0, 180.0, 90.0], + datetime="2006", + return_dataframe=False, + ) + assert items is not None + assert len(list(items)) == 10 + + def test_search_with_return_dataframe(self) -> None: + client = Client.open(STAC_URLS["PLANETARY-COMPUTER"]) + df = client.search( + collection_id="aster-l1t", + bbox=[-180.0, -90.0, 180.0, 90.0], + datetime=["2006-01-01T00:00:00Z", "2007-01-01T00:00:00Z"], + ) + assert df is not None + assert isinstance(df, DataFrame) diff --git a/python/tests/stac/test_collection_client.py b/python/tests/stac/test_collection_client.py new file mode 100644 index 0000000000..c30105a4eb --- /dev/null +++ b/python/tests/stac/test_collection_client.py @@ -0,0 +1,189 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
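+#
+# The tests below exercise the collection-level API end to end. As a minimal
+# usage sketch (assuming the Planetary Computer endpoint used throughout these
+# tests stays reachable and the "aster-l1t" collection keeps its current
+# contents), the same calls look like this outside a test:
+#
+#   client = Client.open("https://planetarycomputer.microsoft.com/api/stac/v1")
+#   collection = client.get_collection("aster-l1t")
+#   df = collection.get_dataframe(
+#       bbox=[[-180.0, -90.0, 180.0, 90.0]],
+#       datetime=[["2006-01-01T00:00:00Z", "2007-01-01T00:00:00Z"]],
+#   )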
+ +from sedona.stac.client import Client +from sedona.stac.collection_client import CollectionClient + +from tests.test_base import TestBase + +STAC_URLS = { + "PLANETARY-COMPUTER": "https://planetarycomputer.microsoft.com/api/stac/v1" +} + + +class TestStacReader(TestBase): + def test_collection_client(self) -> None: + client = Client.open(STAC_URLS["PLANETARY-COMPUTER"]) + collection = client.get_collection("aster-l1t") + + assert isinstance(collection, CollectionClient) + assert str(collection) == "" + + def test_get_dataframe_no_filters(self) -> None: + client = Client.open(STAC_URLS["PLANETARY-COMPUTER"]) + collection = client.get_collection("aster-l1t") + df = collection.get_dataframe() + assert df is not None + assert df.count() == 10 + + def test_get_dataframe_with_spatial_extent(self) -> None: + client = Client.open(STAC_URLS["PLANETARY-COMPUTER"]) + collection = client.get_collection("aster-l1t") + bbox = [[-180.0, -90.0, 180.0, 90.0]] + df = collection.get_dataframe(bbox=bbox) + assert df is not None + assert df.count() > 0 + + def test_get_dataframe_with_temporal_extent(self) -> None: + client = Client.open(STAC_URLS["PLANETARY-COMPUTER"]) + collection = client.get_collection("aster-l1t") + datetime = [["2006-01-01T00:00:00Z", "2007-01-01T00:00:00Z"]] + df = collection.get_dataframe(datetime=datetime) + assert df is not None + assert df.count() > 0 + + def test_get_dataframe_with_both_extents(self) -> None: + client = Client.open(STAC_URLS["PLANETARY-COMPUTER"]) + collection = client.get_collection("aster-l1t") + bbox = [[-180.0, -90.0, 180.0, 90.0]] + datetime = [["2006-01-01T00:00:00Z", "2007-01-01T00:00:00Z"]] + df = collection.get_dataframe(bbox=bbox, datetime=datetime) + assert df is not None + assert df.count() > 0 + + def test_get_items_with_spatial_extent(self) -> None: + client = Client.open(STAC_URLS["PLANETARY-COMPUTER"]) + collection = client.get_collection("aster-l1t") + bbox = [[-100.0, -72.0, 105.0, -69.0]] + items = list(collection.get_items(bbox=bbox)) + assert items is not None + assert len(items) == 2 + + def test_get_items_with_temporal_extent(self) -> None: + client = Client.open(STAC_URLS["PLANETARY-COMPUTER"]) + collection = client.get_collection("aster-l1t") + datetime = [["2006-12-01T00:00:00Z", "2006-12-27T02:00:00Z"]] + items = list(collection.get_items(datetime=datetime)) + assert items is not None + assert len(items) == 6 + + def test_get_items_with_both_extents(self) -> None: + client = Client.open(STAC_URLS["PLANETARY-COMPUTER"]) + collection = client.get_collection("aster-l1t") + bbox = [[90, -73, 105, -69]] + datetime = [["2006-12-01T00:00:00Z", "2006-12-27T03:00:00Z"]] + items = list(collection.get_items(bbox=bbox, datetime=datetime)) + assert items is not None + assert len(items) == 4 + + def test_get_items_with_multiple_bboxes_and_interval(self) -> None: + client = Client.open(STAC_URLS["PLANETARY-COMPUTER"]) + collection = client.get_collection("aster-l1t") + bbox = [ + [90, -73, 105, -69], # Bounding box 1 + [ + -180.0, + -90.0, + -170.0, + -80.0, + ], # Bounding box 2 (non-overlapping with bbox 1) + [ + -100.0, + -72.0, + -90.0, + -62.0, + ], # Bounding box 3 (non-overlapping with bbox 1 and 2) + ] + datetime = [["2006-12-01T00:00:00Z", "2006-12-27T03:00:00Z"]] + items = list(collection.get_items(bbox=bbox, datetime=datetime)) + assert items is not None + assert len(items) == 4 + + def test_get_items_with_ids(self) -> None: + client = Client.open(STAC_URLS["PLANETARY-COMPUTER"]) + collection = client.get_collection("aster-l1t") + ids = 
["AST_L1T_00312272006020322_20150518201805", "item2", "item3"] + items = list(collection.get_items(*ids)) + assert items is not None + assert len(items) == 1 + for item in items: + assert item.id in ids + + def test_get_items_with_id(self) -> None: + client = Client.open(STAC_URLS["PLANETARY-COMPUTER"]) + collection = client.get_collection("aster-l1t") + items = list(collection.get_items("AST_L1T_00312272006020322_20150518201805")) + assert items is not None + assert len(items) == 1 + + def test_get_items_with_bbox_and_non_overlapping_intervals(self) -> None: + client = Client.open(STAC_URLS["PLANETARY-COMPUTER"]) + collection = client.get_collection("aster-l1t") + bbox = [[-180.0, -90.0, 180.0, 90.0]] + datetime = [ + ["2006-01-01T00:00:00Z", "2006-06-01T00:00:00Z"], + ["2006-07-01T00:00:00Z", "2007-01-01T00:00:00Z"], + ] + items = list(collection.get_items(bbox=bbox, datetime=datetime)) + assert items is not None + assert len(items) == 10 + + def test_get_items_with_bbox_and_interval(self) -> None: + client = Client.open(STAC_URLS["PLANETARY-COMPUTER"]) + collection = client.get_collection("aster-l1t") + bbox = [-180.0, -90.0, 180.0, 90.0] + interval = ["2006-01-01T00:00:00Z", "2007-01-01T00:00:00Z"] + items = list(collection.get_items(bbox=bbox, datetime=interval)) + assert items is not None + assert len(items) > 0 + + def test_get_dataframe_with_bbox_and_interval(self) -> None: + client = Client.open(STAC_URLS["PLANETARY-COMPUTER"]) + collection = client.get_collection("aster-l1t") + bbox = [-180.0, -90.0, 180.0, 90.0] + interval = ["2006-01-01T00:00:00Z", "2007-01-01T00:00:00Z"] + df = collection.get_dataframe(bbox=bbox, datetime=interval) + assert df is not None + assert df.count() > 0 + + def test_save_to_geoparquet(self) -> None: + client = Client.open(STAC_URLS["PLANETARY-COMPUTER"]) + collection = client.get_collection("aster-l1t") + + # Create a temporary directory for the output path and clean it up after the test + import tempfile + + with tempfile.TemporaryDirectory() as tmpdirname: + output_path = f"{tmpdirname}/test_geoparquet_output" + + # Define spatial and temporal extents + bbox = [[-180.0, -90.0, 180.0, 90.0]] + datetime = [["2006-01-01T00:00:00Z", "2007-01-01T00:00:00Z"]] + + # Call the method to save the DataFrame to GeoParquet + collection.save_to_geoparquet( + output_path=output_path, bbox=bbox, datetime=datetime + ) + + # Check if the file was created + import os + + assert os.path.exists(output_path), "GeoParquet file was not created" + + # Optionally, you can load the file back and check its contents + df_loaded = collection.spark.read.format("geoparquet").load(output_path) + assert df_loaded.count() == 10, "Loaded GeoParquet file is empty" diff --git a/python/tests/test_base.py b/python/tests/test_base.py index 84f27356f6..2769a93cdd 100644 --- a/python/tests/test_base.py +++ b/python/tests/test_base.py @@ -26,6 +26,7 @@ from sedona.utils.decorators import classproperty SPARK_REMOTE = os.getenv("SPARK_REMOTE") +EXTRA_JARS = os.getenv("SEDONA_PYTHON_EXTRA_JARS") from shapely import wkt from shapely.geometry.base import BaseGeometry @@ -36,6 +37,10 @@ class TestBase: @classproperty def spark(self): if not hasattr(self, "__spark"): + # This lets a caller override the value of SPARK_HOME to just use whatever + # is provided by pyspark. Otherwise, export SPARK_HOME="" has no effect. 
+ if "SPARK_HOME" in os.environ and not os.environ["SPARK_HOME"]: + del os.environ["SPARK_HOME"] builder = SedonaContext.builder() if SPARK_REMOTE: @@ -53,6 +58,11 @@ def spark(self): else: builder = builder.master("local[*]") + # Allows the Sedona .jar to be explicitly set by the caller (e.g, to run + # pytest against a freshly-built development version of Sedona) + if EXTRA_JARS: + builder.config("spark.jars", EXTRA_JARS) + spark = SedonaContext.create(builder.getOrCreate()) if not SPARK_REMOTE: diff --git a/python/tests/utils/test_geoarrow.py b/python/tests/utils/test_geoarrow.py new file mode 100644 index 0000000000..e84dea84f4 --- /dev/null +++ b/python/tests/utils/test_geoarrow.py @@ -0,0 +1,282 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import json + +import pyarrow as pa +import pyproj +import pytest +from tests.test_base import TestBase +from pyspark.sql.types import StringType, StructType +from sedona.utils.geoarrow import ( + dataframe_to_arrow, + unique_srid_from_ewkb, + wrap_geoarrow_field, + wrap_geoarrow_extension, +) + + +class TestGeoArrow(TestBase): + def test_to_geoarrow_no_geometry(self): + schema = StructType().add("wkt", StringType()) + wkt_df = TestGeoArrow.spark.createDataFrame(zip(TEST_WKT), schema) + wkt_table = dataframe_to_arrow(wkt_df) + assert wkt_table == pa.table({"wkt": TEST_WKT}) + + def test_to_geoarrow_with_geometry(self): + schema = StructType().add("wkt", StringType()) + wkt_df = TestGeoArrow.spark.createDataFrame(zip(TEST_WKT), schema) + geo_df = wkt_df.selectExpr("wkt", "ST_GeomFromText(wkt) AS geom") + + geo_table = dataframe_to_arrow(geo_df) + assert geo_table.column_names == ["wkt", "geom"] + + geom = geo_table["geom"] + if isinstance(geom.type, pa.ExtensionType): + assert geom.type.extension_name == "geoarrow.wkb" + assert geom.type.crs is None + else: + field = geo_table.field("geom") + assert field.metadata is not None + assert b"ARROW:extension:name" in field.metadata + assert field.metadata[b"ARROW:extension:name"] == b"geoarrow.wkb" + assert field.metadata[b"ARROW:extension:metadata"] == b"{}" + + def test_to_geoarrow_with_geometry_with_srid(self): + schema = StructType().add("wkt", StringType()) + wkt_df = TestGeoArrow.spark.createDataFrame(zip(TEST_WKT), schema) + geo_df = wkt_df.selectExpr("ST_SetSRID(ST_GeomFromText(wkt), 4326) AS geom") + + geo_table = dataframe_to_arrow(geo_df) + geom = geo_table["geom"] + if isinstance(geom.type, pa.ExtensionType): + assert geom.type.extension_name == "geoarrow.wkb" + # CRS handling in geoarrow-types was updated in 0.2, but this should work for both + assert "EPSG:4326" in repr(geom.type.crs) + else: + field = geo_table.field("geom") + assert field.metadata is not None + assert b"ARROW:extension:name" in field.metadata + assert 
field.metadata[b"ARROW:extension:name"] == b"geoarrow.wkb" + + metadata = json.loads(field.metadata[b"ARROW:extension:metadata"]) + assert "crs" in metadata + assert "id" in metadata["crs"] + assert metadata["crs"]["id"] == {"authority": "EPSG", "code": 4326} + + def test_wrap_field(self): + col_empty = pa.array([], pa.binary()) + field = pa.field("foofy", col_empty.type) + + # With pyproj object crs override + wrapped = wrap_geoarrow_field(field, col_empty, pyproj.CRS("OGC:CRS84")) + assert wrapped.metadata[b"ARROW:extension:name"] == b"geoarrow.wkb" + assert b"WGS 84 (CRS84)" in wrapped.metadata[b"ARROW:extension:metadata"] + + # With arbitrary string override + wrapped = wrap_geoarrow_field(field, col_empty, "OGC:CRS84") + assert wrapped.metadata[b"ARROW:extension:name"] == b"geoarrow.wkb" + assert b"WGS 84 (CRS84)" in wrapped.metadata[b"ARROW:extension:metadata"] + + # With no output CRS + wrapped = wrap_geoarrow_field(field, col_empty, None) + assert wrapped.metadata[b"ARROW:extension:name"] == b"geoarrow.wkb" + assert wrapped.metadata[b"ARROW:extension:metadata"] == b"{}" + + # With inferred crs + col = pa.array(TEST_WKB["ewkb_srid_little_endian"]) + wrapped = wrap_geoarrow_field(field, col, None) + assert wrapped.metadata[b"ARROW:extension:name"] == b"geoarrow.wkb" + assert b"WGS 84" in wrapped.metadata[b"ARROW:extension:metadata"] + + def test_wrap_extension(self): + gat = pytest.importorskip("geoarrow.types") + + col_empty = pa.array([], pa.binary()) + spec = gat.wkb() + + # With pyproj object crs override + wrapped = wrap_geoarrow_extension(col_empty, spec, pyproj.CRS("OGC:CRS84")) + assert wrapped.type.encoding == gat.Encoding.WKB + assert "WGS 84 (CRS84)" in wrapped.type.crs.to_json() + + # With no output CRS + wrapped = wrap_geoarrow_extension(col_empty, spec, None) + assert wrapped.type.encoding == gat.Encoding.WKB + assert wrapped.type.crs is None + + # With arbitrary string override + wrapped = wrap_geoarrow_extension(col_empty, spec, "OGC:CRS84") + assert wrapped.type.encoding == gat.Encoding.WKB + assert "WGS 84 (CRS84)" in wrapped.type.crs.to_json() + + # With inferred crs + col = pa.array(TEST_WKB["ewkb_srid_little_endian"]) + wrapped = wrap_geoarrow_extension(col, spec, None) + assert wrapped.type.encoding == gat.Encoding.WKB + assert "WGS 84" in wrapped.type.crs.to_json() + + def test_unique_srid(self): + # Zero size should return None + assert unique_srid_from_ewkb(pa.array([], pa.binary())) is None + + # EWKB with no SRID should return None here + assert unique_srid_from_ewkb(pa.array(TEST_WKB["ewkb_little_endian"])) is None + assert unique_srid_from_ewkb(pa.array(TEST_WKB["ewkb_big_endian"])) is None + + # EWKB with SRID + assert ( + unique_srid_from_ewkb(pa.array(TEST_WKB["ewkb_srid_little_endian"])) == 4326 + ) + assert unique_srid_from_ewkb(pa.array(TEST_WKB["ewkb_srid_big_endian"])) == 4326 + + # In the presence of geometries with SRID and without SRID, the geometries + # without SRID are not counted + assert ( + unique_srid_from_ewkb( + pa.array( + TEST_WKB["ewkb_little_endian"] + TEST_WKB["ewkb_srid_little_endian"] + ) + ) + == 4326 + ) + + # If there is more than one SRID present, we return None + # "SRID=1234;POINT (10 20)" + ewkb_other_srid = b"\x01\x01\x00\x00 \xd2\x04\x00\x00\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x004@" + assert ( + unique_srid_from_ewkb( + pa.array(TEST_WKB["ewkb_srid_little_endian"] + [ewkb_other_srid]) + ) + is None + ) + + # Mixed endian (unlikely) should return None + assert ( + unique_srid_from_ewkb( + pa.array( + 
TEST_WKB["ewkb_srid_big_endian"] + + TEST_WKB["ewkb_srid_little_endian"] + ) + ) + is None + ) + + +TEST_WKT = [ + "POINT (10 20)", + "POINT (10 20 30)", + "LINESTRING (10 20, 30 40)", + "LINESTRING (10 20 30, 40 50 60)", + "POLYGON ((10 10, 20 20, 20 10, 10 10))", + "POLYGON ((10 10 10, 20 20 10, 20 10 10, 10 10 10))", + "POLYGON ((0 0, 0 10, 10 10, 10 0, 0 0), (1 1, 1 2, 2 2, 2 1, 1 1))", +] + +# A little tricky to generate EWKB test data because shapely/GEOS +# does not support M values. In R we have two options that can do this +# (sf and wk), so the test data was generated there. We need test data +# with all dimensions because this affects the high bytes used to detect +# the presence of an SRID; we need both endians because this affects the +# decoding of the high bits and the SRID value. +# +# ewkt <- c( +# "POINT (10 20)", +# "POINT Z (10 20 2)", +# "POINT M (10 20 2)", +# "POINT ZM (10 20 2 3)", +# "POINT (10 20)", +# "LINESTRING (10 20, 30 40)", +# "POLYGON ((10 10, 20 20, 20 10, 10 10))", +# "MULTIPOINT ((10 20))", +# "MULTILINESTRING ((10 20, 30 40))", +# "MULTIPOLYGON (((10 10, 20 20, 20 10, 10 10)))", +# "GEOMETRYCOLLECTION (POINT (10 20))" +# ) +# +# ewkt_srid <- paste0("SRID=4326;", ewkt) +# +# ewkb_test <- list( +# ewkb_little_endian = wk::wkt_translate_wkb(ewkt, endian = 1), +# ewkb_big_endian = wk::wkt_translate_wkb(ewkt, endian = 0), +# ewkb_srid_little_endian = wk::wkt_translate_wkb(ewkt_srid, endian = 1), +# ewkb_srid_big_endian = wk::wkt_translate_wkb(ewkt_srid, endian = 0) +# ) +# +# # Generate a version of this we can paste into Python, with a bit of +# # indirection because reticulate converts raw vectors as bytearray +# py <- reticulate::py +# py$ekwb <- ewkb_test +# py_ewkb <- reticulate::py_eval( +# "{k: [bytes(i) for i in v] for k, v in ekwb.items()}", +# convert = FALSE +# ) +# print(py_ewkb) +TEST_WKB = { + "ewkb_little_endian": [ + b"\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x004@", + b"\x01\x01\x00\x00\x80\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x004@\x00\x00\x00\x00\x00\x00\x00@", + b"\x01\x01\x00\x00@\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x004@\x00\x00\x00\x00\x00\x00\x00@", + b"\x01\x01\x00\x00\xc0\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x004@\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x08@", + b"\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x004@", + b"\x01\x02\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x004@\x00\x00\x00\x00\x00\x00>@\x00\x00\x00\x00\x00\x00D@", + b"\x01\x03\x00\x00\x00\x01\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x004@\x00\x00\x00\x00\x00\x004@\x00\x00\x00\x00\x00\x004@\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x00$@", + b"\x01\x04\x00\x00\x00\x01\x00\x00\x00\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x004@", + b"\x01\x05\x00\x00\x00\x01\x00\x00\x00\x01\x02\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x004@\x00\x00\x00\x00\x00\x00>@\x00\x00\x00\x00\x00\x00D@", + b"\x01\x06\x00\x00\x00\x01\x00\x00\x00\x01\x03\x00\x00\x00\x01\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x004@\x00\x00\x00\x00\x00\x004@\x00\x00\x00\x00\x00\x004@\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x00$@", + b"\x01\x07\x00\x00\x00\x01\x00\x00\x00\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x004@", + ], + 
"ewkb_big_endian": [ + b"\x00\x00\x00\x00\x01@$\x00\x00\x00\x00\x00\x00@4\x00\x00\x00\x00\x00\x00", + b"\x00\x80\x00\x00\x01@$\x00\x00\x00\x00\x00\x00@4\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00", + b"\x00@\x00\x00\x01@$\x00\x00\x00\x00\x00\x00@4\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00", + b"\x00\xc0\x00\x00\x01@$\x00\x00\x00\x00\x00\x00@4\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00@\x08\x00\x00\x00\x00\x00\x00", + b"\x00\x00\x00\x00\x01@$\x00\x00\x00\x00\x00\x00@4\x00\x00\x00\x00\x00\x00", + b"\x00\x00\x00\x00\x02\x00\x00\x00\x02@$\x00\x00\x00\x00\x00\x00@4\x00\x00\x00\x00\x00\x00@>\x00\x00\x00\x00\x00\x00@D\x00\x00\x00\x00\x00\x00", + b"\x00\x00\x00\x00\x03\x00\x00\x00\x01\x00\x00\x00\x04@$\x00\x00\x00\x00\x00\x00@$\x00\x00\x00\x00\x00\x00@4\x00\x00\x00\x00\x00\x00@4\x00\x00\x00\x00\x00\x00@4\x00\x00\x00\x00\x00\x00@$\x00\x00\x00\x00\x00\x00@$\x00\x00\x00\x00\x00\x00@$\x00\x00\x00\x00\x00\x00", + b"\x00\x00\x00\x00\x04\x00\x00\x00\x01\x00\x00\x00\x00\x01@$\x00\x00\x00\x00\x00\x00@4\x00\x00\x00\x00\x00\x00", + b"\x00\x00\x00\x00\x05\x00\x00\x00\x01\x00\x00\x00\x00\x02\x00\x00\x00\x02@$\x00\x00\x00\x00\x00\x00@4\x00\x00\x00\x00\x00\x00@>\x00\x00\x00\x00\x00\x00@D\x00\x00\x00\x00\x00\x00", + b"\x00\x00\x00\x00\x06\x00\x00\x00\x01\x00\x00\x00\x00\x03\x00\x00\x00\x01\x00\x00\x00\x04@$\x00\x00\x00\x00\x00\x00@$\x00\x00\x00\x00\x00\x00@4\x00\x00\x00\x00\x00\x00@4\x00\x00\x00\x00\x00\x00@4\x00\x00\x00\x00\x00\x00@$\x00\x00\x00\x00\x00\x00@$\x00\x00\x00\x00\x00\x00@$\x00\x00\x00\x00\x00\x00", + b"\x00\x00\x00\x00\x07\x00\x00\x00\x01\x00\x00\x00\x00\x01@$\x00\x00\x00\x00\x00\x00@4\x00\x00\x00\x00\x00\x00", + ], + "ewkb_srid_little_endian": [ + b"\x01\x01\x00\x00 \xe6\x10\x00\x00\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x004@", + b"\x01\x01\x00\x00\xa0\xe6\x10\x00\x00\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x004@\x00\x00\x00\x00\x00\x00\x00@", + b"\x01\x01\x00\x00`\xe6\x10\x00\x00\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x004@\x00\x00\x00\x00\x00\x00\x00@", + b"\x01\x01\x00\x00\xe0\xe6\x10\x00\x00\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x004@\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x08@", + b"\x01\x01\x00\x00 \xe6\x10\x00\x00\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x004@", + b"\x01\x02\x00\x00 \xe6\x10\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x004@\x00\x00\x00\x00\x00\x00>@\x00\x00\x00\x00\x00\x00D@", + b"\x01\x03\x00\x00 \xe6\x10\x00\x00\x01\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x004@\x00\x00\x00\x00\x00\x004@\x00\x00\x00\x00\x00\x004@\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x00$@", + b"\x01\x04\x00\x00 \xe6\x10\x00\x00\x01\x00\x00\x00\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x004@", + b"\x01\x05\x00\x00 \xe6\x10\x00\x00\x01\x00\x00\x00\x01\x02\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x004@\x00\x00\x00\x00\x00\x00>@\x00\x00\x00\x00\x00\x00D@", + b"\x01\x06\x00\x00 \xe6\x10\x00\x00\x01\x00\x00\x00\x01\x03\x00\x00\x00\x01\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x004@\x00\x00\x00\x00\x00\x004@\x00\x00\x00\x00\x00\x004@\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x00$@", + b"\x01\x07\x00\x00 \xe6\x10\x00\x00\x01\x00\x00\x00\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x004@", + ], + "ewkb_srid_big_endian": [ + b"\x00 
\x00\x00\x01\x00\x00\x10\xe6@$\x00\x00\x00\x00\x00\x00@4\x00\x00\x00\x00\x00\x00", + b"\x00\xa0\x00\x00\x01\x00\x00\x10\xe6@$\x00\x00\x00\x00\x00\x00@4\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00", + b"\x00`\x00\x00\x01\x00\x00\x10\xe6@$\x00\x00\x00\x00\x00\x00@4\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00", + b"\x00\xe0\x00\x00\x01\x00\x00\x10\xe6@$\x00\x00\x00\x00\x00\x00@4\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00@\x08\x00\x00\x00\x00\x00\x00", + b"\x00 \x00\x00\x01\x00\x00\x10\xe6@$\x00\x00\x00\x00\x00\x00@4\x00\x00\x00\x00\x00\x00", + b"\x00 \x00\x00\x02\x00\x00\x10\xe6\x00\x00\x00\x02@$\x00\x00\x00\x00\x00\x00@4\x00\x00\x00\x00\x00\x00@>\x00\x00\x00\x00\x00\x00@D\x00\x00\x00\x00\x00\x00", + b"\x00 \x00\x00\x03\x00\x00\x10\xe6\x00\x00\x00\x01\x00\x00\x00\x04@$\x00\x00\x00\x00\x00\x00@$\x00\x00\x00\x00\x00\x00@4\x00\x00\x00\x00\x00\x00@4\x00\x00\x00\x00\x00\x00@4\x00\x00\x00\x00\x00\x00@$\x00\x00\x00\x00\x00\x00@$\x00\x00\x00\x00\x00\x00@$\x00\x00\x00\x00\x00\x00", + b"\x00 \x00\x00\x04\x00\x00\x10\xe6\x00\x00\x00\x01\x00\x00\x00\x00\x01@$\x00\x00\x00\x00\x00\x00@4\x00\x00\x00\x00\x00\x00", + b"\x00 \x00\x00\x05\x00\x00\x10\xe6\x00\x00\x00\x01\x00\x00\x00\x00\x02\x00\x00\x00\x02@$\x00\x00\x00\x00\x00\x00@4\x00\x00\x00\x00\x00\x00@>\x00\x00\x00\x00\x00\x00@D\x00\x00\x00\x00\x00\x00", + b"\x00 \x00\x00\x06\x00\x00\x10\xe6\x00\x00\x00\x01\x00\x00\x00\x00\x03\x00\x00\x00\x01\x00\x00\x00\x04@$\x00\x00\x00\x00\x00\x00@$\x00\x00\x00\x00\x00\x00@4\x00\x00\x00\x00\x00\x00@4\x00\x00\x00\x00\x00\x00@4\x00\x00\x00\x00\x00\x00@$\x00\x00\x00\x00\x00\x00@$\x00\x00\x00\x00\x00\x00@$\x00\x00\x00\x00\x00\x00", + b"\x00 \x00\x00\x07\x00\x00\x10\xe6\x00\x00\x00\x01\x00\x00\x00\x00\x01@$\x00\x00\x00\x00\x00\x00@4\x00\x00\x00\x00\x00\x00", + ], +} diff --git a/snowflake-tester/pom.xml b/snowflake-tester/pom.xml index 7db5e569b2..ada015c2be 100644 --- a/snowflake-tester/pom.xml +++ b/snowflake-tester/pom.xml @@ -24,7 +24,7 @@ org.apache.sedona sedona-parent - 1.6.1-SNAPSHOT + 1.7.1-SNAPSHOT ../pom.xml @@ -53,7 +53,7 @@ net.snowflake snowflake-jdbc - 3.13.30 + 3.22.0 diff --git a/snowflake-tester/src/test/java/org/apache/sedona/snowflake/snowsql/TestFunctions.java b/snowflake-tester/src/test/java/org/apache/sedona/snowflake/snowsql/TestFunctions.java index ed319f92bc..e4ffafab3f 100644 --- a/snowflake-tester/src/test/java/org/apache/sedona/snowflake/snowsql/TestFunctions.java +++ b/snowflake-tester/src/test/java/org/apache/sedona/snowflake/snowsql/TestFunctions.java @@ -81,6 +81,25 @@ public void test_ST_Affine() { "POINT (2 2)"); } + @Test + public void test_ST_LabelPoint() { + registerUDF("ST_LabelPoint", byte[].class, int.class, double.class); + registerUDF("ST_ReducePrecision", byte[].class, int.class); + verifySqlSingleRes( + "SELECT sedona.ST_AsText(sedona.ST_ReducePrecision(sedona.ST_LabelPoint(sedona.ST_GeomFromText('POLYGON ((-112.637484 33.440546, -112.546852 33.477209, -112.489177 33.550488, -112.41777 33.751684, -111.956371 33.719707, -111.766868 33.616843, -111.775107 33.527595, -111.640533 33.504695, -111.440044 33.463462, -111.415326 33.374055, -111.514197 33.309809, -111.643279 33.222542, -111.893203 33.174278, -111.96461 33.250109, -112.123903 33.261593, -112.252985 33.35341, -112.406784 33.346527, -112.667694 33.316695, -112.637484 33.440546))'), 2, 0.2), 4))", + "POINT (-112.0428 33.4642)"); + registerUDF("ST_LabelPoint", byte[].class, int.class); + registerUDF("ST_ReducePrecision", byte[].class, int.class); + verifySqlSingleRes( + "SELECT 
sedona.ST_AsText(sedona.ST_ReducePrecision(sedona.ST_LabelPoint(sedona.ST_GeomFromText('GEOMETRYCOLLECTION(POLYGON ((-112.840785 33.435962, -112.840785 33.708284, -112.409597 33.708284, -112.409597 33.435962, -112.840785 33.435962)), POLYGON ((-112.309264 33.398167, -112.309264 33.746007, -111.787444 33.746007, -111.787444 33.398167, -112.309264 33.398167)))'), 4), 4))", + "POINT (-112.0484 33.5721)"); + registerUDF("ST_LabelPoint", byte[].class); + registerUDF("ST_ReducePrecision", byte[].class, int.class); + verifySqlSingleRes( + "SELECT sedona.ST_AsText(sedona.ST_ReducePrecision(sedona.ST_LabelPoint(sedona.ST_GeomFromText('POLYGON ((-112.654072 33.114485, -112.313516 33.653431, -111.63515 33.314399, -111.497829 33.874913, -111.692825 33.431378, -112.376684 33.788215, -112.654072 33.114485))')), 4))", + "POINT (-112.0723 33.5391)"); + } + @Test public void test_ST_Angle() { registerUDF("ST_Angle", byte[].class, byte[].class); @@ -788,6 +807,25 @@ public void test_ST_Perimeter() { 2216861.0); } + @Test + public void test_ST_Perimeter2D() { + registerUDF("ST_Perimeter2D", byte[].class); + verifySqlSingleRes( + "SELECT sedona.ST_Perimeter2D(sedona.ST_GeomFromText('POLYGON((0 0, 0 5, 5 5, 5 0, 0 0))'))", + 20.0); + + registerUDF("ST_Perimeter2D", byte[].class, boolean.class); + verifySqlSingleRes( + "SELECT CEIL(sedona.ST_Perimeter2D(sedona.ST_GeomFromText('POLYGON((0 0, 0 5, 5 5, 5 0, 0 0))'), true))", + 2216861.0); + + registerUDF("ST_Perimeter2D", byte[].class, boolean.class, boolean.class); + registerUDF("ST_GeomFromText", String.class, int.class); + verifySqlSingleRes( + "SELECT CEIL(sedona.ST_Perimeter2D(sedona.ST_GeomFromText('POLYGON((0 0, 0 5, 5 5, 5 0, 0 0))', 4326), true, false))", + 2216861.0); + } + @Test public void test_ST_PointOnSurface() { registerUDF("ST_PointOnSurface", byte[].class); diff --git a/snowflake-tester/src/test/java/org/apache/sedona/snowflake/snowsql/TestFunctionsV2.java b/snowflake-tester/src/test/java/org/apache/sedona/snowflake/snowsql/TestFunctionsV2.java index cfcdddcd20..30488f9fae 100644 --- a/snowflake-tester/src/test/java/org/apache/sedona/snowflake/snowsql/TestFunctionsV2.java +++ b/snowflake-tester/src/test/java/org/apache/sedona/snowflake/snowsql/TestFunctionsV2.java @@ -80,6 +80,23 @@ public void test_ST_Affine() { "POINT(2 2)"); } + @Test + public void test_ST_LabelPoint() { + registerUDFV2("ST_LabelPoint", String.class, int.class, double.class); + registerUDFV2("ST_ReducePrecision", String.class, int.class); + verifySqlSingleRes( + "SELECT ST_AsText(sedona.ST_ReducePrecision(sedona.ST_LabelPoint(ST_GeometryFromWKT('POLYGON ((-112.637484 33.440546, -112.546852 33.477209, -112.489177 33.550488, -112.41777 33.751684, -111.956371 33.719707, -111.766868 33.616843, -111.775107 33.527595, -111.640533 33.504695, -111.440044 33.463462, -111.415326 33.374055, -111.514197 33.309809, -111.643279 33.222542, -111.893203 33.174278, -111.96461 33.250109, -112.123903 33.261593, -112.252985 33.35341, -112.406784 33.346527, -112.667694 33.316695, -112.637484 33.440546))'), 2, 0.2), 4))", + "POINT(-112.0428 33.4642)"); + registerUDFV2("ST_LabelPoint", String.class, int.class); + verifySqlSingleRes( + "SELECT ST_AsText(sedona.ST_ReducePrecision(sedona.ST_LabelPoint(ST_GeometryFromWKT('GEOMETRYCOLLECTION(POLYGON ((-112.840785 33.435962, -112.840785 33.708284, -112.409597 33.708284, -112.409597 33.435962, -112.840785 33.435962)), POLYGON ((-112.309264 33.398167, -112.309264 33.746007, -111.787444 33.746007, -111.787444 33.398167, -112.309264 33.398167)))'), 4), 4))", 
+ "POINT(-112.0484 33.5721)"); + registerUDFV2("ST_LabelPoint", String.class); + verifySqlSingleRes( + "SELECT ST_AsText(sedona.ST_ReducePrecision(sedona.ST_LabelPoint(ST_GeometryFromWKT('POLYGON ((-112.654072 33.114485, -112.313516 33.653431, -111.63515 33.314399, -111.497829 33.874913, -111.692825 33.431378, -112.376684 33.788215, -112.654072 33.114485))')), 4))", + "POINT(-112.0723 33.5391)"); + } + @Test public void test_ST_Angle() { registerUDFV2("ST_Angle", String.class, String.class); @@ -731,6 +748,19 @@ public void test_ST_Perimeter() { 2216861.0); } + @Test + public void test_ST_Perimeter2D() { + registerUDFV2("ST_Perimeter2D", String.class); + verifySqlSingleRes( + "SELECT sedona.ST_Perimeter2D(ST_GeomFromText('POLYGON((0 0, 0 5, 5 5, 5 0, 0 0))'))", + 20.0); + + registerUDFV2("ST_Perimeter2D", String.class, boolean.class); + verifySqlSingleRes( + "SELECT CEIL(sedona.ST_Perimeter2D(ST_GeomFromText('POLYGON((0 0, 0 5, 5 5, 5 0, 0 0))'), true))", + 2216861.0); + } + @Test public void test_ST_PointOnSurface() { registerUDFV2("ST_PointOnSurface", String.class); diff --git a/snowflake/src/main/java/org/apache/sedona/snowflake/snowsql/UDFs.java b/snowflake/src/main/java/org/apache/sedona/snowflake/snowsql/UDFs.java index 368cdb3e4b..98fdbb0608 100644 --- a/snowflake/src/main/java/org/apache/sedona/snowflake/snowsql/UDFs.java +++ b/snowflake/src/main/java/org/apache/sedona/snowflake/snowsql/UDFs.java @@ -87,6 +87,23 @@ public static double ST_Angle(byte[] geom1, byte[] geom2) { return Functions.angle(GeometrySerde.deserialize(geom1), GeometrySerde.deserialize(geom2)); } + @UDFAnnotations.ParamMeta(argNames = {"geom"}) + public static byte[] ST_LabelPoint(byte[] geom) { + return GeometrySerde.serialize(Functions.labelPoint(GeometrySerde.deserialize(geom))); + } + + @UDFAnnotations.ParamMeta(argNames = {"geom", "gridResolution"}) + public static byte[] ST_LabelPoint(byte[] geom, int gridResolution) { + return GeometrySerde.serialize( + Functions.labelPoint(GeometrySerde.deserialize(geom), gridResolution)); + } + + @UDFAnnotations.ParamMeta(argNames = {"geom", "gridResolution", "goodnessThreshold"}) + public static byte[] ST_LabelPoint(byte[] geom, int gridResolution, double goodnessThreshold) { + return GeometrySerde.serialize( + Functions.labelPoint(GeometrySerde.deserialize(geom), gridResolution, goodnessThreshold)); + } + @UDFAnnotations.ParamMeta(argNames = {"geom1", "geom2", "geom3"}) public static double ST_Angle(byte[] geom1, byte[] geom2, byte[] geom3) { return Functions.angle( @@ -822,6 +839,21 @@ public static double ST_Perimeter(byte[] geometry, boolean use_spheroid, boolean return Functions.perimeter(GeometrySerde.deserialize(geometry), use_spheroid, lenient); } + @UDFAnnotations.ParamMeta(argNames = {"geometry"}) + public static double ST_Perimeter2D(byte[] geometry) { + return Functions.perimeter(GeometrySerde.deserialize(geometry)); + } + + @UDFAnnotations.ParamMeta(argNames = {"geometry", "use_spheroid"}) + public static double ST_Perimeter2D(byte[] geometry, boolean use_spheroid) { + return Functions.perimeter(GeometrySerde.deserialize(geometry), use_spheroid); + } + + @UDFAnnotations.ParamMeta(argNames = {"geometry", "use_spheroid", "lenient"}) + public static double ST_Perimeter2D(byte[] geometry, boolean use_spheroid, boolean lenient) { + return Functions.perimeter(GeometrySerde.deserialize(geometry), use_spheroid, lenient); + } + @UDFAnnotations.ParamMeta(argNames = {"geometry"}) public static byte[] ST_PointOnSurface(byte[] geometry) { return 
GeometrySerde.serialize(Functions.pointOnSurface(GeometrySerde.deserialize(geometry))); diff --git a/snowflake/src/main/java/org/apache/sedona/snowflake/snowsql/UDFsV2.java b/snowflake/src/main/java/org/apache/sedona/snowflake/snowsql/UDFsV2.java index a8e6e2efd6..2d9b349b18 100644 --- a/snowflake/src/main/java/org/apache/sedona/snowflake/snowsql/UDFsV2.java +++ b/snowflake/src/main/java/org/apache/sedona/snowflake/snowsql/UDFsV2.java @@ -101,6 +101,32 @@ public static String ST_Affine( Functions.affine(GeometrySerde.deserGeoJson(geometry), a, b, d, e, xOff, yOff)); } + @UDFAnnotations.ParamMeta( + argNames = {"geom"}, + argTypes = {"Geometry"}, + returnTypes = "Geometry") + public static String ST_LabelPoint(String geom) { + return GeometrySerde.serGeoJson(Functions.labelPoint(GeometrySerde.deserGeoJson(geom))); + } + + @UDFAnnotations.ParamMeta( + argNames = {"geom", "gridResolution"}, + argTypes = {"Geometry", "int"}, + returnTypes = "Geometry") + public static String ST_LabelPoint(String geom, int gridResolution) { + return GeometrySerde.serGeoJson( + Functions.labelPoint(GeometrySerde.deserGeoJson(geom), gridResolution)); + } + + @UDFAnnotations.ParamMeta( + argNames = {"geom", "gridResolution", "goodnessThreshold"}, + argTypes = {"Geometry", "int", "double"}, + returnTypes = "Geometry") + public static String ST_LabelPoint(String geom, int gridResolution, double goodnessThreshold) { + return GeometrySerde.serGeoJson( + Functions.labelPoint(GeometrySerde.deserGeoJson(geom), gridResolution, goodnessThreshold)); + } + @UDFAnnotations.ParamMeta( argNames = {"geom1", "geom2"}, argTypes = {"Geometry", "Geometry"}) @@ -963,6 +989,30 @@ public static double ST_Perimeter(String geometry, boolean use_spheroid, boolean return Functions.perimeter(GeometrySerde.deserGeoJson(geometry), use_spheroid, lenient); } + @UDFAnnotations.ParamMeta( + argNames = {"geometry"}, + argTypes = {"Geometry"}, + returnTypes = "double") + public static double ST_Perimeter2D(String geometry) { + return Functions.perimeter(GeometrySerde.deserGeoJson(geometry)); + } + + @UDFAnnotations.ParamMeta( + argNames = {"geometry", "use_spheroid"}, + argTypes = {"Geometry", "boolean"}, + returnTypes = "double") + public static double ST_Perimeter2D(String geometry, boolean use_spheroid) { + return Functions.perimeter(GeometrySerde.deserGeoJson(geometry), use_spheroid); + } + + @UDFAnnotations.ParamMeta( + argNames = {"geometry", "use_spheroid", "lenient"}, + argTypes = {"Geometry", "boolean", "boolean"}, + returnTypes = "double") + public static double ST_Perimeter2D(String geometry, boolean use_spheroid, boolean lenient) { + return Functions.perimeter(GeometrySerde.deserGeoJson(geometry), use_spheroid, lenient); + } + @UDFAnnotations.ParamMeta( argNames = {"geometry"}, argTypes = {"Geometry"}, diff --git a/spark/common/pom.xml b/spark/common/pom.xml index 7803a93275..9014c9d7cc 100644 --- a/spark/common/pom.xml +++ b/spark/common/pom.xml @@ -220,6 +220,12 @@ + + org.apache.spark + spark-graphx_${scala.compat.version} + ${spark.version} + test + src/main/java diff --git a/spark/common/src/main/java/org/apache/sedona/core/joinJudgement/KnnJoinIndexJudgement.java b/spark/common/src/main/java/org/apache/sedona/core/joinJudgement/KnnJoinIndexJudgement.java index 1c7fe7a0ae..f5375009ed 100644 --- a/spark/common/src/main/java/org/apache/sedona/core/joinJudgement/KnnJoinIndexJudgement.java +++ b/spark/common/src/main/java/org/apache/sedona/core/joinJudgement/KnnJoinIndexJudgement.java @@ -25,6 +25,7 @@ import 
org.apache.sedona.core.knnJudgement.EuclideanItemDistance; import org.apache.sedona.core.knnJudgement.HaversineItemDistance; import org.apache.sedona.core.knnJudgement.SpheroidDistance; +import org.apache.sedona.core.wrapper.UniqueGeometry; import org.apache.spark.api.java.function.FlatMapFunction2; import org.apache.spark.broadcast.Broadcast; import org.apache.spark.util.LongAccumulator; @@ -46,35 +47,43 @@ public class KnnJoinIndexJudgement extends JudgementBase implements FlatMapFunction2, Iterator, Pair>, Serializable { private final int k; + private final Double searchRadius; private final DistanceMetric distanceMetric; private final boolean includeTies; - private final Broadcast broadcastedTreeIndex; + private final Broadcast broadcastQueryObjects; + private final Broadcast broadcastObjectsTreeIndex; /** * Constructor for the KnnJoinIndexJudgement class. * * @param k the number of nearest neighbors to find + * @param searchRadius * @param distanceMetric the distance metric to use + * @param broadcastQueryObjects the broadcast geometries on queries + * @param broadcastObjectsTreeIndex the broadcast spatial index on objects * @param buildCount accumulator for the number of geometries processed from the build side * @param streamCount accumulator for the number of geometries processed from the stream side * @param resultCount accumulator for the number of join results * @param candidateCount accumulator for the number of candidate matches - * @param broadcastedTreeIndex the broadcasted spatial index */ public KnnJoinIndexJudgement( int k, + Double searchRadius, DistanceMetric distanceMetric, boolean includeTies, - Broadcast broadcastedTreeIndex, + Broadcast broadcastQueryObjects, + Broadcast broadcastObjectsTreeIndex, LongAccumulator buildCount, LongAccumulator streamCount, LongAccumulator resultCount, LongAccumulator candidateCount) { super(null, buildCount, streamCount, resultCount, candidateCount); this.k = k; + this.searchRadius = searchRadius; this.distanceMetric = distanceMetric; this.includeTies = includeTies; - this.broadcastedTreeIndex = broadcastedTreeIndex; + this.broadcastQueryObjects = broadcastQueryObjects; + this.broadcastObjectsTreeIndex = broadcastObjectsTreeIndex; } /** @@ -90,7 +99,7 @@ public KnnJoinIndexJudgement( @Override public Iterator> call(Iterator streamShapes, Iterator treeIndexes) throws Exception { - if (!treeIndexes.hasNext() || !streamShapes.hasNext()) { + if (!treeIndexes.hasNext() || (streamShapes != null && !streamShapes.hasNext())) { buildCount.add(0); streamCount.add(0); resultCount.add(0); @@ -99,10 +108,9 @@ public Iterator> call(Iterator streamShapes, Iterator> call(Iterator streamShapes, Iterator> result = new ArrayList<>(); - ItemDistance itemDistance; - while (streamShapes.hasNext()) { - T streamShape = streamShapes.next(); - streamCount.add(1); - - Object[] localK; - switch (distanceMetric) { - case EUCLIDEAN: - itemDistance = new EuclideanItemDistance(); - break; - case HAVERSINE: - itemDistance = new HaversineItemDistance(); - break; - case SPHEROID: - itemDistance = new SpheroidDistance(); - break; - default: - itemDistance = new GeometryItemDistance(); - break; - } + List queryItems; + if (broadcastQueryObjects != null) { + // get the broadcast spatial index on queries side if available + queryItems = broadcastQueryObjects.getValue(); + for (Object item : queryItems) { + T queryGeom; + if (item instanceof UniqueGeometry) { + queryGeom = (T) ((UniqueGeometry) item).getOriginalGeometry(); + } else { + queryGeom = (T) item; + } + 
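// For each broadcast query geometry below: count it as streamed, probe the
+ // object-side STRtree for its k nearest neighbours, widen the result with
+ // ties when requested, and drop anything outside the optional search radius.
+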
streamCount.add(1); - localK = - strTree.nearestNeighbour(streamShape.getEnvelopeInternal(), streamShape, itemDistance, k); - if (includeTies) { - localK = getUpdatedLocalKWithTies(streamShape, localK, strTree); + Object[] localK = + strTree.nearestNeighbour( + queryGeom.getEnvelopeInternal(), queryGeom, getItemDistance(), k); + if (includeTies) { + localK = getUpdatedLocalKWithTies(queryGeom, localK, strTree); + } + if (searchRadius != null) { + localK = getInSearchRadius(localK, queryGeom); + } + + for (Object obj : localK) { + T candidate = (T) obj; + Pair pair = Pair.of((U) item, candidate); + result.add(pair); + resultCount.add(1); + } } + return result.iterator(); + } else { + while (streamShapes.hasNext()) { + T streamShape = streamShapes.next(); + streamCount.add(1); + + Object[] localK = + strTree.nearestNeighbour( + streamShape.getEnvelopeInternal(), streamShape, getItemDistance(), k); + if (includeTies) { + localK = getUpdatedLocalKWithTies(streamShape, localK, strTree); + } + if (searchRadius != null) { + localK = getInSearchRadius(localK, streamShape); + } - for (Object obj : localK) { - T candidate = (T) obj; - Pair pair = Pair.of((U) streamShape, candidate); - result.add(pair); - resultCount.add(1); + for (Object obj : localK) { + T candidate = (T) obj; + Pair pair = Pair.of((U) streamShape, candidate); + result.add(pair); + resultCount.add(1); + } } + return result.iterator(); } + } - return result.iterator(); + private Object[] getInSearchRadius(Object[] localK, T queryGeom) { + localK = + Arrays.stream(localK) + .filter( + candidate -> { + Geometry candidateGeom = (Geometry) candidate; + return distanceByMetric(queryGeom, candidateGeom, distanceMetric) <= searchRadius; + }) + .toArray(); + return localK; + } + + /** + * This method calculates the distance between two geometries using the specified distance metric. + * + * @param queryGeom the query geometry + * @param candidateGeom the candidate geometry + * @param distanceMetric the distance metric to use + * @return the distance between the two geometries + */ + public static double distanceByMetric( + Geometry queryGeom, Geometry candidateGeom, DistanceMetric distanceMetric) { + switch (distanceMetric) { + case EUCLIDEAN: + EuclideanItemDistance euclideanItemDistance = new EuclideanItemDistance(); + return euclideanItemDistance.distance(queryGeom, candidateGeom); + case HAVERSINE: + HaversineItemDistance haversineItemDistance = new HaversineItemDistance(); + return haversineItemDistance.distance(queryGeom, candidateGeom); + case SPHEROID: + SpheroidDistance spheroidDistance = new SpheroidDistance(); + return spheroidDistance.distance(queryGeom, candidateGeom); + default: + return queryGeom.distance(candidateGeom); + } + } + + private ItemDistance getItemDistance() { + ItemDistance itemDistance; + itemDistance = getItemDistanceByMetric(distanceMetric); + return itemDistance; + } + + /** + * This method returns the ItemDistance object based on the specified distance metric. 
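+ * (EUCLIDEAN maps to EuclideanItemDistance, HAVERSINE to HaversineItemDistance,
+ * SPHEROID to SpheroidDistance; any other value falls back to GeometryItemDistance.)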
+ * + * @param distanceMetric the distance metric to use + * @return the ItemDistance object + */ + public static ItemDistance getItemDistanceByMetric(DistanceMetric distanceMetric) { + ItemDistance itemDistance; + switch (distanceMetric) { + case EUCLIDEAN: + itemDistance = new EuclideanItemDistance(); + break; + case HAVERSINE: + itemDistance = new HaversineItemDistance(); + break; + case SPHEROID: + itemDistance = new SpheroidDistance(); + break; + default: + itemDistance = new GeometryItemDistance(); + break; + } + return itemDistance; } private Object[] getUpdatedLocalKWithTies(T streamShape, Object[] localK, STRtree strTree) { @@ -184,4 +281,18 @@ private Object[] getUpdatedLocalKWithTies(T streamShape, Object[] localK, STRtre } return localK; } + + public static double distance( + U key, T value, DistanceMetric distanceMetric) { + switch (distanceMetric) { + case EUCLIDEAN: + return new EuclideanItemDistance().distance(key, value); + case HAVERSINE: + return new HaversineItemDistance().distance(key, value); + case SPHEROID: + return new SpheroidDistance().distance(key, value); + default: + return new EuclideanItemDistance().distance(key, value); + } + } } diff --git a/spark/common/src/main/java/org/apache/sedona/core/knnJudgement/EuclideanItemDistance.java b/spark/common/src/main/java/org/apache/sedona/core/knnJudgement/EuclideanItemDistance.java index a27bf543b1..1aba8f87f7 100644 --- a/spark/common/src/main/java/org/apache/sedona/core/knnJudgement/EuclideanItemDistance.java +++ b/spark/common/src/main/java/org/apache/sedona/core/knnJudgement/EuclideanItemDistance.java @@ -36,4 +36,12 @@ public double distance(ItemBoundable item1, ItemBoundable item2) { return g1.distance(g2); } } + + public double distance(Geometry geometry1, Geometry geometry2) { + if (geometry1 == geometry2) { + return Double.MAX_VALUE; + } else { + return geometry1.distance(geometry2); + } + } } diff --git a/spark/common/src/main/java/org/apache/sedona/core/knnJudgement/HaversineItemDistance.java b/spark/common/src/main/java/org/apache/sedona/core/knnJudgement/HaversineItemDistance.java index 9ad1bfbee4..b04627074e 100644 --- a/spark/common/src/main/java/org/apache/sedona/core/knnJudgement/HaversineItemDistance.java +++ b/spark/common/src/main/java/org/apache/sedona/core/knnJudgement/HaversineItemDistance.java @@ -37,4 +37,12 @@ public double distance(ItemBoundable item1, ItemBoundable item2) { return Haversine.distance(g1, g2); } } + + public double distance(Geometry geometry1, Geometry geometry2) { + if (geometry1 == geometry2) { + return Double.MAX_VALUE; + } else { + return Haversine.distance(geometry1, geometry2); + } + } } diff --git a/spark/common/src/main/java/org/apache/sedona/core/knnJudgement/SpheroidDistance.java b/spark/common/src/main/java/org/apache/sedona/core/knnJudgement/SpheroidDistance.java index df22d3565e..4ecdbf84c6 100644 --- a/spark/common/src/main/java/org/apache/sedona/core/knnJudgement/SpheroidDistance.java +++ b/spark/common/src/main/java/org/apache/sedona/core/knnJudgement/SpheroidDistance.java @@ -37,4 +37,12 @@ public double distance(ItemBoundable item1, ItemBoundable item2) { return Spheroid.distance(g1, g2); } } + + public double distance(Geometry geometry1, Geometry geometry2) { + if (geometry1 == geometry2) { + return Double.MAX_VALUE; + } else { + return Spheroid.distance(geometry1, geometry2); + } + } } diff --git a/spark/common/src/main/java/org/apache/sedona/core/spatialOperator/JoinQuery.java 
b/spark/common/src/main/java/org/apache/sedona/core/spatialOperator/JoinQuery.java index d20563d279..a5665726e0 100644 --- a/spark/common/src/main/java/org/apache/sedona/core/spatialOperator/JoinQuery.java +++ b/spark/common/src/main/java/org/apache/sedona/core/spatialOperator/JoinQuery.java @@ -18,10 +18,7 @@ */ package org.apache.sedona.core.spatialOperator; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Objects; +import java.util.*; import org.apache.commons.lang3.tuple.Pair; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -35,15 +32,18 @@ import org.apache.sedona.core.spatialPartitioning.SpatialPartitioner; import org.apache.sedona.core.spatialRDD.CircleRDD; import org.apache.sedona.core.spatialRDD.SpatialRDD; +import org.apache.sedona.core.wrapper.UniqueGeometry; import org.apache.spark.SparkContext; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.api.java.function.Function2; import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.broadcast.Broadcast; import org.apache.spark.util.LongAccumulator; import org.locationtech.jts.geom.Geometry; +import org.locationtech.jts.index.SpatialIndex; import org.locationtech.jts.index.strtree.STRtree; import scala.Tuple2; @@ -784,47 +784,82 @@ public static JavaPairRDD knnJoin LongAccumulator resultCount = Metrics.createMetric(sparkContext, "resultCount"); LongAccumulator candidateCount = Metrics.createMetric(sparkContext, "candidateCount"); - final Broadcast broadcastedTreeIndex; - if (broadcastJoin) { - // adjust auto broadcast threshold to avoid building index on large RDDs + final Broadcast broadcastObjectsTreeIndex; + final Broadcast broadcastQueryObjects; + if (broadcastJoin && objectRDD.indexedRawRDD != null && objectRDD.indexedRDD == null) { + // If broadcastJoin is true and a raw index has been built on the object side, + // we broadcast the queryRDD to the objectRDD + List> uniqueQueryObjects = new ArrayList<>(); + for (U queryObject : queryRDD.rawSpatialRDD.collect()) { + // Wrap the query objects in a UniqueGeometry wrapper to account for duplicate queries in + // the join + uniqueQueryObjects.add(new UniqueGeometry<>(queryObject)); + } + broadcastQueryObjects = + JavaSparkContext.fromSparkContext(sparkContext).broadcast(uniqueQueryObjects); + broadcastObjectsTreeIndex = null; + } else if (broadcastJoin && objectRDD.indexedRawRDD == null && objectRDD.indexedRDD == null) { + // If broadcastJoin is true and neither an index nor a raw index has been built on the + // object side, we broadcast the objectRDD to the queryRDD STRtree strTree = objectRDD.coalesceAndBuildRawIndex(IndexType.RTREE); - broadcastedTreeIndex = JavaSparkContext.fromSparkContext(sparkContext).broadcast(strTree); + broadcastObjectsTreeIndex = + JavaSparkContext.fromSparkContext(sparkContext).broadcast(strTree); + broadcastQueryObjects = null; } else { - broadcastedTreeIndex = null; + // A regular join does not need to set any broadcast index + broadcastQueryObjects = null; + broadcastObjectsTreeIndex = null; } // The reason for using objectRDD as the right side is that the partitions are built on the // right side.
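+    // Three execution paths follow: (1) no broadcast -- zip the spatially partitioned
+    // queryRDD with the per-partition indexes of the objectRDD; (2) the object-side
+    // STRtree is broadcast -- each query partition probes the shared index locally;
+    // (3) the query geometries are broadcast -- every object-side index partition
+    // answers all queries and the global top-k is reduced in querySideBroadcastKNNJoin.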
final JavaRDD> joinResult; - if (objectRDD.indexedRDD != null) { + if (broadcastObjectsTreeIndex == null && broadcastQueryObjects == null) { + // no broadcast join final KnnJoinIndexJudgement judgement = new KnnJoinIndexJudgement( joinParams.k, + joinParams.searchRadius, joinParams.distanceMetric, includeTies, - broadcastedTreeIndex, + null, + null, buildCount, streamCount, resultCount, candidateCount); joinResult = queryRDD.spatialPartitionedRDD.zipPartitions(objectRDD.indexedRDD, judgement); - } else if (broadcastedTreeIndex != null) { + } else if (broadcastObjectsTreeIndex != null) { + // broadcast join with objectRDD as broadcast side final KnnJoinIndexJudgement judgement = new KnnJoinIndexJudgement( joinParams.k, + joinParams.searchRadius, joinParams.distanceMetric, includeTies, - broadcastedTreeIndex, + null, + broadcastObjectsTreeIndex, buildCount, streamCount, resultCount, candidateCount); - int numPartitionsObjects = objectRDD.rawSpatialRDD.getNumPartitions(); - joinResult = - queryRDD - .rawSpatialRDD - .repartition(numPartitionsObjects) - .zipPartitions(objectRDD.rawSpatialRDD, judgement); + // won't need inputs from the shapes in the objectRDD + joinResult = queryRDD.rawSpatialRDD.zipPartitions(queryRDD.rawSpatialRDD, judgement); + } else if (broadcastQueryObjects != null) { + // broadcast join with queryRDD as broadcast side + final KnnJoinIndexJudgement judgement = + new KnnJoinIndexJudgement( + joinParams.k, + joinParams.searchRadius, + joinParams.distanceMetric, + includeTies, + broadcastQueryObjects, + null, + buildCount, + streamCount, + resultCount, + candidateCount); + joinResult = querySideBroadcastKNNJoin(objectRDD, joinParams, judgement, includeTies); } else { throw new IllegalArgumentException("No index found on the input RDDs."); } @@ -833,6 +868,123 @@ public static JavaPairRDD knnJoin (PairFunction, U, T>) pair -> new Tuple2<>(pair.getKey(), pair.getValue())); } + /** + * Performs a KNN join where the query side is broadcasted. + * + *
<p>
This function performs a K-Nearest Neighbors (KNN) join operation where the query geometries + * are broadcasted to all partitions of the object geometries. + * + *

The function first maps partitions of the indexed raw RDD to perform the KNN join, then + * groups the results by the query geometry and keeps the top K pairs for each query geometry based + * on the distance. + * + * @param objectRDD The set of geometries (neighbors) to be queried. + * @param joinParams The parameters for the join, including index type, number of neighbors (k), + * and distance metric. + * @param judgement The judgement function used to perform the KNN join. + * @param includeTies Whether to keep additional neighbors that tie with the k-th distance. + * @param <U> The type of the geometries in the queryRDD set. + * @param <T> The type of the geometries in the objectRDD set. + * @return A JavaRDD of pairs where each pair contains a geometry from the queryRDD and a matching + * geometry from the objectRDD. + */ + private static + JavaRDD> querySideBroadcastKNNJoin( + SpatialRDD objectRDD, + JoinParams joinParams, + KnnJoinIndexJudgement judgement, + boolean includeTies) { + final JavaRDD> joinResult; + JavaRDD> joinResultMapped = + objectRDD.indexedRawRDD.mapPartitions( + iterator -> { + List> results = new ArrayList<>(); + if (iterator.hasNext()) { + SpatialIndex spatialIndex = iterator.next(); + // the broadcast join won't need inputs from the query's shape stream + Iterator> callResult = + judgement.call(null, Collections.singletonList(spatialIndex).iterator()); + callResult.forEachRemaining(results::add); + } + return results.iterator(); + }); + // copy to locals to avoid serialization issues with the broadcast variable + int k = joinParams.k; + DistanceMetric distanceMetric = joinParams.distanceMetric; + + // Transform joinResultMapped to keep the top k pairs for each geometry + // (based on a grouping key and distance) + joinResult = + joinResultMapped + .groupBy(pair -> pair.getKey()) // Group by the first geometry + .flatMap( + (FlatMapFunction>>, Pair>) + pair -> { + Iterable> values = pair._2; + + // Extract and sort values by distance + List> sortedPairs = new ArrayList<>(); + for (Pair p : values) { + Pair newPair = + Pair.of( + (U) ((UniqueGeometry) p.getKey()).getOriginalGeometry(), + p.getValue()); + sortedPairs.add(newPair); + } + + // Sort pairs based on the distance function between the two geometries + sortedPairs.sort( + (p1, p2) -> { + double distance1 = + KnnJoinIndexJudgement.distance( + p1.getKey(), p1.getValue(), distanceMetric); + double distance2 = + KnnJoinIndexJudgement.distance( + p2.getKey(), p2.getValue(), distanceMetric); + return Double.compare( + distance1, distance2); // Sort ascending by distance + }); + + if (includeTies) { + // Keep the top k pairs, including ties + List> topPairs = new ArrayList<>(); + double kthDistance = -1; + for (int i = 0; i < sortedPairs.size(); i++) { + if (i < k) { + topPairs.add(sortedPairs.get(i)); + if (i == k - 1) { + kthDistance = + KnnJoinIndexJudgement.distance( + sortedPairs.get(i).getKey(), + sortedPairs.get(i).getValue(), + distanceMetric); + } + } else { + double currentDistance = + KnnJoinIndexJudgement.distance( + sortedPairs.get(i).getKey(), + sortedPairs.get(i).getValue(), + distanceMetric); + if (currentDistance == kthDistance) { + topPairs.add(sortedPairs.get(i)); + } else { + break; + } + } + } + return topPairs.iterator(); + } else { + // Keep the top k pairs without ties + List> topPairs = new ArrayList<>(); + for (int i = 0; i < Math.min(k, sortedPairs.size()); i++) { + topPairs.add(sortedPairs.get(i)); + } + return topPairs.iterator(); + } + }); + + return joinResult; + } + public static final class JoinParams { public final boolean useIndex; public final SpatialPredicate spatialPredicate; diff
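The tie-handling loop above is hard to read in flattened diff form. A minimal sketch of the same selection rule, assuming the pairs are already sorted ascending by distance (names and types are illustrative, not the Sedona API):

```scala
// Keep the k nearest pairs; when includeTies is set, also keep every pair
// whose distance equals the k-th smallest distance.
def topK[A](sortedByDistance: Seq[(A, Double)], k: Int, includeTies: Boolean): Seq[(A, Double)] =
  if (sortedByDistance.size <= k) sortedByDistance
  else if (!includeTies) sortedByDistance.take(k)
  else {
    val kthDistance = sortedByDistance(k - 1)._2
    sortedByDistance.take(k) ++ sortedByDistance.drop(k).takeWhile(_._2 == kthDistance)
  }
```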
--git a/spark/common/src/main/java/org/apache/sedona/core/spatialPartitioning/EqualPartitioning.java b/spark/common/src/main/java/org/apache/sedona/core/spatialPartitioning/EqualPartitioning.java index 94793fa68a..2d1b95eb75 100644 --- a/spark/common/src/main/java/org/apache/sedona/core/spatialPartitioning/EqualPartitioning.java +++ b/spark/common/src/main/java/org/apache/sedona/core/spatialPartitioning/EqualPartitioning.java @@ -37,8 +37,19 @@ public class EqualPartitioning implements Serializable { /** The grids. */ List grids = new ArrayList(); - public EqualPartitioning(List grids) { + /** + * Whether to preserve geometries that are not contained in any grid. If true, such geometries + * are placed into the overflow container; if false, they are discarded. + */ + Boolean preserveUncontainedGeometries; + + public EqualPartitioning(List grids, boolean preserveUncontainedGeometries) { + this.grids = grids; + this.preserveUncontainedGeometries = preserveUncontainedGeometries; + } + + public EqualPartitioning(List grids) { + this(grids, true); } /** * Instantiates a new equal partitioning. @@ -100,12 +111,12 @@ public Iterator> placeObject(Geometry geometry) { if (grid.covers(envelope)) { result.add(new Tuple2(i, geometry)); containFlag = true; - } else if (grid.intersects(envelope) || envelope.covers(grid)) { + } else if (grid.intersects(envelope)) { result.add(new Tuple2<>(i, geometry)); } } - if (!containFlag) { + if (!containFlag && preserveUncontainedGeometries) { result.add(new Tuple2<>(overflowContainerID, geometry)); } @@ -133,7 +144,7 @@ public Set getKeys(Geometry geometry) { } } - if (!containFlag) { + if (!containFlag && preserveUncontainedGeometries) { result.add(overflowContainerID); } return result; diff --git a/spark/common/src/main/java/org/apache/sedona/core/spatialPartitioning/FlatGridPartitioner.java b/spark/common/src/main/java/org/apache/sedona/core/spatialPartitioning/FlatGridPartitioner.java index e962a965ee..a50ce43f09 100644 --- a/spark/common/src/main/java/org/apache/sedona/core/spatialPartitioning/FlatGridPartitioner.java +++ b/spark/common/src/main/java/org/apache/sedona/core/spatialPartitioning/FlatGridPartitioner.java @@ -27,19 +27,39 @@ import org.locationtech.jts.geom.Geometry; import scala.Tuple2; +/** + * The FlatGridPartitioner is used when there is already a set of grids which the data should be + * partitioned into. It iterates through all the grids to find the grids to place a geometry into. + * Unless you have very few objects to place, it may make more sense to use the + * IndexedGridPartitioner. If you do not have a strict requirement to use a specific set of grids, + * it may make more sense to use another partitioner that generates its own grids from a + * space-partitioning tree, e.g. the KDBTreePartitioner or the QuadTreePartitioner.
+ */ public class FlatGridPartitioner extends SpatialPartitioner { - public FlatGridPartitioner(GridType gridType, List grids) { + protected final Boolean preserveUncontainedGeometries; + + public FlatGridPartitioner( + GridType gridType, List grids, Boolean preserveUncontainedGeometries) { super(gridType, grids); + this.preserveUncontainedGeometries = preserveUncontainedGeometries; + } + + public FlatGridPartitioner(GridType gridType, List grids) { + this(gridType, grids, true); + } + + public FlatGridPartitioner(List grids, Boolean preserveUncontainedGeometries) { + this(null, grids, preserveUncontainedGeometries); } // For backwards compatibility (see SpatialRDD.spatialPartitioning(otherGrids)) public FlatGridPartitioner(List grids) { - super(null, grids); + this(null, grids); } @Override public Iterator> placeObject(Geometry spatialObject) throws Exception { - EqualPartitioning partitioning = new EqualPartitioning(grids); + EqualPartitioning partitioning = new EqualPartitioning(grids, preserveUncontainedGeometries); return partitioning.placeObject(spatialObject); } @@ -61,7 +81,7 @@ public DedupParams getDedupParams() { @Override public int numPartitions() { - return grids.size() + 1 /* overflow partition */; + return grids.size() + (preserveUncontainedGeometries ? 1 : 0); } @Override diff --git a/spark/common/src/main/java/org/apache/sedona/core/spatialPartitioning/GenericUniquePartitioner.java b/spark/common/src/main/java/org/apache/sedona/core/spatialPartitioning/GenericUniquePartitioner.java new file mode 100644 index 0000000000..214446d6dd --- /dev/null +++ b/spark/common/src/main/java/org/apache/sedona/core/spatialPartitioning/GenericUniquePartitioner.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.sedona.core.spatialPartitioning; + +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import javax.annotation.Nullable; +import org.apache.sedona.core.enums.GridType; +import org.apache.sedona.core.joinJudgement.DedupParams; +import org.locationtech.jts.geom.Envelope; +import org.locationtech.jts.geom.Geometry; +import scala.Tuple2; + +public class GenericUniquePartitioner extends SpatialPartitioner { + private SpatialPartitioner parent; + + public GenericUniquePartitioner(SpatialPartitioner parent) { + this.parent = parent; + } + + public GridType getGridType() { + return parent.gridType; + } + + public List getGrids() { + return parent.grids; + } + + @Override + public Iterator> placeObject(Geometry spatialObject) throws Exception { + // Rather than take the first result from the parent, consume the entire iterator + // and return the partition with the minimum ID. 
This ensures that given the same + // (parent) partitioner, the output partitions from this method will be consistent. + Iterator> it = parent.placeObject(spatialObject); + int minPartitionId = Integer.MAX_VALUE; + Geometry minGeometry = null; + while (it.hasNext()) { + Tuple2 value = it.next(); + if (value._1() < minPartitionId) { + minPartitionId = value._1(); + minGeometry = value._2(); + } + } + + HashSet> out = new HashSet>(); + if (minGeometry != null) { + out.add(new Tuple2(minPartitionId, minGeometry)); + } + + return out.iterator(); + } + + @Override + @Nullable + public DedupParams getDedupParams() { + throw new UnsupportedOperationException("Unique partitioner cannot deduplicate join results"); + } + + @Override + public int numPartitions() { + return parent.numPartitions(); + } +} diff --git a/spark/common/src/main/java/org/apache/sedona/core/spatialPartitioning/IndexedGridPartitioner.java b/spark/common/src/main/java/org/apache/sedona/core/spatialPartitioning/IndexedGridPartitioner.java new file mode 100644 index 0000000000..ab8d5cde11 --- /dev/null +++ b/spark/common/src/main/java/org/apache/sedona/core/spatialPartitioning/IndexedGridPartitioner.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.sedona.core.spatialPartitioning; + +import java.util.*; +import org.apache.sedona.core.enums.GridType; +import org.locationtech.jts.geom.Envelope; +import org.locationtech.jts.geom.Geometry; +import org.locationtech.jts.index.strtree.STRtree; +import scala.Tuple2; + +/** + * The IndexedGridPartitioner is used when there is already a set of grids which the data should be + * partitioned into. It leverages an STRTree to quickly find the grids to place a geometry into. If + * you have very few objects to place, it may make more sense to use the FlatGridPartitioner. If you + * do not have a strict requirement to use a specific set of grids, it may make more sense to use + * another partitioner that generates its own grids from a space-partitioning tree, e.g. the + * KDBTreePartitioner or the QuadTreePartitioner.
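A compact sketch of the GenericUniquePartitioner rule above: among all candidate partitions proposed by the parent partitioner, keep only the smallest partition id, so a geometry is always assigned the same single partition for a fixed parent (the function name is illustrative):

```scala
import org.locationtech.jts.geom.Geometry

// Collapse a multi-partition assignment into one deterministic assignment.
def uniqueAssignment(candidates: Iterator[(Int, Geometry)]): Iterator[(Int, Geometry)] =
  if (candidates.hasNext) Iterator.single(candidates.minBy(_._1))
  else Iterator.empty
```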
+ */ +public class IndexedGridPartitioner extends FlatGridPartitioner { + private final STRtree index; + + public IndexedGridPartitioner( + GridType gridType, List grids, Boolean preserveUncontainedGeometries) { + super(gridType, grids, preserveUncontainedGeometries); + this.index = new STRtree(); + for (int i = 0; i < grids.size(); i++) { + final Envelope grid = grids.get(i); + index.insert(grid, i); + } + index.build(); + } + + public IndexedGridPartitioner(GridType gridType, List grids) { + this(gridType, grids, true); + } + + public IndexedGridPartitioner(List grids, Boolean preserveUncontainedGeometries) { + this(null, grids, preserveUncontainedGeometries); + } + + public IndexedGridPartitioner(List grids) { + this(null, grids); + } + + public STRtree getIndex() { + return index; + } + + @Override + public Iterator> placeObject(Geometry spatialObject) throws Exception { + List results = index.query(spatialObject.getEnvelopeInternal()); + if (preserveUncontainedGeometries) { + // borrowed from EqualPartitioning.placeObject + final int overflowContainerID = grids.size(); + final Envelope envelope = spatialObject.getEnvelopeInternal(); + + Set> result = new HashSet(); + boolean containFlag = false; + for (Object queryResult : results) { + Integer i = (Integer) queryResult; + final Envelope grid = grids.get(i); + if (grid.covers(envelope)) { + result.add(new Tuple2(i, spatialObject)); + containFlag = true; + } else if (grid.intersects(envelope)) { + result.add(new Tuple2<>(i, spatialObject)); + } + } + + if (!containFlag) { + result.add(new Tuple2<>(overflowContainerID, spatialObject)); + } + + return result.iterator(); + } else { + return results.stream().map(i -> new Tuple2(i, spatialObject)).iterator(); + } + } +} diff --git a/spark/common/src/main/java/org/apache/sedona/core/spatialPartitioning/KDB.java b/spark/common/src/main/java/org/apache/sedona/core/spatialPartitioning/KDB.java index be3ba72cc3..4bd615383a 100644 --- a/spark/common/src/main/java/org/apache/sedona/core/spatialPartitioning/KDB.java +++ b/spark/common/src/main/java/org/apache/sedona/core/spatialPartitioning/KDB.java @@ -33,7 +33,10 @@ import org.locationtech.jts.geom.Point; import scala.Tuple2; -/** see https://en.wikipedia.org/wiki/K-D-B-tree */ +/** + * see https://en.wikipedia.org/wiki/K-D-B-tree + */ public class KDB extends PartitioningUtils implements Serializable { private final int maxItemsPerNode; diff --git a/spark/common/src/main/java/org/apache/sedona/core/spatialPartitioning/SpatialPartitioner.java b/spark/common/src/main/java/org/apache/sedona/core/spatialPartitioning/SpatialPartitioner.java index c7deb3b704..96594c5c10 100644 --- a/spark/common/src/main/java/org/apache/sedona/core/spatialPartitioning/SpatialPartitioner.java +++ b/spark/common/src/main/java/org/apache/sedona/core/spatialPartitioning/SpatialPartitioner.java @@ -35,6 +35,11 @@ public abstract class SpatialPartitioner extends Partitioner implements Serializ protected final GridType gridType; protected final List grids; + protected SpatialPartitioner() { + gridType = null; + grids = null; + } + protected SpatialPartitioner(GridType gridType, List grids) { this.gridType = gridType; this.grids = Objects.requireNonNull(grids, "grids"); diff --git a/spark/common/src/main/java/org/apache/sedona/core/spatialPartitioning/quadtree/ExtendedQuadTree.java b/spark/common/src/main/java/org/apache/sedona/core/spatialPartitioning/quadtree/ExtendedQuadTree.java index 73169363ee..9925e93f8a 100644 --- 
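A hypothetical usage sketch of the two grid partitioners above, showing how preserveUncontainedGeometries changes the partition count (the grids are invented for illustration):

```scala
import org.apache.sedona.core.spatialPartitioning.{FlatGridPartitioner, IndexedGridPartitioner}
import org.locationtech.jts.geom.Envelope
import scala.collection.JavaConverters._

val grids = List(new Envelope(0, 1, 0, 1), new Envelope(1, 2, 0, 1)).asJava

// Default behavior keeps an extra overflow partition for uncontained geometries.
val withOverflow = new IndexedGridPartitioner(grids, true)
assert(withOverflow.numPartitions == 3)

// Dropping uncontained geometries removes the overflow partition.
val withoutOverflow = new FlatGridPartitioner(grids, false)
assert(withoutOverflow.numPartitions == 2)
```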
a/spark/common/src/main/java/org/apache/sedona/core/spatialPartitioning/quadtree/ExtendedQuadTree.java +++ b/spark/common/src/main/java/org/apache/sedona/core/spatialPartitioning/quadtree/ExtendedQuadTree.java @@ -146,8 +146,11 @@ public Iterator> placeObject(Geometry geometry) { final Set> result = new HashSet<>(); for (QuadRectangle rectangle : matchedPartitions) { + // Ignore null or empty point + if (point == null || point.isEmpty()) break; + // For points, make sure to return only one partition - if (point != null && !(new HalfOpenRectangle(rectangle.getEnvelope())).contains(point)) { + if (!(new HalfOpenRectangle(rectangle.getEnvelope())).contains(point)) { continue; } diff --git a/spark/common/src/main/java/org/apache/sedona/core/spatialRDD/SpatialRDD.java b/spark/common/src/main/java/org/apache/sedona/core/spatialRDD/SpatialRDD.java index d81b916183..b8b46ae35e 100644 --- a/spark/common/src/main/java/org/apache/sedona/core/spatialRDD/SpatialRDD.java +++ b/spark/common/src/main/java/org/apache/sedona/core/spatialRDD/SpatialRDD.java @@ -42,6 +42,7 @@ import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.function.Function2; import org.apache.spark.api.java.function.PairFlatMapFunction; +import org.apache.spark.sql.types.StructType; import org.apache.spark.storage.StorageLevel; import org.apache.spark.util.random.SamplingUtils; import org.locationtech.jts.geom.Coordinate; @@ -85,6 +86,8 @@ public class SpatialRDD implements Serializable { public JavaRDD rawSpatialRDD; public List fieldNames; + + public StructType schema; /** The CR stransformation. */ protected boolean CRStransformation = false; /** The source epsg code. */ @@ -159,11 +162,71 @@ public boolean spatialPartitioning(GridType gridType) throws Exception { return true; } + public boolean spatialPartitioningWithoutDuplicates(GridType gridType) throws Exception { + int numPartitions = this.rawSpatialRDD.rdd().partitions().length; + spatialPartitioningWithoutDuplicates(gridType, numPartitions); + return true; + } + + /** + * Calculate non-duplicate inducing partitioning + * + *

Note that non-duplicating partitioners are intended for use by distributed partitioned + * writers and cannot be used for spatial joins. + * + * @param gridType The target GridType + * @param numPartitions The target number of partitions + * @throws Exception + */ + public void spatialPartitioningWithoutDuplicates(GridType gridType, int numPartitions) + throws Exception { + calc_partitioner(gridType, numPartitions); + partitioner = new GenericUniquePartitioner(partitioner); + this.spatialPartitionedRDD = partition(partitioner); + } + + /** + * Calculate non-duplicate inducing partitioning from an existing SpatialPartitioner + * + *

Note that non-duplicating partitioners are intended for use by distributed partitioned + * writers and cannot be used for spatial joins. + * + * @param partitioner An existing partitioner obtained from the partitioning of another + * SpatialRDD. + * @throws Exception + */ + public void spatialPartitioningWithoutDuplicates(SpatialPartitioner partitioner) { + this.partitioner = new GenericUniquePartitioner(partitioner); + this.spatialPartitionedRDD = partition(this.partitioner); + } + + /** + * Calculate non-duplicate inducing partitioning based on a list of existing envelopes + * + *

This is shorthand for spatialPartitioningWithoutDuplicates(new IndexedGridPartitioner(otherGrids)). + * Using spatialPartitioningWithoutDuplicates(gridType, numPartitions) is typically more + * appropriate because it adapts to the distribution of the data and produces more consistently + * balanced partitions. + * + *

Note that non-duplicating partitioners are intended for use by distributed partitioned + * writers and cannot be used for spatial joins. + * + * @param otherGrids A list of existing envelopes + * @return true on success + * @throws Exception + */ + public boolean spatialPartitioningWithoutDuplicates(final List otherGrids) + throws Exception { + this.partitioner = new GenericUniquePartitioner(new IndexedGridPartitioner(otherGrids)); + this.spatialPartitionedRDD = partition(partitioner); + return true; + } + /** * Spatial partitioning. * * @param gridType the grid type - * @return true, if successful + * @param numPartitions the target number of partitions * @throws Exception the exception */ public void calc_partitioner(GridType gridType, int numPartitions) throws Exception { @@ -278,7 +341,7 @@ public void spatialPartitioning(SpatialPartitioner partitioner) { /** @deprecated Use spatialPartitioning(SpatialPartitioner partitioner) */ public boolean spatialPartitioning(final List otherGrids) throws Exception { - this.partitioner = new FlatGridPartitioner(otherGrids); + this.partitioner = new IndexedGridPartitioner(otherGrids); this.spatialPartitionedRDD = partition(partitioner); return true; } diff --git a/spark/common/src/main/java/org/apache/sedona/core/utils/SedonaConf.java b/spark/common/src/main/java/org/apache/sedona/core/utils/SedonaConf.java index 28685c6a03..d02e96df93 100644 --- a/spark/common/src/main/java/org/apache/sedona/core/utils/SedonaConf.java +++ b/spark/common/src/main/java/org/apache/sedona/core/utils/SedonaConf.java @@ -59,6 +59,9 @@ public class SedonaConf implements Serializable { // Parameters for knn joins private boolean includeTieBreakersInKNNJoins = false; + // Parameters for geostats + private Boolean DBSCANIncludeOutliers = true; + public static SedonaConf fromActiveSession() { return new SedonaConf(SparkSession.active().conf()); } @@ -98,6 +101,10 @@ public SedonaConf(RuntimeConfig runtimeConfig) { // Parameters for knn joins this.includeTieBreakersInKNNJoins = Boolean.parseBoolean(getConfigValue(runtimeConfig, "join.knn.includeTieBreakers", "false")); + + // Parameters for geostats + this.DBSCANIncludeOutliers = + Boolean.parseBoolean(runtimeConfig.get("spark.sedona.dbscan.includeOutliers", "true")); } // Helper method to prioritize `sedona.*` over `spark.sedona.*` @@ -182,4 +189,8 @@ static long bytesFromString(String str) { public SpatialJoinOptimizationMode getSpatialJoinOptimizationMode() { return spatialJoinOptimizationMode; } + + public Boolean getDBSCANIncludeOutliers() { + return DBSCANIncludeOutliers; + } } diff --git a/spark/common/src/main/java/org/apache/sedona/core/wrapper/UniqueGeometry.java b/spark/common/src/main/java/org/apache/sedona/core/wrapper/UniqueGeometry.java new file mode 100644 index 0000000000..01f20f2fa6 --- /dev/null +++ b/spark/common/src/main/java/org/apache/sedona/core/wrapper/UniqueGeometry.java @@ -0,0 +1,168 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License.
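A short usage sketch of the non-duplicating partitioning added above (grid type and partition count are arbitrary choices for illustration):

```scala
import org.apache.sedona.core.enums.GridType
import org.apache.sedona.core.spatialRDD.SpatialRDD
import org.locationtech.jts.geom.Geometry

// Partition for a distributed partitioned writer: every geometry lands in
// exactly one partition, so the result must not be fed into a spatial join.
def partitionForWriter(rdd: SpatialRDD[Geometry]): Unit =
  rdd.spatialPartitioningWithoutDuplicates(GridType.KDBTREE, 16)
```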
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.sedona.core.wrapper; + +import java.util.UUID; +import org.apache.commons.lang3.NotImplementedException; +import org.locationtech.jts.geom.*; + +public class UniqueGeometry extends Geometry { + private final T originalGeometry; + private final String uniqueId; + + public UniqueGeometry(T originalGeometry) { + super(new GeometryFactory()); + this.originalGeometry = originalGeometry; + this.uniqueId = UUID.randomUUID().toString(); + } + + public T getOriginalGeometry() { + return originalGeometry; + } + + public String getUniqueId() { + return uniqueId; + } + + @Override + public int hashCode() { + return uniqueId.hashCode(); // Uniqueness ensured by uniqueId + } + + @Override + public String getGeometryType() { + throw new NotImplementedException("getGeometryType is not implemented."); + } + + @Override + public Coordinate getCoordinate() { + throw new NotImplementedException("getCoordinate is not implemented."); + } + + @Override + public Coordinate[] getCoordinates() { + throw new NotImplementedException("getCoordinates is not implemented."); + } + + @Override + public int getNumPoints() { + throw new NotImplementedException("getNumPoints is not implemented."); + } + + @Override + public boolean isEmpty() { + throw new NotImplementedException("isEmpty is not implemented."); + } + + @Override + public int getDimension() { + throw new NotImplementedException("getDimension is not implemented."); + } + + @Override + public Geometry getBoundary() { + throw new NotImplementedException("getBoundary is not implemented."); + } + + @Override + public int getBoundaryDimension() { + throw new NotImplementedException("getBoundaryDimension is not implemented."); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) return true; + if (obj == null || getClass() != obj.getClass()) return false; + UniqueGeometry that = (UniqueGeometry) obj; + return uniqueId.equals(that.uniqueId); + } + + @Override + public String toString() { + return "UniqueGeometry{" + + "originalGeometry=" + + originalGeometry + + ", uniqueId='" + + uniqueId + + '\'' + + '}'; + } + + @Override + protected Geometry reverseInternal() { + throw new NotImplementedException("reverseInternal is not implemented."); + } + + @Override + public boolean equalsExact(Geometry geometry, double v) { + throw new NotImplementedException("equalsExact is not implemented."); + } + + @Override + public void apply(CoordinateFilter coordinateFilter) { + throw new NotImplementedException("apply(CoordinateFilter) is not implemented."); + } + + @Override + public void apply(CoordinateSequenceFilter coordinateSequenceFilter) { + throw new NotImplementedException("apply(CoordinateSequenceFilter) is not implemented."); + } + + @Override + public void apply(GeometryFilter geometryFilter) { + throw new NotImplementedException("apply(GeometryFilter) is not implemented."); + } + + @Override + public void apply(GeometryComponentFilter geometryComponentFilter) { + throw new NotImplementedException("apply(GeometryComponentFilter) is not implemented."); + } + + @Override + protected Geometry copyInternal() { + 
throw new NotImplementedException("copyInternal is not implemented."); + } + + @Override + public void normalize() { + throw new NotImplementedException("normalize is not implemented."); + } + + @Override + protected Envelope computeEnvelopeInternal() { + throw new NotImplementedException("computeEnvelopeInternal is not implemented."); + } + + @Override + protected int compareToSameClass(Object o) { + throw new NotImplementedException("compareToSameClass(Object) is not implemented."); + } + + @Override + protected int compareToSameClass( + Object o, CoordinateSequenceComparator coordinateSequenceComparator) { + throw new NotImplementedException( + "compareToSameClass(Object, CoordinateSequenceComparator) is not implemented."); + } + + @Override + protected int getTypeCode() { + throw new NotImplementedException("getTypeCode is not implemented."); + } +} diff --git a/spark/common/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister b/spark/common/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister index b664d1db50..162e44369f 100644 --- a/spark/common/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister +++ b/spark/common/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister @@ -1,3 +1,4 @@ org.apache.spark.sql.sedona_sql.io.raster.RasterFileFormat org.apache.spark.sql.sedona_sql.io.geojson.GeoJSONFileFormat org.apache.sedona.sql.datasources.spider.SpiderDataSource +org.apache.spark.sql.sedona_sql.io.stac.StacDataSource diff --git a/spark/common/src/main/scala/org/apache/sedona/spark/SedonaContext.scala b/spark/common/src/main/scala/org/apache/sedona/spark/SedonaContext.scala index 4c8fcab692..fe2926fc51 100644 --- a/spark/common/src/main/scala/org/apache/sedona/spark/SedonaContext.scala +++ b/spark/common/src/main/scala/org/apache/sedona/spark/SedonaContext.scala @@ -20,11 +20,12 @@ package org.apache.sedona.spark import org.apache.sedona.common.utils.TelemetryCollector import org.apache.sedona.core.serde.SedonaKryoRegistrator -import org.apache.sedona.sql.{ParserRegistrator, RasterRegistrator} +import org.apache.sedona.sql.RasterRegistrator import org.apache.sedona.sql.UDF.UdfRegistrator import org.apache.sedona.sql.UDT.UdtRegistrator import org.apache.spark.serializer.KryoSerializer -import org.apache.spark.sql.sedona_sql.optimization.SpatialFilterPushDownForGeoParquet +import org.apache.spark.sql.sedona_sql.optimization.{ExtractGeoStatsFunctions, SpatialFilterPushDownForGeoParquet, SpatialTemporalFilterPushDownForStacScan} +import org.apache.spark.sql.sedona_sql.strategy.geostats.EvalGeoStatsFunctionStrategy import org.apache.spark.sql.sedona_sql.strategy.join.JoinQueryDetector import org.apache.spark.sql.{SQLContext, SparkSession} @@ -36,6 +37,12 @@ class InternalApi( extends StaticAnnotation object SedonaContext { + + private def customOptimizationsWithSession(sparkSession: SparkSession) = + Seq( + new SpatialFilterPushDownForGeoParquet(sparkSession), + new SpatialTemporalFilterPushDownForStacScan(sparkSession)) + def create(sqlContext: SQLContext): SQLContext = { create(sqlContext.sparkSession) sqlContext @@ -56,18 +63,30 @@ object SedonaContext { if (!sparkSession.experimental.extraStrategies.exists(_.isInstanceOf[JoinQueryDetector])) { sparkSession.experimental.extraStrategies ++= Seq(new JoinQueryDetector(sparkSession)) } - if (!sparkSession.experimental.extraOptimizations.exists( - _.isInstanceOf[SpatialFilterPushDownForGeoParquet])) { - 
sparkSession.experimental.extraOptimizations ++= Seq( - new SpatialFilterPushDownForGeoParquet(sparkSession)) + + customOptimizationsWithSession(sparkSession).foreach { opt => + if (!sparkSession.experimental.extraOptimizations.exists { + case _: opt.type => true + case _ => false + }) { + sparkSession.experimental.extraOptimizations ++= Seq(opt) + } + } + + // Support geostats functions + if (!sparkSession.experimental.extraOptimizations.contains(ExtractGeoStatsFunctions)) { + sparkSession.experimental.extraOptimizations ++= Seq(ExtractGeoStatsFunctions) + } + if (!sparkSession.experimental.extraStrategies.exists( + _.isInstanceOf[EvalGeoStatsFunctionStrategy])) { + sparkSession.experimental.extraStrategies ++= Seq( + new EvalGeoStatsFunctionStrategy(sparkSession)) } + addGeoParquetToSupportNestedFilterSources(sparkSession) RasterRegistrator.registerAll(sparkSession) UdtRegistrator.registerAll() UdfRegistrator.registerAll(sparkSession) - if (sparkSession.conf.get("spark.sedona.enableParserExtension", "true").toBoolean) { - ParserRegistrator.register(sparkSession) - } sparkSession } diff --git a/spark/common/src/main/scala/org/apache/sedona/sql/ParserRegistrator.scala b/spark/common/src/main/scala/org/apache/sedona/sql/ParserRegistrator.scala deleted file mode 100644 index db3c623a09..0000000000 --- a/spark/common/src/main/scala/org/apache/sedona/sql/ParserRegistrator.scala +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
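The registration block above only appends a rule when an equivalent one is not already present, so repeated SedonaContext.create calls stay idempotent. A generic sketch of that guard pattern (not the exact Sedona check, which matches on the rule's singleton type):

```scala
import scala.collection.mutable.ArrayBuffer

// Append a rule only if no rule of the same runtime class is registered yet.
def registerOnce[T <: AnyRef](registered: ArrayBuffer[T], rule: T): Unit =
  if (!registered.exists(_.getClass == rule.getClass)) registered += rule
```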
- */ -package org.apache.sedona.sql - -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.parser.ParserInterface -import org.apache.spark.sql.parser.ParserFactory - -object ParserRegistrator { - - /** - * Register the custom Sedona Spark parser - * @param sparkSession - */ - def register(sparkSession: SparkSession): Unit = { - // try to register the parser with the new constructor for spark 3.1 and above - try { - val parserClassName = "org.apache.sedona.sql.parser.SedonaSqlParser" - val delegate: ParserInterface = sparkSession.sessionState.sqlParser - - val parser = ParserFactory.getParser(parserClassName, delegate) - val field = sparkSession.sessionState.getClass.getDeclaredField("sqlParser") - field.setAccessible(true) - field.set(sparkSession.sessionState, parser) - return // return if the new constructor is available - } catch { - case _: Exception => - } - - // try to register the parser with the legacy constructor for spark 3.0 - try { - val parserClassName = "org.apache.sedona.sql.parser.SedonaSqlParser" - val delegate: ParserInterface = sparkSession.sessionState.sqlParser - - val parser = - ParserFactory.getParser(parserClassName, sparkSession.sessionState.conf, delegate) - val field = sparkSession.sessionState.getClass.getDeclaredField("sqlParser") - field.setAccessible(true) - field.set(sparkSession.sessionState, parser) - } catch { - case _: Exception => - } - } -} diff --git a/spark/common/src/main/scala/org/apache/sedona/sql/SedonaSqlExtensions.scala b/spark/common/src/main/scala/org/apache/sedona/sql/SedonaSqlExtensions.scala index be0774ac90..fbc3567192 100644 --- a/spark/common/src/main/scala/org/apache/sedona/sql/SedonaSqlExtensions.scala +++ b/spark/common/src/main/scala/org/apache/sedona/sql/SedonaSqlExtensions.scala @@ -19,13 +19,24 @@ package org.apache.sedona.sql import org.apache.sedona.spark.SedonaContext +import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSessionExtensions +import org.apache.spark.sql.parser.ParserFactory class SedonaSqlExtensions extends (SparkSessionExtensions => Unit) { + private lazy val enableParser = + SparkContext.getOrCreate().getConf.get("spark.sedona.enableParserExtension", "true").toBoolean + def apply(e: SparkSessionExtensions): Unit = { e.injectCheckRule(spark => { SedonaContext.create(spark) _ => () }) + + if (enableParser) { + e.injectParser { case (_, parser) => + ParserFactory.getParser("org.apache.sedona.sql.parser.SedonaSqlParser", parser) + } + } } } diff --git a/spark/common/src/main/scala/org/apache/sedona/sql/UDF/Catalog.scala b/spark/common/src/main/scala/org/apache/sedona/sql/UDF/Catalog.scala index b491375379..0bffa54baf 100644 --- a/spark/common/src/main/scala/org/apache/sedona/sql/UDF/Catalog.scala +++ b/spark/common/src/main/scala/org/apache/sedona/sql/UDF/Catalog.scala @@ -38,6 +38,7 @@ object Catalog { val expressions: Seq[FunctionDescription] = Seq( // Expression for vectors function[GeometryType](), + function[ST_LabelPoint](), function[ST_PointFromText](), function[ST_PointFromWKB](), function[ST_LineFromWKB](), @@ -56,6 +57,7 @@ object Catalog { function[ST_GeomFromKML](), function[ST_CoordDim](), function[ST_Perimeter](), + function[ST_Perimeter2D](), function[ST_Point](), function[ST_Points](), function[ST_MakeEnvelope](), @@ -168,6 +170,7 @@ object Catalog { function[ST_IsPolygonCCW](), function[ST_ForcePolygonCCW](), function[ST_FlipCoordinates](), + function[ST_LineSegments](), function[ST_LineSubstring](), function[ST_LineInterpolatePoint](), 
function[ST_LineLocatePoint](), @@ -338,7 +341,13 @@ object Catalog { function[RS_Resample](), function[RS_ReprojectMatch]("nearestneighbor"), function[RS_FromNetCDF](), - function[RS_NetCDFInfo]()) + function[RS_NetCDFInfo](), + // geostats functions + function[ST_DBSCAN](), + function[ST_LocalOutlierFactor](), + function[ST_GLocal](), + function[ST_BinaryDistanceBandColumn](), + function[ST_WeightedDistanceBandColumn]()) // Aggregate functions with Geometry as buffer val aggregateExpressions: Seq[Aggregator[Geometry, Geometry, Geometry]] = diff --git a/spark/common/src/main/scala/org/apache/sedona/sql/utils/Adapter.scala b/spark/common/src/main/scala/org/apache/sedona/sql/utils/Adapter.scala index 9b1067a25a..96aab1287e 100644 --- a/spark/common/src/main/scala/org/apache/sedona/sql/utils/Adapter.scala +++ b/spark/common/src/main/scala/org/apache/sedona/sql/utils/Adapter.scala @@ -29,6 +29,13 @@ import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.locationtech.jts.geom.Geometry +/** + * Adapter for converting between DataFrame and SpatialRDD. It provides methods to convert + * DataFrame to SpatialRDD and vice versa. The schema information is lost during conversion. It is + * different from [[org.apache.spark.sql.sedona_sql.adapters.StructuredAdapter]] which does not + * lose the schema information during conversion. This should be used if your data starts as a + * SpatialRDD and you want to convert it to DataFrame. + */ object Adapter { /** diff --git a/spark/common/src/main/scala/org/apache/sedona/sql/utils/GeometrySerializer.scala b/spark/common/src/main/scala/org/apache/sedona/sql/utils/GeometrySerializer.scala index a13c181aa2..a75a88f7ba 100644 --- a/spark/common/src/main/scala/org/apache/sedona/sql/utils/GeometrySerializer.scala +++ b/spark/common/src/main/scala/org/apache/sedona/sql/utils/GeometrySerializer.scala @@ -19,7 +19,7 @@ package org.apache.sedona.sql.utils import org.apache.sedona.common.geometrySerde -import org.locationtech.jts.geom.Geometry +import org.locationtech.jts.geom.{Geometry, GeometryFactory} /** * SerDe using the WKB reader and writer objects @@ -47,6 +47,9 @@ object GeometrySerializer { * JTS geometry */ def deserialize(value: Array[Byte]): Geometry = { + if (value == null) { + return new GeometryFactory().createGeometryCollection() + } geometrySerde.GeometrySerializer.deserialize(value) } } diff --git a/spark/common/src/main/scala/org/apache/sedona/stats/Weighting.scala b/spark/common/src/main/scala/org/apache/sedona/stats/Weighting.scala index 6d5a273854..7713674261 100644 --- a/spark/common/src/main/scala/org/apache/sedona/stats/Weighting.scala +++ b/spark/common/src/main/scala/org/apache/sedona/stats/Weighting.scala @@ -18,7 +18,7 @@ */ package org.apache.sedona.stats -import org.apache.sedona.stats.Util.getGeometryColumnName +import org.apache.sedona.util.DfUtils.getGeometryColumnName import org.apache.spark.sql.functions._ import org.apache.spark.sql.sedona_sql.expressions.st_functions.{ST_Distance, ST_DistanceSpheroid} import org.apache.spark.sql.{Column, DataFrame} @@ -54,6 +54,10 @@ object Weighting { * name of the geometry column * @param useSpheroid * whether to use a cartesian or spheroidal distance calculation. Default is false + * @param savedAttributes + * the attributes to save in the neighbor column. Default is all columns. + * @param resultName + * the name of the resulting column. Default is 'weights'. 
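With the GeometrySerializer change above, a null byte array now deserializes to an empty geometry instead of failing. A tiny illustration of the patched behavior:

```scala
import org.apache.sedona.sql.utils.GeometrySerializer

// null input yields an empty GeometryCollection rather than an exception.
val geom = GeometrySerializer.deserialize(null)
assert(geom.isEmpty)
```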
* @return * The input DataFrame with a weight column added containing neighbors and their weights added * to each row. @@ -67,13 +71,15 @@ object Weighting { includeSelf: Boolean = false, selfWeight: Double = 1.0, geometry: String = null, - useSpheroid: Boolean = false): DataFrame = { + useSpheroid: Boolean = false, + savedAttributes: Seq[String] = null, + resultName: String = "weights"): DataFrame = { require(threshold >= 0, "Threshold must be greater than or equal to 0") require(alpha < 0, "Alpha must be less than 0") val geometryColumn = geometry match { - case null => getGeometryColumnName(dataframe) + case null => getGeometryColumnName(dataframe.schema) case _ => require( dataframe.schema.fields.exists(_.name == geometry), @@ -81,6 +87,12 @@ object Weighting { geometry } + // Always include the geometry column in the saved attributes + val savedAttributesWithGeom = + if (savedAttributes == null) null + else if (!savedAttributes.contains(geometryColumn)) savedAttributes :+ geometryColumn + else savedAttributes + val distanceFunction: (Column, Column) => Column = if (useSpheroid) ST_DistanceSpheroid else ST_Distance @@ -96,14 +108,6 @@ object Weighting { val formattedDataFrame = dataframe.withColumn(ID_COLUMN, sha2(to_json(struct("*")), 256)) - // Since spark 3.0 doesn't support dropFields, we need a work around - val withoutId = (prefix: String, colFunc: String => Column) => { - formattedDataFrame.schema.fields - .map(_.name) - .filter(name => name != ID_COLUMN) - .map(x => colFunc(prefix + "." + x).alias(x)) - } - formattedDataFrame .alias("l") .join( @@ -116,7 +120,13 @@ object Weighting { col(s"l.$ID_COLUMN"), struct("l.*").alias("left_contents"), struct( - struct(withoutId("r", col): _*).alias("neighbor"), + ( + savedAttributesWithGeom match { + case null => struct(col("r.*")).dropFields(ID_COLUMN) + case _ => + struct(savedAttributesWithGeom.map(c => col(s"r.$c")): _*) + } + ).alias("neighbor"), if (!binary) pow(distanceFunction(col(s"l.$geometryColumn"), col(s"r.$geometryColumn")), alpha) .alias("value") @@ -127,14 +137,18 @@ object Weighting { concat( collect_list(col("weight")), if (includeSelf) - array( - struct( - struct(withoutId("left_contents", first): _*).alias("neighbor"), - lit(selfWeight).alias("value"))) - else array()).alias("weights")) - .select("left_contents.*", "weights") + array(struct( + (savedAttributesWithGeom match { + case null => first("left_contents").dropFields(ID_COLUMN) + case _ => + struct( + savedAttributesWithGeom.map(c => first(s"left_contents.$c").alias(c)): _*) + }).alias("neighbor"), + lit(selfWeight).alias("value"))) + else array()).alias(resultName)) + .select("left_contents.*", resultName) .drop(ID_COLUMN) - .withColumn("weights", filter(col("weights"), _(f"neighbor")(geometryColumn).isNotNull)) + .withColumn(resultName, filter(col(resultName), _(f"neighbor")(geometryColumn).isNotNull)) } /** @@ -158,6 +172,10 @@ object Weighting { * name of the geometry column * @param useSpheroid * whether to use a cartesian or spheroidal distance calculation. Default is false + * @param savedAttributes + * the attributes to save in the neighbor column. Default is all columns. + * @param resultName + * the name of the resulting column. Default is 'weights'. * @return * The input DataFrame with a weight column added containing neighbors and their weights * (always 1) added to each row. 
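A hypothetical call threading the two new Weighting parameters through (a DataFrame df with a geometry column and an id column is assumed, and the remaining parameters are assumed to keep their defaults):

```scala
import org.apache.sedona.stats.Weighting
import org.apache.spark.sql.DataFrame

// Keep only 'id' (plus the geometry column, which is always re-added) in each
// neighbor struct, and write the result into 'nbrs' instead of 'weights'.
def withNeighbors(df: DataFrame): DataFrame =
  Weighting.addDistanceBandColumn(df, threshold = 1000.0, savedAttributes = Seq("id"), resultName = "nbrs")
```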
@@ -168,13 +186,73 @@ object Weighting { includeZeroDistanceNeighbors: Boolean = true, includeSelf: Boolean = false, geometry: String = null, - useSpheroid: Boolean = false): DataFrame = addDistanceBandColumn( + useSpheroid: Boolean = false, + savedAttributes: Seq[String] = null, + resultName: String = "weights"): DataFrame = addDistanceBandColumn( dataframe, threshold, binary = true, includeZeroDistanceNeighbors = includeZeroDistanceNeighbors, includeSelf = includeSelf, geometry = geometry, - useSpheroid = useSpheroid) + useSpheroid = useSpheroid, + savedAttributes = savedAttributes, + resultName = resultName) + + /** + * Annotates a dataframe with a weights column for each data record containing the other members + * within the threshold and their weight. Weights will be dist^alpha. The dataframe should + * contain at least one GeometryType column. Rows must be unique. If one geometry column is + * present it will be used automatically. If two are present, the one named 'geometry' will be + * used. If more than one are present and neither is named 'geometry', the column name must be + * provided. The new column will be named 'weights' unless a different resultName is provided. + * + * @param dataframe + * DataFrame with geometry column + * @param threshold + * Distance threshold for considering neighbors + * @param alpha + * alpha to use for inverse distance weights. Computation is dist^alpha. Default is -1.0 + * @param includeZeroDistanceNeighbors + * whether to include neighbors that are 0 distance. If 0 distance neighbors are included and + * binary is false, values are infinity as per the floating point spec (divide by 0) + * @param includeSelf + * whether to include self in the list of neighbors + * @param selfWeight + * the weight to provide for the self as its own neighbor. Default is 1.0 + * @param geometry + * name of the geometry column + * @param useSpheroid + * whether to use a cartesian or spheroidal distance calculation. Default is false + * @param savedAttributes + * the attributes to save in the neighbor column. Default is all columns. + * @param resultName + * the name of the resulting column. Default is 'weights'. + * @return + * The input DataFrame with a weight column added containing neighbors and their weights + * (dist^alpha) added to each row.
+ */ + def addWeightedDistanceBandColumn( + dataframe: DataFrame, + threshold: Double, + alpha: Double = -1.0, + includeZeroDistanceNeighbors: Boolean = false, + includeSelf: Boolean = false, + selfWeight: Double = 1.0, + geometry: String = null, + useSpheroid: Boolean = false, + savedAttributes: Seq[String] = null, + resultName: String = "weights"): DataFrame = addDistanceBandColumn( + dataframe, + threshold, + alpha = alpha, + binary = false, + includeZeroDistanceNeighbors = includeZeroDistanceNeighbors, + includeSelf = includeSelf, + selfWeight = selfWeight, + geometry = geometry, + useSpheroid = useSpheroid, + savedAttributes = savedAttributes, + resultName = resultName) } diff --git a/spark/common/src/main/scala/org/apache/sedona/stats/clustering/DBSCAN.scala b/spark/common/src/main/scala/org/apache/sedona/stats/clustering/DBSCAN.scala index e4cd1f90b4..c75291d971 100644 --- a/spark/common/src/main/scala/org/apache/sedona/stats/clustering/DBSCAN.scala +++ b/spark/common/src/main/scala/org/apache/sedona/stats/clustering/DBSCAN.scala @@ -18,7 +18,7 @@ */ package org.apache.sedona.stats.clustering -import org.apache.sedona.stats.Util.getGeometryColumnName +import org.apache.sedona.util.DfUtils.getGeometryColumnName import org.apache.spark.sql.functions._ import org.apache.spark.sql.sedona_sql.UDT.GeometryUDT import org.apache.spark.sql.sedona_sql.expressions.st_functions.{ST_Distance, ST_DistanceSpheroid} @@ -48,6 +48,11 @@ object DBSCAN { * whether to include outliers in the output. Default is false * @param useSpheroid * whether to use a cartesian or spheroidal distance calculation. Default is false + * @param isCoreColumnName + * the name of the column indicating whether the point is a core point. Default is + * "isCore" + * @param clusterColumnName + * the name of the column containing the cluster id. Default is "cluster" * @return * The input DataFrame with the cluster label added to each row. Outlier will have a cluster * value of -1 if included.
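A hypothetical call using the new column-name parameters documented above, assuming the enclosing method is DBSCAN.dbscan with a distance threshold parameter named epsilon (the hunk starts below that parameter):

```scala
import org.apache.sedona.stats.clustering.DBSCAN
import org.apache.spark.sql.DataFrame

// Rename the columns DBSCAN adds to the output.
def cluster(df: DataFrame): DataFrame =
  DBSCAN.dbscan(df, 1.0, 4, isCoreColumnName = "core", clusterColumnName = "clusterId")
```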
@@ -58,10 +63,12 @@ object DBSCAN { minPts: Int, geometry: String = null, includeOutliers: Boolean = true, - useSpheroid: Boolean = false): DataFrame = { + useSpheroid: Boolean = false, + isCoreColumnName: String = "isCore", + clusterColumnName: String = "cluster"): DataFrame = { val geometryCol = geometry match { - case null => getGeometryColumnName(dataframe) + case null => getGeometryColumnName(dataframe.schema) case _ => geometry } @@ -89,12 +96,12 @@ object DBSCAN { first(struct("left.*")).alias("leftContents"), count(col(s"right.id")).alias("neighbors_count"), collect_list(col(s"right.id")).alias("neighbors")) - .withColumn("isCore", col("neighbors_count") >= lit(minPts)) - .select("leftContents.*", "neighbors", "isCore") + .withColumn(isCoreColumnName, col("neighbors_count") >= lit(minPts)) + .select("leftContents.*", "neighbors", isCoreColumnName) .checkpoint() - val corePointsDF = isCorePointsDF.filter(col("isCore")) - val borderPointsDF = isCorePointsDF.filter(!col("isCore")) + val corePointsDF = isCorePointsDF.filter(col(isCoreColumnName)) + val borderPointsDF = isCorePointsDF.filter(!col(isCoreColumnName)) val coreEdgesDf = corePointsDF .select(col("id").alias("src"), explode(col("neighbors")).alias("dst")) @@ -117,14 +124,14 @@ object DBSCAN { val outliersDf = idDataframe .join(clusteredPointsDf, Seq("id"), "left_anti") - .withColumn("isCore", lit(false)) + .withColumn(isCoreColumnName, lit(false)) .withColumn("component", lit(-1)) .withColumn("neighbors", array().cast("array")) val completedDf = ( if (includeOutliers) clusteredPointsDf.unionByName(outliersDf) else clusteredPointsDf - ).withColumnRenamed("component", "cluster") + ).withColumnRenamed("component", clusterColumnName) val returnDf = if (hasIdColumn) { completedDf.drop("neighbors", "id").withColumnRenamed(ID_COLUMN, "id") diff --git a/spark/common/src/main/scala/org/apache/sedona/stats/outlierDetection/LocalOutlierFactor.scala b/spark/common/src/main/scala/org/apache/sedona/stats/outlierDetection/LocalOutlierFactor.scala index b98919de25..2595a90852 100644 --- a/spark/common/src/main/scala/org/apache/sedona/stats/outlierDetection/LocalOutlierFactor.scala +++ b/spark/common/src/main/scala/org/apache/sedona/stats/outlierDetection/LocalOutlierFactor.scala @@ -18,8 +18,8 @@ */ package org.apache.sedona.stats.outlierDetection -import org.apache.sedona.stats.Util.getGeometryColumnName -import org.apache.spark.sql.sedona_sql.expressions.st_functions.{ST_Distance, ST_DistanceSpheroid} +import org.apache.sedona.util.DfUtils.getGeometryColumnName +import org.apache.spark.sql.sedona_sql.expressions.st_functions.{ST_Distance, ST_DistanceSphere} import org.apache.spark.sql.{Column, DataFrame, SparkSession, functions => f} object LocalOutlierFactor { @@ -42,8 +42,10 @@ object LocalOutlierFactor { * name of the geometry column * @param handleTies * whether to handle ties in the k-distance calculation. Default is false - * @param useSpheroid + * @param useSphere * whether to use a cartesian or spheroidal distance calculation. Default is false + * @param resultColumnName + * the name of the column containing the lof for each row. 
Default is "lof" * * @return * A DataFrame containing the lof for each row @@ -53,7 +55,8 @@ object LocalOutlierFactor { k: Int = 20, geometry: String = null, handleTies: Boolean = false, - useSpheroid: Boolean = false): DataFrame = { + useSphere: Boolean = false, + resultColumnName: String = "lof"): DataFrame = { if (k < 1) throw new IllegalArgumentException("k must be a positive integer") @@ -67,10 +70,11 @@ object LocalOutlierFactor { } else "false" // else case to make compiler happy val distanceFunction: (Column, Column) => Column = - if (useSpheroid) ST_DistanceSpheroid else ST_Distance - val useSpheroidString = if (useSpheroid) "True" else "False" // for the SQL expression + if (useSphere) ST_DistanceSphere else ST_Distance + val useSpheroidString = if (useSphere) "True" else "False" // for the SQL expression - val geometryColumn = if (geometry == null) getGeometryColumnName(dataframe) else geometry + val geometryColumn = + if (geometry == null) getGeometryColumnName(dataframe.schema) else geometry val KNNFunction = "ST_KNN" @@ -136,8 +140,8 @@ object LocalOutlierFactor { .groupBy("a_id") .agg( f.first(CONTENTS_COLUMN_NAME).alias(CONTENTS_COLUMN_NAME), - (f.sum("b_lrd") / (f.count("b_lrd") * f.first("a_lrd"))).alias("lof")) - .select(f.col(f"$CONTENTS_COLUMN_NAME.*"), f.col("lof")) + (f.sum("b_lrd") / (f.count("b_lrd") * f.first("a_lrd"))).alias(resultColumnName)) + .select(f.col(f"$CONTENTS_COLUMN_NAME.*"), f.col(resultColumnName)) if (handleTies) SparkSession.getActiveSession.get.conf diff --git a/spark/common/src/main/scala/org/apache/sedona/stats/Util.scala b/spark/common/src/main/scala/org/apache/sedona/util/DfUtils.scala similarity index 84% rename from spark/common/src/main/scala/org/apache/sedona/stats/Util.scala rename to spark/common/src/main/scala/org/apache/sedona/util/DfUtils.scala index cdfe5fca23..5b2bea2180 100644 --- a/spark/common/src/main/scala/org/apache/sedona/stats/Util.scala +++ b/spark/common/src/main/scala/org/apache/sedona/util/DfUtils.scala @@ -16,14 +16,14 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.sedona.stats +package org.apache.sedona.util -import org.apache.spark.sql.DataFrame import org.apache.spark.sql.sedona_sql.UDT.GeometryUDT +import org.apache.spark.sql.types.StructType -private[stats] object Util { - def getGeometryColumnName(dataframe: DataFrame): String = { - val geomFields = dataframe.schema.fields.filter(_.dataType == GeometryUDT) +object DfUtils { + def getGeometryColumnName(schema: StructType): String = { + val geomFields = schema.fields.filter(_.dataType == GeometryUDT) if (geomFields.isEmpty) throw new IllegalArgumentException( diff --git a/spark/common/src/main/scala/org/apache/spark/sql/execution/datasource/stac/TemporalFilter.scala b/spark/common/src/main/scala/org/apache/spark/sql/execution/datasource/stac/TemporalFilter.scala new file mode 100644 index 0000000000..e5bfc947ac --- /dev/null +++ b/spark/common/src/main/scala/org/apache/spark/sql/execution/datasource/stac/TemporalFilter.scala @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.spark.sql.execution.datasource.stac + +import java.time.LocalDateTime + +/** + * A temporal filter that can be pushed down to the STAC data source. Implementations evaluate + * against a map from column name to timestamp and provide a simple string representation. + */ +trait TemporalFilter { + def evaluate(columns: Map[String, LocalDateTime]): Boolean + def simpleString: String +} + +object TemporalFilter { + + case class AndFilter(left: TemporalFilter, right: TemporalFilter) extends TemporalFilter { + override def evaluate(columns: Map[String, LocalDateTime]): Boolean = { + left.evaluate(columns) && right.evaluate(columns) + } + + override def simpleString: String = s"(${left.simpleString}) AND (${right.simpleString})" + } + + case class OrFilter(left: TemporalFilter, right: TemporalFilter) extends TemporalFilter { + override def evaluate(columns: Map[String, LocalDateTime]): Boolean = + left.evaluate(columns) || right.evaluate(columns) + override def simpleString: String = s"(${left.simpleString}) OR (${right.simpleString})" + } + + case class LessThanFilter(columnName: String, value: LocalDateTime) extends TemporalFilter { + override def evaluate(columns: Map[String, LocalDateTime]): Boolean = { + columns.get(columnName).exists(_ isBefore value) + } + override def simpleString: String = s"$columnName < $value" + } + + case class GreaterThanFilter(columnName: String, value: LocalDateTime) extends TemporalFilter { + override def evaluate(columns: Map[String, LocalDateTime]): Boolean = { + columns.get(columnName).exists(_ isAfter value) + } + override def simpleString: String = s"$columnName > $value" + } + + case class EqualFilter(columnName: String, value: LocalDateTime) extends TemporalFilter { + override def evaluate(columns: Map[String, LocalDateTime]): Boolean = { + columns.get(columnName).exists(_ isEqual value) + } + override def simpleString: String = s"$columnName = $value" + } +} diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/adapters/StructuredAdapter.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/adapters/StructuredAdapter.scala new file mode 100644 index 0000000000..70cef8d783 --- /dev/null +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/adapters/StructuredAdapter.scala @@ -0,0 +1,233 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License.
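A small illustration of how the temporal filters above compose and evaluate (the column name and timestamps are invented):

```scala
import java.time.LocalDateTime
import org.apache.spark.sql.execution.datasource.stac.TemporalFilter
import org.apache.spark.sql.execution.datasource.stac.TemporalFilter.{AndFilter, GreaterThanFilter, LessThanFilter}

// datetime > 2020-01-01 AND datetime < 2021-01-01
val filter: TemporalFilter = AndFilter(
  GreaterThanFilter("datetime", LocalDateTime.of(2020, 1, 1, 0, 0)),
  LessThanFilter("datetime", LocalDateTime.of(2021, 1, 1, 0, 0)))

assert(filter.evaluate(Map("datetime" -> LocalDateTime.of(2020, 6, 15, 12, 0))))
assert(!filter.evaluate(Map("datetime" -> LocalDateTime.of(2022, 1, 1, 0, 0))))
```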
+ */ +package org.apache.spark.sql.sedona_sql.adapters + +import org.apache.sedona.core.spatialRDD.SpatialRDD +import org.apache.sedona.sql.utils.GeometrySerializer +import org.apache.sedona.util.DfUtils +import org.apache.spark.api.java.JavaPairRDD +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.types.{DataType, StructType} +import org.apache.spark.sql.{DataFrame, Row, SparkSession} +import org.locationtech.jts.geom.Geometry +import org.slf4j.{Logger, LoggerFactory} +import org.apache.sedona.core.spatialPartitioning.GenericUniquePartitioner + +/** + * Adapter for converting between DataFrame and SpatialRDD. It provides methods to convert + * DataFrame to SpatialRDD and vice versa without losing schema. It is different from + * [[org.apache.sedona.sql.utils.Adapter]] which loses the schema information during conversion. + * This should be used if your data starts as a DataFrame and you want to convert it to SpatialRDD + */ +object StructuredAdapter { + val logger: Logger = LoggerFactory.getLogger(getClass) + + /** + * Convert RDD[Row] to SpatialRDD. It puts Row as user data of Geometry. + * @param rdd + * @param geometryFieldName + * @return + */ + def toSpatialRdd(rdd: RDD[Row], geometryFieldName: String): SpatialRDD[Geometry] = { + val spatialRDD = new SpatialRDD[Geometry] + if (rdd.isEmpty()) { + spatialRDD.schema = StructType(Seq()) + } else spatialRDD.schema = rdd.first().schema + spatialRDD.rawSpatialRDD = rdd + .map(row => { + val geom = row.getAs[Geometry](geometryFieldName) + geom.setUserData(row.copy()) + geom + }) + spatialRDD + } + + /** + * Convert RDD[Row] to SpatialRDD. It puts Row as user data of Geometry. It auto-detects + * geometry column if geometryFieldName is not provided. It uses the first geometry column in + * RDD. + * @param rdd + * @return + */ + def toSpatialRdd(rdd: RDD[Row]): SpatialRDD[Geometry] = { + require(rdd.count() > 0, "Input RDD cannot be empty.") + toSpatialRdd(rdd, DfUtils.getGeometryColumnName(rdd.first().schema)) + } + + /** + * Convert SpatialRDD to RDD[Row]. It extracts Row from user data of Geometry. + * @param spatialRDD + * @return + */ + def toRowRdd(spatialRDD: SpatialRDD[Geometry]): RDD[Row] = { + spatialRDD.rawSpatialRDD.map(geometry => { + val row = geometry.getUserData.asInstanceOf[Row] + row + }) + } + + /** + * Convert DataFrame to SpatialRDD. It puts InternalRow as user data of Geometry. It allows only + * one geometry column. + * + * @param dataFrame + * @param geometryFieldName + */ + def toSpatialRdd(dataFrame: DataFrame, geometryFieldName: String): SpatialRDD[Geometry] = { + val spatialRDD = new SpatialRDD[Geometry] + spatialRDD.schema = dataFrame.schema + val ordinal = spatialRDD.schema.fieldIndex(geometryFieldName) + spatialRDD.rawSpatialRDD = dataFrame.queryExecution.toRdd + .map(row => { + val geom = GeometrySerializer.deserialize(row.getBinary(ordinal)) + geom.setUserData(row.copy()) + geom + }) + spatialRDD + } + + /** + * Convert DataFrame to SpatialRDD. It puts InternalRow as user data of Geometry. It + * auto-detects geometry column if geometryFieldName is not provided. It uses the first geometry + * column in DataFrame. + * @param dataFrame + * @return + */ + def toSpatialRdd(dataFrame: DataFrame): SpatialRDD[Geometry] = { + toSpatialRdd(dataFrame, DfUtils.getGeometryColumnName(dataFrame.schema)) + } + + /** + * Convert SpatialRDD.rawSpatialRdd to DataFrame + * @param spatialRDD + * The SpatialRDD to convert. It must have rawSpatialRDD set. 
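+   *
+   *   A minimal round-trip sketch (assumes an active SparkSession `spark` and a
+   *   DataFrame `df` with a geometry column named "geometry"; names are illustrative):
+   *   {{{
+   *   val spatialRdd = StructuredAdapter.toSpatialRdd(df, "geometry")
+   *   val restored = StructuredAdapter.toDf(spatialRdd, spark)
+   *   }}}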
+   * @param sparkSession
+   * @return
+   */
+  def toDf(spatialRDD: SpatialRDD[Geometry], sparkSession: SparkSession): DataFrame = {
+    val rowRdd = spatialRDD.rawSpatialRDD.map(geometry => {
+      val row = geometry.getUserData.asInstanceOf[InternalRow]
+      row
+    })
+    sparkSession.internalCreateDataFrame(rowRdd, spatialRDD.schema)
+  }
+
+  /**
+   * Convert SpatialRDD.spatialPartitionedRDD to DataFrame. This is useful when you want to
+   * convert SpatialRDD after spatial partitioning.
+   * @param spatialRDD
+   *   The SpatialRDD to convert. It must have spatialPartitionedRDD set. You must call the
+   *   spatialPartitioning method before calling this method.
+   * @param sparkSession
+   * @return
+   */
+  def toSpatialPartitionedDf(
+      spatialRDD: SpatialRDD[Geometry],
+      sparkSession: SparkSession): DataFrame = {
+    if (spatialRDD.spatialPartitionedRDD == null)
+      throw new RuntimeException(
+        "SpatialRDD is not spatially partitioned. Please call spatialPartitioning method before calling this method.")
+
+    if (!spatialRDD.getPartitioner().isInstanceOf[GenericUniquePartitioner]) {
+      logger.warn(
+        "SpatialPartitionedRDD might have duplicate geometries. Please make sure you are aware of it.")
+    }
+    val rowRdd = spatialRDD.spatialPartitionedRDD.map(geometry => {
+      val row = geometry.getUserData.asInstanceOf[InternalRow]
+      row
+    })
+    sparkSession.internalCreateDataFrame(rowRdd, spatialRDD.schema)
+  }
+
+  /**
+   * Convert JavaPairRDD[Geometry, Geometry] to DataFrame. This method is useful when you want to
+   * convert the result of a spatial join to DataFrame.
+   * @param spatialPairRDD
+   *   The JavaPairRDD to convert.
+   * @param leftSchemaJson
+   *   Schema of the left side, in JSON format.
+   * @param rightSchemaJson
+   *   Schema of the right side, in JSON format.
+   * @param sparkSession
+   * @return
+   */
+  def toDf(
+      spatialPairRDD: JavaPairRDD[Geometry, Geometry],
+      leftSchemaJson: String,
+      rightSchemaJson: String,
+      sparkSession: SparkSession): DataFrame = {
+    val leftSchema = DataType.fromJson(leftSchemaJson).asInstanceOf[StructType]
+    val rightSchema = DataType.fromJson(rightSchemaJson).asInstanceOf[StructType]
+    toDf(spatialPairRDD, leftSchema, rightSchema, sparkSession)
+  }
+
+  /**
+   * Convert JavaPairRDD[Geometry, Geometry] to DataFrame. This method is useful when you want to
+   * convert the result of a spatial join to DataFrame.
+   * @param spatialPairRDD
+   *   The JavaPairRDD to convert.
+   * @param leftSchema
+   *   The schema of the left side.
+   * @param rightSchema
+   *   The schema of the right side.
+   * @param sparkSession
+   * @return
+   */
+  def toDf(
+      spatialPairRDD: JavaPairRDD[Geometry, Geometry],
+      leftSchema: StructType,
+      rightSchema: StructType,
+      sparkSession: SparkSession): DataFrame = {
+    val rowRdd = spatialPairRDD.rdd.map(pair => {
+      val leftRow = pair._1.getUserData.asInstanceOf[InternalRow].toSeq(leftSchema)
+      val rightRow = pair._2.getUserData.asInstanceOf[InternalRow].toSeq(rightSchema)
+      InternalRow.fromSeq(leftRow ++ rightRow)
+    })
+    sparkSession.internalCreateDataFrame(
+      rowRdd,
+      StructType(leftSchema.fields ++ rightSchema.fields))
+  }
+
+  /**
+   * Convert JavaPairRDD[Geometry, Geometry] to DataFrame. This method is useful when you want to
+   * convert the result of a spatial join to DataFrame.
+   * @param spatialPairRDD
+   *   The JavaPairRDD to convert.
+   * @param originalLeftSpatialRdd
+   *   The original left SpatialRDD involved in the join. It is used to get the schema of the left
+   *   side.
+   * @param originalRightSpatialRdd
+   *   The original right SpatialRDD involved in the join.
It is used to get the schema of the + * right side. + * @param sparkSession + * @return + */ + def toDf( + spatialPairRDD: JavaPairRDD[Geometry, Geometry], + originalLeftSpatialRdd: SpatialRDD[Geometry], + originalRightSpatialRdd: SpatialRDD[Geometry], + sparkSession: SparkSession): DataFrame = { + toDf( + spatialPairRDD, + originalLeftSpatialRdd.schema, + originalRightSpatialRdd.schema, + sparkSession) + } +} diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/expressions/Functions.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/expressions/Functions.scala index dc8a290b8a..de7e3170ca 100644 --- a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/expressions/Functions.scala +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/expressions/Functions.scala @@ -35,6 +35,17 @@ import org.apache.spark.sql.sedona_sql.expressions.InferrableFunctionConverter._ import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.Utils +case class ST_LabelPoint(inputExpressions: Seq[Expression]) + extends InferredExpression( + inferrableFunction1(Functions.labelPoint), + inferrableFunction2(Functions.labelPoint), + inferrableFunction3(Functions.labelPoint)) { + + protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]) = { + copy(inputExpressions = newChildren) + } +} + /** * Return the distance between two geometries. * @@ -719,6 +730,15 @@ case class ST_MMax(inputExpressions: Seq[Expression]) } } +case class ST_LineSegments(inputExpressions: Seq[Expression]) + extends InferredExpression( + inferrableFunction2(Functions.lineSegments), + inferrableFunction1(Functions.lineSegments)) { + protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]) = { + copy(inputExpressions = newChildren) + } +} + /** * Return a linestring being a substring of the input one starting and ending at the given * fractions of total 2d length. Second and third arguments are Double values between 0 and 1. @@ -1004,6 +1024,17 @@ case class ST_Perimeter(inputExpressions: Seq[Expression]) } } +case class ST_Perimeter2D(inputExpressions: Seq[Expression]) + extends InferredExpression( + inferrableFunction3(Functions.perimeter), + inferrableFunction2(Functions.perimeter), + inferrableFunction1(Functions.perimeter)) { + + protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]) = { + copy(inputExpressions = newChildren) + } +} + case class ST_Points(inputExpressions: Seq[Expression]) extends InferredExpression(Functions.points _) { diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/expressions/GeoStatsFunctions.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/expressions/GeoStatsFunctions.scala new file mode 100644 index 0000000000..8c6b645daf --- /dev/null +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/expressions/GeoStatsFunctions.scala @@ -0,0 +1,259 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.sedona_sql.expressions
+
+import org.apache.sedona.core.utils.SedonaConf
+import org.apache.sedona.stats.Weighting.{addBinaryDistanceBandColumn, addWeightedDistanceBandColumn}
+import org.apache.sedona.stats.clustering.DBSCAN.dbscan
+import org.apache.sedona.stats.hotspotDetection.GetisOrd.gLocal
+import org.apache.sedona.stats.outlierDetection.LocalOutlierFactor.localOutlierFactor
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression, ImplicitCastInputTypes, Literal, ScalarSubquery, Unevaluable}
+import org.apache.spark.sql.execution.{LogicalRDD, SparkPlan}
+import org.apache.spark.sql.functions.{col, struct}
+import org.apache.spark.sql.sedona_sql.UDT.GeometryUDT
+import org.apache.spark.sql.types._
+import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
+
+import scala.reflect.ClassTag
+
+// We mark ST_GeoStatsFunction as non-deterministic to prevent the filter push-down
+// optimization pass from duplicating an aliased ST_GeoStatsFunction when pushing it
+// down through a Project operator, which would cause the function to be evaluated twice.
+trait ST_GeoStatsFunction
+    extends Expression
+    with ImplicitCastInputTypes
+    with Unevaluable
+    with Serializable {
+
+  final override lazy val deterministic: Boolean = false
+
+  override def nullable: Boolean = true
+
+  private final lazy val sparkSession = SparkSession.getActiveSession.get
+
+  protected final lazy val geometryColumnName = getInputName(0, "geometry")
+
+  protected def getInputName(i: Int, fieldName: String): String = children(i) match {
+    case ref: AttributeReference => ref.name
+    case _ =>
+      throw new IllegalArgumentException(
+        f"$fieldName argument must be a named reference to an existing column")
+  }
+
+  protected def getInputNames(i: Int, fieldName: String): Seq[String] =
+    children(i).dataType match {
+      case StructType(fields) => fields.map(_.name)
+      case _ => throw new IllegalArgumentException(f"$fieldName argument must be a struct")
+    }
+
+  protected def getResultName(resultAttrs: Seq[Attribute]): String = resultAttrs match {
+    case Seq(attr) => attr.name
+    case _ => throw new IllegalArgumentException("resultAttrs must have exactly one attribute")
+  }
+
+  protected def doExecute(dataframe: DataFrame, resultAttrs: Seq[Attribute]): DataFrame
+
+  protected def getScalarValue[T](i: Int, name: String)(implicit ct: ClassTag[T]): T = {
+    children(i) match {
+      case Literal(l: T, _) => l
+      case _: Literal =>
+        throw new IllegalArgumentException(f"$name must be an instance of ${ct.runtimeClass}")
+      case s: ScalarSubquery =>
+        s.eval() match {
+          case t: T => t
+          case _ =>
+            throw new IllegalArgumentException(
+              f"$name must be an instance of ${ct.runtimeClass}")
+        }
+      case _ => throw new IllegalArgumentException(f"$name must be a scalar value")
    }
+  }
+
+  def execute(plan: SparkPlan, resultAttrs: Seq[Attribute]): RDD[InternalRow] = {
+    val df = doExecute(
+      Dataset.ofRows(sparkSession, LogicalRDD(plan.output,
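+        // Wrap the physical plan back into a DataFrame (via LogicalRDD) so that the
+        // DataFrame-based geo-stats routine in doExecute can run on it; plan.execute()
+        // below supplies the underlying rows.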
plan.execute())(sparkSession)), + resultAttrs) + df.queryExecution.toRdd + } + +} + +case class ST_DBSCAN(children: Seq[Expression]) extends ST_GeoStatsFunction { + + override def dataType: DataType = StructType( + Seq(StructField("isCore", BooleanType), StructField("cluster", LongType))) + + override def inputTypes: Seq[AbstractDataType] = + Seq(GeometryUDT, DoubleType, IntegerType, BooleanType) + + protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression = + copy(children = newChildren) + + override def doExecute(dataframe: DataFrame, resultAttrs: Seq[Attribute]): DataFrame = { + require( + !dataframe.columns.contains("__isCore"), + "__isCore is a reserved name by the dbscan algorithm. Please rename the columns before calling the ST_DBSCAN function.") + require( + !dataframe.columns.contains("__cluster"), + "__cluster is a reserved name by the dbscan algorithm. Please rename the columns before calling the ST_DBSCAN function.") + + dbscan( + dataframe, + getScalarValue[Double](1, "epsilon"), + getScalarValue[Int](2, "minPts"), + geometryColumnName, + SedonaConf.fromActiveSession().getDBSCANIncludeOutliers, + getScalarValue[Boolean](3, "useSpheroid"), + "__isCore", + "__cluster") + .withColumn(getResultName(resultAttrs), struct(col("__isCore"), col("__cluster"))) + .drop("__isCore", "__cluster") + } +} + +case class ST_LocalOutlierFactor(children: Seq[Expression]) extends ST_GeoStatsFunction { + + override def dataType: DataType = DoubleType + + override def inputTypes: Seq[AbstractDataType] = + Seq(GeometryUDT, IntegerType, BooleanType) + + protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression = + copy(children = newChildren) + + override def doExecute(dataframe: DataFrame, resultAttrs: Seq[Attribute]): DataFrame = { + localOutlierFactor( + dataframe, + getScalarValue[Int](1, "k"), + geometryColumnName, + SedonaConf.fromActiveSession().isIncludeTieBreakersInKNNJoins, + getScalarValue[Boolean](2, "useSphere"), + getResultName(resultAttrs)) + } +} + +case class ST_GLocal(children: Seq[Expression]) extends ST_GeoStatsFunction { + + override def dataType: DataType = StructType( + Seq( + StructField("G", DoubleType), + StructField("EG", DoubleType), + StructField("VG", DoubleType), + StructField("Z", DoubleType), + StructField("P", DoubleType))) + + override def inputTypes: Seq[AbstractDataType] = { + val xDataType = children(0).dataType + require(xDataType == DoubleType || xDataType == IntegerType, "x must be a numeric value") + Seq( + xDataType, + children(1).dataType, // Array of the weights + BooleanType) + } + + protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression = + copy(children = newChildren) + + override def doExecute(dataframe: DataFrame, resultAttrs: Seq[Attribute]): DataFrame = { + gLocal( + dataframe, + getInputName(0, "x"), + getInputName(1, "weights"), + 0, + getScalarValue[Boolean](2, "star"), + 0.0) + .withColumn( + getResultName(resultAttrs), + struct(col("G"), col("EG"), col("VG"), col("Z"), col("P"))) + .drop("G", "EG", "VG", "Z", "P") + } +} + +case class ST_BinaryDistanceBandColumn(children: Seq[Expression]) extends ST_GeoStatsFunction { + override def dataType: DataType = ArrayType( + StructType( + Seq(StructField("neighbor", children(5).dataType), StructField("value", DoubleType)))) + + override def inputTypes: Seq[AbstractDataType] = + Seq(GeometryUDT, DoubleType, BooleanType, BooleanType, BooleanType, children(5).dataType) + + protected def 
withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression = + copy(children = newChildren) + + override def doExecute(dataframe: DataFrame, resultAttrs: Seq[Attribute]): DataFrame = { + val attributeNames = getInputNames(5, "attributes") + require(attributeNames.nonEmpty, "attributes must have at least one column") + require( + attributeNames.contains(geometryColumnName), + "attributes must contain the geometry column") + + addBinaryDistanceBandColumn( + dataframe, + getScalarValue[Double](1, "threshold"), + getScalarValue[Boolean](2, "includeZeroDistanceNeighbors"), + getScalarValue[Boolean](3, "includeSelf"), + geometryColumnName, + getScalarValue[Boolean](4, "useSpheroid"), + attributeNames, + getResultName(resultAttrs)) + } +} + +case class ST_WeightedDistanceBandColumn(children: Seq[Expression]) extends ST_GeoStatsFunction { + + override def dataType: DataType = ArrayType( + StructType( + Seq(StructField("neighbor", children(7).dataType), StructField("value", DoubleType)))) + + override def inputTypes: Seq[AbstractDataType] = + Seq( + GeometryUDT, + DoubleType, + DoubleType, + BooleanType, + BooleanType, + DoubleType, + BooleanType, + children(7).dataType) + + protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression = + copy(children = newChildren) + + override def doExecute(dataframe: DataFrame, resultAttrs: Seq[Attribute]): DataFrame = { + val attributeNames = getInputNames(7, "attributes") + require(attributeNames.nonEmpty, "attributes must have at least one column") + require( + attributeNames.contains(geometryColumnName), + "attributes must contain the geometry column") + + addWeightedDistanceBandColumn( + dataframe, + getScalarValue[Double](1, "threshold"), + getScalarValue[Double](2, "alpha"), + getScalarValue[Boolean](3, "includeZeroDistanceNeighbors"), + getScalarValue[Boolean](4, "includeSelf"), + getScalarValue[Double](5, "selfWeight"), + geometryColumnName, + getScalarValue[Boolean](6, "useSpheroid"), + attributeNames, + getResultName(resultAttrs)) + } +} diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/expressions/st_functions.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/expressions/st_functions.scala index 7bb753cc28..84d555ff64 100644 --- a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/expressions/st_functions.scala +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/expressions/st_functions.scala @@ -45,6 +45,22 @@ object st_functions extends DataFrameAPI { def ST_AddPoint(lineString: String, point: String, index: Int): Column = wrapExpression[ST_AddPoint](lineString, point, index) + def ST_LabelPoint(geometry: Column): Column = + wrapExpression[ST_LabelPoint](geometry) + def ST_LabelPoint(geometry: String): Column = + wrapExpression[ST_LabelPoint](geometry) + def ST_LabelPoint(geometry: Column, gridResolution: Column): Column = + wrapExpression[ST_LabelPoint](geometry, gridResolution) + def ST_LabelPoint(geometry: String, gridResolution: Integer): Column = + wrapExpression[ST_LabelPoint](geometry, gridResolution) + def ST_LabelPoint(geometry: Column, gridResolution: Column, goodnessThreshold: Column): Column = + wrapExpression[ST_LabelPoint](geometry, gridResolution, goodnessThreshold) + def ST_LabelPoint( + geometry: String, + gridResolution: Integer, + goodnessThreshold: Double): Column = + wrapExpression[ST_LabelPoint](geometry, gridResolution, goodnessThreshold) + def ST_Area(geometry: Column): Column = wrapExpression[ST_Area](geometry) def 
ST_Area(geometry: String): Column = wrapExpression[ST_Area](geometry)
@@ -317,6 +333,13 @@ object st_functions extends DataFrameAPI {
   def ST_LineMerge(multiLineString: String): Column =
     wrapExpression[ST_LineMerge](multiLineString)
+  def ST_LineSegments(geom: Column): Column = wrapExpression[ST_LineSegments](geom)
+  def ST_LineSegments(geom: String): Column = wrapExpression[ST_LineSegments](geom)
+  def ST_LineSegments(geom: Column, lenient: Column): Column =
+    wrapExpression[ST_LineSegments](geom, lenient)
+  def ST_LineSegments(geom: String, lenient: Boolean): Column =
+    wrapExpression[ST_LineSegments](geom, lenient)
+
   def ST_LineSubstring(lineString: Column, startFraction: Column, endFraction: Column): Column =
     wrapExpression[ST_LineSubstring](lineString, startFraction, endFraction)
   def ST_LineSubstring(lineString: String, startFraction: Double, endFraction: Double): Column =
@@ -369,6 +392,17 @@ object st_functions extends DataFrameAPI {
   def ST_Perimeter(geom: String, use_spheroid: Boolean, lenient: Boolean): Column =
     wrapExpression[ST_Perimeter](geom, use_spheroid, lenient)
+  def ST_Perimeter2D(geom: Column): Column = wrapExpression[ST_Perimeter2D](geom)
+  def ST_Perimeter2D(geom: String): Column = wrapExpression[ST_Perimeter2D](geom)
+  def ST_Perimeter2D(geom: Column, use_spheroid: Column): Column =
+    wrapExpression[ST_Perimeter2D](geom, use_spheroid)
+  def ST_Perimeter2D(geom: String, use_spheroid: Boolean): Column =
+    wrapExpression[ST_Perimeter2D](geom, use_spheroid)
+  def ST_Perimeter2D(geom: Column, use_spheroid: Column, lenient: Column): Column =
+    wrapExpression[ST_Perimeter2D](geom, use_spheroid, lenient)
+  def ST_Perimeter2D(geom: String, use_spheroid: Boolean, lenient: Boolean): Column =
+    wrapExpression[ST_Perimeter2D](geom, use_spheroid, lenient)
+
   def ST_Points(geom: Column): Column = wrapExpression[ST_Points](geom)
   def ST_Points(geom: String): Column = wrapExpression[ST_Points](geom)
@@ -943,4 +977,46 @@ object st_functions extends DataFrameAPI {
   def ST_InterpolatePoint(geom1: String, geom2: String): Column =
     wrapExpression[ST_InterpolatePoint](geom1, geom2)
+  def ST_DBSCAN(geom: Column, epsilon: Column, minPoints: Column, useSpheroid: Column): Column =
+    wrapExpression[ST_DBSCAN](geom, epsilon, minPoints, useSpheroid)
+
+  def ST_LocalOutlierFactor(geom: Column, k: Column, useSpheroid: Column): Column =
+    wrapExpression[ST_LocalOutlierFactor](geom, k, useSpheroid)
+
+  def ST_GLocal(x: Column, weights: Column, star: Column): Column =
+    wrapExpression[ST_GLocal](x, weights, star)
+
+  def ST_BinaryDistanceBandColumn(
+      geometry: Column,
+      threshold: Column,
+      includeZeroDistanceNeighbors: Column,
+      includeSelf: Column,
+      useSpheroid: Column,
+      attributes: Column): Column =
+    wrapExpression[ST_BinaryDistanceBandColumn](
+      geometry,
+      threshold,
+      includeZeroDistanceNeighbors,
+      includeSelf,
+      useSpheroid,
+      attributes)
+
+  def ST_WeightedDistanceBandColumn(
+      geometry: Column,
+      threshold: Column,
+      alpha: Column,
+      includeZeroDistanceNeighbors: Column,
+      includeSelf: Column,
+      selfWeight: Column,
+      useSpheroid: Column,
+      attributes: Column): Column =
+    wrapExpression[ST_WeightedDistanceBandColumn](
+      geometry,
+      threshold,
+      alpha,
+      includeZeroDistanceNeighbors,
+      includeSelf,
+      selfWeight,
+      useSpheroid,
+      attributes)
 }
diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/stac/StacBatch.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/stac/StacBatch.scala
new file mode 100644
index 0000000000..98cb35ee07
--- /dev/null
+++
b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/stac/StacBatch.scala @@ -0,0 +1,264 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.spark.sql.sedona_sql.io.stac + +import com.fasterxml.jackson.databind.{JsonNode, ObjectMapper} +import org.apache.spark.sql.connector.read.{Batch, InputPartition, PartitionReaderFactory} +import org.apache.spark.sql.execution.datasource.stac.TemporalFilter +import org.apache.spark.sql.execution.datasources.parquet.{GeoParquetSpatialFilter, GeometryFieldMetaData} +import org.apache.spark.sql.sedona_sql.io.stac.StacUtils.getNumPartitions +import org.apache.spark.sql.types.StructType + +import java.time.LocalDateTime +import java.time.format.DateTimeFormatterBuilder +import java.time.temporal.ChronoField +import scala.jdk.CollectionConverters._ + +/** + * The `StacBatch` class represents a batch of partitions for reading data in the SpatioTemporal + * Asset Catalog (STAC) data source. It implements the `Batch` interface from Apache Spark's data + * source API. + * + * This class provides methods to plan input partitions and create a partition reader factory, + * which are necessary for batch data processing. + */ +case class StacBatch( + stacCollectionUrl: String, + stacCollectionJson: String, + schema: StructType, + opts: Map[String, String], + spatialFilter: Option[GeoParquetSpatialFilter], + temporalFilter: Option[TemporalFilter]) + extends Batch { + + val mapper = new ObjectMapper() + + /** + * Plans the input partitions for reading data from the STAC data source. + * + * @return + * An array of input partitions for reading STAC data. 
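+   *
+   * A worked sketch of the grouping arithmetic (values illustrative): with 10 item
+   * links and 4 target partitions,
+   * {{{
+   * val partitionSize = Math.ceil(10.0 / 4).toInt // = 3
+   * // => item links are grouped into partitions of sizes 3, 3, 3 and 1
+   * }}}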
+ */ + override def planInputPartitions(): Array[InputPartition] = { + val stacCollectionBasePath = StacUtils.getStacCollectionBasePath(stacCollectionUrl) + + // Initialize the itemLinks array + val itemLinks = scala.collection.mutable.ArrayBuffer[String]() + + // Start the recursive collection of item links + collectItemLinks(stacCollectionBasePath, stacCollectionJson, itemLinks) + + // Handle when the number of items is less than 1 + if (itemLinks.isEmpty) { + return Array.empty[InputPartition] + } + + val numPartitions = getNumPartitions( + itemLinks.length, + opts.getOrElse("numPartitions", "-1").toInt, + opts.getOrElse("maxPartitionItemFiles", "-1").toInt, + opts.getOrElse("defaultParallelism", "1").toInt) + + // Handle when the number of items is less than the number of partitions + if (itemLinks.length < numPartitions) { + return itemLinks.zipWithIndex.map { case (item, index) => + StacPartition(index, Array(item), new java.util.HashMap[String, String]()) + }.toArray + } + + // Determine how many items to put in each partition + val partitionSize = Math.ceil(itemLinks.length.toDouble / numPartitions).toInt + + // Group the item links into partitions + itemLinks + .grouped(partitionSize) + .zipWithIndex + .map { case (items, index) => + // Create a StacPartition for each group of items + StacPartition(index, items.toArray, new java.util.HashMap[String, String]()) + } + .toArray + } + + /** + * Recursively processes collections and collects item links. + * + * @param collectionBasePath + * The base path of the STAC collection. + * @param collectionJson + * The JSON string representation of the STAC collection. + * @param itemLinks + * The list of item links to populate. + */ + private def collectItemLinks( + collectionBasePath: String, + collectionJson: String, + itemLinks: scala.collection.mutable.ArrayBuffer[String]): Unit = { + // Parse the JSON string into a JsonNode (tree representation of JSON) + val rootNode: JsonNode = mapper.readTree(collectionJson) + + // Extract item links from the "links" array + val linksNode = rootNode.get("links") + val iterator = linksNode.elements() + while (iterator.hasNext) { + val linkNode = iterator.next() + val rel = linkNode.get("rel").asText() + val href = linkNode.get("href").asText() + + // item links are identified by the "rel" value of "item" or "items" + if (rel == "item" || rel == "items") { + // need to handle relative paths and local file paths + val itemUrl = if (href.startsWith("http") || href.startsWith("file")) { + href + } else { + collectionBasePath + href + } + itemLinks += itemUrl // Add the item URL to the list + } else if (rel == "child") { + val childUrl = if (href.startsWith("http") || href.startsWith("file")) { + href + } else { + collectionBasePath + href + } + // Recursively process the linked collection + val linkedCollectionJson = StacUtils.loadStacCollectionToJson(childUrl) + val nestedCollectionBasePath = StacUtils.getStacCollectionBasePath(childUrl) + val collectionFiltered = + filterCollection(linkedCollectionJson, spatialFilter, temporalFilter) + + if (!collectionFiltered) { + collectItemLinks(nestedCollectionBasePath, linkedCollectionJson, itemLinks) + } + } + } + } + + /** + * Filters a collection based on the provided spatial and temporal filters. + * + * @param collectionJson + * The JSON string representation of the STAC collection. + * @param spatialFilter + * The spatial filter to apply to the collection. + * @param temporalFilter + * The temporal filter to apply to the collection. 
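+   *
+   *   For example (values hypothetical), a pushed-down range predicate such as
+   *   {{{
+   *   TemporalFilter.AndFilter(
+   *     TemporalFilter.GreaterThanFilter("datetime", LocalDateTime.parse("2020-01-01T00:00:00")),
+   *     TemporalFilter.LessThanFilter("datetime", LocalDateTime.parse("2021-01-01T00:00:00")))
+   *   }}}
+   *   filters a collection out only when no endpoint of its temporal extent satisfies it.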
+   * @return
+   *   `true` if the collection is filtered out, `false` otherwise.
+   */
+  def filterCollection(
+      collectionJson: String,
+      spatialFilter: Option[GeoParquetSpatialFilter],
+      temporalFilter: Option[TemporalFilter]): Boolean = {
+
+    val mapper = new ObjectMapper()
+    val rootNode: JsonNode = mapper.readTree(collectionJson)
+
+    // Filter based on spatial extent
+    val spatialFiltered = spatialFilter match {
+      case Some(filter) =>
+        val extentNode = rootNode.path("extent").path("spatial").path("bbox")
+        if (extentNode.isMissingNode) {
+          false
+        } else {
+          val bbox = extentNode
+            .elements()
+            .asScala
+            .map { bboxNode =>
+              val minX = bboxNode.get(0).asDouble()
+              val minY = bboxNode.get(1).asDouble()
+              val maxX = bboxNode.get(2).asDouble()
+              val maxY = bboxNode.get(3).asDouble()
+              (minX, minY, maxX, maxY)
+            }
+            .toList
+
+          !bbox.exists { case (minX, minY, maxX, maxY) =>
+            val geometryTypes = Seq("Polygon")
+            val bboxSeq = Seq(minX, minY, maxX, maxY)
+
+            val geometryFieldMetaData = GeometryFieldMetaData(
+              encoding = "WKB",
+              geometryTypes = geometryTypes,
+              bbox = bboxSeq,
+              crs = None,
+              covering = None)
+
+            filter.evaluate(Map("geometry" -> geometryFieldMetaData))
+          }
+        }
+      case None => false
+    }
+
+    // Filter based on temporal extent
+    val temporalFiltered = temporalFilter match {
+      case Some(filter) =>
+        val extentNode = rootNode.path("extent").path("temporal").path("interval")
+        if (extentNode.isMissingNode) {
+          // if the temporal extent is missing, the predicate cannot be evaluated,
+          // so conservatively keep the collection (do not filter it out)
+          false
+        } else {
+          // parse the temporal intervals
+          val formatter = new DateTimeFormatterBuilder()
+            .appendPattern("yyyy-MM-dd'T'HH:mm:ss")
+            .optionalStart()
+            .appendFraction(ChronoField.MILLI_OF_SECOND, 0, 3, true)
+            .optionalEnd()
+            .appendPattern("'Z'")
+            .toFormatter()
+
+          val intervals = extentNode
+            .elements()
+            .asScala
+            .map { intervalNode =>
+              val start = LocalDateTime.parse(intervalNode.get(0).asText(), formatter)
+              val end = LocalDateTime.parse(intervalNode.get(1).asText(), formatter)
+              (start, end)
+            }
+            .toList
+
+          // check if the filter evaluates to true for any of the interval start or end times
+          !intervals.exists { case (start, end) =>
+            filter.evaluate(Map("datetime" -> start)) ||
+            filter.evaluate(Map("datetime" -> end))
+          }
+        }
+      // if no temporal filter is present, the collection is kept
+      case None => false
+    }
+
+    spatialFiltered || temporalFiltered
+  }
+
+  /**
+   * Creates a partition reader factory for reading data from the STAC data source.
+   *
+   * @return
+   *   A partition reader factory for reading STAC data.
+   */
+  override def createReaderFactory(): PartitionReaderFactory = { (partition: InputPartition) =>
+    {
+      new StacPartitionReader(
+        partition.asInstanceOf[StacPartition],
+        schema,
+        opts,
+        spatialFilter,
+        temporalFilter)
+    }
+  }
+}
diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/stac/StacDataSource.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/stac/StacDataSource.scala
new file mode 100644
index 0000000000..ac64b8393b
--- /dev/null
+++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/stac/StacDataSource.scala
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.sedona_sql.io.stac
+
+import StacUtils.{inferStacSchema, updatePropertiesPromotedSchema}
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.connector.catalog.{Table, TableProvider}
+import org.apache.spark.sql.connector.expressions.Transform
+import org.apache.spark.sql.sedona_sql.UDT.GeometryUDT
+import org.apache.spark.sql.sedona_sql.io.geojson.GeoJSONUtils
+import org.apache.spark.sql.sources.DataSourceRegister
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.util.CaseInsensitiveStringMap
+
+import java.util
+import java.util.concurrent.ConcurrentHashMap
+import scala.jdk.CollectionConverters._
+
+/**
+ * The `StacDataSource` class is responsible for enabling the reading of SpatioTemporal Asset
+ * Catalogs (STAC) as tables in Apache Spark. It allows integrating geospatial metadata from local
+ * or remote STAC catalog sources into Spark for processing.
+ *
+ * This class implements Apache Spark's `TableProvider` interface to define how STAC data sources
+ * are converted into Spark tables, and the `DataSourceRegister` interface to provide a custom
+ * short name for easier data source loading.
+ */
+class StacDataSource() extends TableProvider with DataSourceRegister {
+
+  // Cache to store inferred schemas
+  private val schemaCache = new ConcurrentHashMap[Map[String, String], StructType]()
+
+  /**
+   * Returns the short name of this data source, which can be used in Spark SQL queries for
+   * loading the data source. For example:
+   *
+   * `spark.read.format("stac").load(...)`
+   *
+   * @return
+   *   The string identifier for this data source, "stac".
+   */
+  override def shortName(): String = "stac"
+
+  /**
+   * Infers and returns the schema of the STAC data source. This implementation checks if a local
+   * cache of the processed STAC collection exists. If not, it processes the STAC collection and
+   * saves it as a GeoJson file. The schema is then inferred from this GeoJson file.
+   *
+   * @param opts
+   *   Mapping of data source options, which should include either 'path' or 'service'.
+   * @return
+   *   The inferred schema of the STAC data source table.
+   * @throws IllegalArgumentException
+   *   If neither 'path' nor 'service' is provided.
+   */
+  override def inferSchema(opts: CaseInsensitiveStringMap): StructType = {
+    val optsMap = opts.asCaseSensitiveMap().asScala.toMap
+
+    // Check if the schema is already cached
+    val fullSchema = schemaCache.computeIfAbsent(optsMap, _ => inferStacSchema(optsMap))
+    val updatedGeometrySchema = GeoJSONUtils.updateGeometrySchema(fullSchema, GeometryUDT)
+    updatePropertiesPromotedSchema(updatedGeometrySchema)
+  }
+
+  /**
+   * Provides a table implementation for the STAC data source based on the input schema and
+   * configuration properties. This method supports loading STAC catalogs either from a local file
+   * system or from a remote HTTP/HTTPS endpoint.
+   *
+   * @param schema
+   *   The schema of the table, ignored as the schema is pre-defined.
+   * @param partitioning
+   *   Unused, but represents potential transformations (partitioning) in Spark.
+   * @param properties
+   *   A map of properties to configure the data source. Must include either "path" for local file
+   *   access or "service" for HTTP access.
+   * @return
+   *   An instance of `StacTable`, wrapping the parsed STAC catalog JSON data.
+   * @throws IllegalArgumentException
+   *   If neither "path" nor "service" is provided.
+   */
+  override def getTable(
+      schema: StructType,
+      partitioning: Array[Transform],
+      properties: util.Map[String, String]): Table = {
+    val opts = new CaseInsensitiveStringMap(properties)
+
+    val optsMap: Map[String, String] = opts.asCaseSensitiveMap().asScala.toMap ++ Map(
+      "sessionLocalTimeZone" -> SparkSession.active.sessionState.conf.sessionLocalTimeZone,
+      "columnNameOfCorruptRecord" -> SparkSession.active.sessionState.conf.columnNameOfCorruptRecord,
+      "defaultParallelism" -> SparkSession.active.sparkContext.defaultParallelism.toString,
+      "maxPartitionItemFiles" -> SparkSession.active.conf
+        .get("spark.wherobots.stac.load.maxPartitionItemFiles", "0"),
+      "numPartitions" -> SparkSession.active.conf
+        .get("spark.wherobots.stac.load.numPartitions", "-1"))
+    val stacCollectionJsonString = StacUtils.loadStacCollectionToJson(optsMap)
+
+    new StacTable(stacCollectionJson = stacCollectionJsonString, opts = optsMap)
+  }
+}
diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/stac/StacPartition.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/stac/StacPartition.scala
new file mode 100644
index 0000000000..5589bd9bad
--- /dev/null
+++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/stac/StacPartition.scala
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.sedona_sql.io.stac
+
+import org.apache.spark.sql.connector.read.InputPartition
+
+case class StacPartition(index: Int, items: Array[String], opts: java.util.Map[String, String])
+  extends InputPartition
diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/stac/StacPartitionReader.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/stac/StacPartitionReader.scala
new file mode 100644
index 0000000000..4929087db5
--- /dev/null
+++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/stac/StacPartitionReader.scala
@@ -0,0 +1,229 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.spark.sql.sedona_sql.io.stac + +import com.fasterxml.jackson.databind.ObjectMapper +import org.apache.hadoop.conf.Configuration +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.json.JSONOptionsInRead +import org.apache.spark.sql.connector.read.PartitionReader +import org.apache.spark.sql.execution.datasource.stac.TemporalFilter +import org.apache.spark.sql.execution.datasources.PartitionedFile +import org.apache.spark.sql.execution.datasources.json.JsonDataSource +import org.apache.spark.sql.execution.datasources.parquet.GeoParquetSpatialFilter +import org.apache.spark.sql.sedona_sql.io.geojson.{GeoJSONUtils, SparkCompatUtil} +import org.apache.spark.sql.sedona_sql.io.stac.StacUtils.{buildOutDbRasterFields, promotePropertiesToTop} +import org.apache.spark.sql.types.{StringType, StructType} + +import java.io.{File, PrintWriter} +import java.lang.reflect.Constructor +import scala.io.Source + +class StacPartitionReader( + partition: StacPartition, + schema: StructType, + opts: Map[String, String], + spatialFilter: Option[GeoParquetSpatialFilter], + temporalFilter: Option[TemporalFilter]) + extends PartitionReader[InternalRow] { + + private val itemsIterator = partition.items.iterator + private var currentItem: String = _ + private var currentFile: File = _ + private var featureIterator: Iterator[InternalRow] = Iterator.empty + private val mapper = new ObjectMapper() + + override def next(): Boolean = { + if (featureIterator.hasNext) { + true + } else if (itemsIterator.hasNext) { + currentItem = itemsIterator.next() + if (currentItem.startsWith("http://") || currentItem.startsWith("https://") || currentItem + .startsWith("file://")) { + val url = new java.net.URL(currentItem) + + // Download the file to a local temp file + val tempFile = File.createTempFile("stac_item_", ".json") + val writer = new PrintWriter(tempFile) + try { + val fileContent = Source.fromURL(url).mkString + val rootNode = mapper.readTree(fileContent) + val nodeType = rootNode.get("type").asText() + + nodeType match { + case "Feature" => + // Write the content as a single line JSON + val content = mapper.writeValueAsString(rootNode) + writer.write(content) + case "FeatureCollection" => + // Write each feature in the features array to a multi-line JSON file + val features = rootNode.get("features") + val featureIterator = features.elements() + while (featureIterator.hasNext) { + val feature = featureIterator.next() + val content = mapper.writeValueAsString(feature) + writer.write(content) + writer.write("\n") + } + case _ => + throw new IllegalArgumentException(s"Unsupported type for item: $nodeType") + } + + } finally { + writer.close() + } + checkAndDeleteTempFile(currentFile) + currentFile = tempFile + } else { + throw new IllegalArgumentException(s"Unsupported protocol for item: $currentItem") + } + + // Parse the current file and extract features + featureIterator = if (currentFile.exists()) { + + val parsedOptions = new JSONOptionsInRead( + opts, + opts.getOrElse("sessionLocalTimeZone", "UTC"), + opts.getOrElse("columnNameOfCorruptRecord", 
"_corrupt_record")) + val dataSource = JsonDataSource(parsedOptions) + + val alteredSchema = GeoJSONUtils.updateGeometrySchema(schema, StringType) + + val parser = SparkCompatUtil.constructJacksonParser( + alteredSchema, + parsedOptions, + allowArrayAsStructs = true) + + val rows = SparkCompatUtil + .readFile( + dataSource, + new Configuration(), + createPartitionedFile(currentFile), + parser, + schema) + + rows.map(row => { + val geometryConvertedRow = GeoJSONUtils.convertGeoJsonToGeometry(row, alteredSchema) + val rasterAddedRow = buildOutDbRasterFields(geometryConvertedRow, alteredSchema) + val propertiesPromotedRow = promotePropertiesToTop(rasterAddedRow, alteredSchema) + propertiesPromotedRow + }) + } else { + Iterator.empty + } + + next() + } else { + false + } + } + + override def get(): InternalRow = { + featureIterator.next() + } + + override def close(): Unit = { + checkAndDeleteTempFile(currentFile) + } + + private def checkAndDeleteTempFile(file: File): Unit = { + // Delete the local file if it was downloaded to tmp + if (file != null && file.exists() && file.getAbsolutePath.startsWith( + System.getProperty("java.io.tmpdir"))) { + file.delete() + } + } + + /** + * Create a PartitionedFile instance using reflection. The constructor parameters differ between + * these versions, so we need to handle both cases. For Spark 3.4 and below, the constructor has + * 7 parameters, while for Spark 3.5 and above, it has 8 parameters. Additionally, the type of + * the second parameter may be `SparkPath` in some cases, which requires special handling. + * + * @param currentFile + * The file to create the PartitionedFile for. + * @return + * The created PartitionedFile instance. + * @throws NoSuchMethodException + * If no suitable constructor is found. + */ + def createPartitionedFile(currentFile: File): PartitionedFile = { + val partitionedFileClass = + Class.forName("org.apache.spark.sql.execution.datasources.PartitionedFile") + val constructors = partitionedFileClass.getConstructors + val constructor = constructors + .find(_.getParameterCount == 7) + .getOrElse( + constructors + .find(_.getParameterCount == 8) + .getOrElse( + throw new NoSuchMethodException("No constructor with 7 or 8 parameters found"))) + + val params = if (constructor.getParameterCount == 7) { + val secondParamType = constructor.getParameterTypes()(1) + if (secondParamType.getName == "org.apache.spark.paths.SparkPath") { + Array( + null, + createSparkPath(currentFile.getPath), + java.lang.Long.valueOf(0L), + java.lang.Long.valueOf(currentFile.length()), + Array.empty[String], + java.lang.Long.valueOf(0L), + java.lang.Long.valueOf(0L)) + } else { + Array( + null, + currentFile.getPath, + java.lang.Long.valueOf(0L), + java.lang.Long.valueOf(currentFile.length()), + Array.empty[String], + java.lang.Long.valueOf(0L), + java.lang.Long.valueOf(0L)) + } + } else { + Array( + null, + createSparkPath(currentFile.getPath), + java.lang.Long.valueOf(0L), + java.lang.Long.valueOf(currentFile.length()), + Array.empty[String], + java.lang.Long.valueOf(0L), + java.lang.Long.valueOf(0L), + null) + } + + constructor.newInstance(params: _*).asInstanceOf[PartitionedFile] + } + + /** + * Create a SparkPath instance using reflection. This is needed to support both Spark 3.3 and + * below and Spark 3.4 and above. + * + * @param pathString + * The path to create the SparkPath for. + * @return + * The created SparkPath instance. 
+   */
+  def createSparkPath(pathString: String): Object = {
+    val sparkPathClass = Class.forName("org.apache.spark.paths.SparkPath")
+    val constructor: Constructor[_] = sparkPathClass.getDeclaredConstructor(classOf[String])
+    constructor.setAccessible(true) // Make the private constructor accessible
+    constructor.newInstance(pathString).asInstanceOf[Object]
+  }
+}
diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/stac/StacScan.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/stac/StacScan.scala
new file mode 100644
index 0000000000..2edf082912
--- /dev/null
+++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/stac/StacScan.scala
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.sedona_sql.io.stac
+
+import org.apache.spark.sql.connector.read.{Batch, Scan}
+import org.apache.spark.sql.execution.datasource.stac.TemporalFilter
+import org.apache.spark.sql.execution.datasources.parquet.GeoParquetSpatialFilter
+import org.apache.spark.sql.internal.connector.SupportsMetadata
+import org.apache.spark.sql.sedona_sql.io.stac.StacUtils.{getFullCollectionUrl, inferStacSchema}
+import org.apache.spark.sql.types.StructType
+
+class StacScan(stacCollectionJson: String, opts: Map[String, String])
+    extends Scan
+    with SupportsMetadata {
+
+  // The spatial filter to be pushed down to the data source
+  var spatialFilter: Option[GeoParquetSpatialFilter] = None
+
+  // The temporal filter to be pushed down to the data source
+  var temporalFilter: Option[TemporalFilter] = None
+
+  /**
+   * Returns the schema of the data to be read.
+   *
+   * The schema is inferred from the STAC collection referenced by the options.
+   */
+  override def readSchema(): StructType = {
+    val url = opts.get("path")
+    val service = opts.get("service")
+
+    // opts is a Scala Map, so both lookups return Options; check for absence explicitly
+    if (url.isEmpty && service.isEmpty) {
+      throw new IllegalArgumentException("Either 'path' or 'service' must be provided")
+    }
+
+    inferStacSchema(opts)
+  }
+
+  /**
+   * Returns a `Batch` instance for reading the data in batch mode.
+   *
+   * The `StacBatch` class provides the implementation for the batch reading.
+   *
+   * @return
+   *   A `Batch` instance for batch-based data processing.
+   */
+  override def toBatch: Batch = {
+    val stacCollectionUrl = getFullCollectionUrl(opts)
+    StacBatch(
+      stacCollectionUrl,
+      stacCollectionJson,
+      readSchema(),
+      opts,
+      spatialFilter,
+      temporalFilter)
+  }
+
+  /**
+   * Sets the spatial predicates to be pushed down to the data source.
+   *
+   * @param combinedSpatialFilter
+   *   The combined spatial filter to be pushed down.
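+   *
+   *   A sketch of how a predicate reaches this hook: the optimizer, not user code,
+   *   performs the push-down for queries such as (view name and polygon illustrative)
+   *   {{{
+   *   spark.read.format("stac").load("https://example.org/stac/collection.json")
+   *     .createOrReplaceTempView("stac_items")
+   *   spark.sql("SELECT id FROM stac_items WHERE ST_Intersects(geometry, " +
+   *     "ST_GeomFromText('POLYGON((0 0, 10 0, 10 10, 0 10, 0 0))'))")
+   *   }}}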
+ */ + def setSpatialPredicates(combinedSpatialFilter: GeoParquetSpatialFilter) = { + spatialFilter = Some(combinedSpatialFilter) + } + + /** + * Sets the temporal predicates to be pushed down to the data source. + * + * @param combineTemporalFilter + * The combined temporal filter to be pushed down. + */ + def setTemporalPredicates(combineTemporalFilter: TemporalFilter) = { + temporalFilter = Some(combineTemporalFilter) + } + + /** + * Returns metadata about the data to be read. + * + * The metadata includes information about the pushed filters. + * + * @return + * A map of metadata key-value pairs. + */ + override def getMetaData(): Map[String, String] = { + Map( + "PushedSpatialFilters" -> spatialFilter.map(_.toString).getOrElse("None"), + "PushedTemporalFilters" -> temporalFilter.map(_.toString).getOrElse("None")) + } + + /** + * Returns a description of the data to be read. + * + * The description includes the metadata information. + * + * @return + * A string description of the data to be read. + */ + override def description(): String = { + super.description() + " " + getMetaData().mkString(", ") + } +} diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/stac/StacScanBuilder.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/stac/StacScanBuilder.scala new file mode 100644 index 0000000000..ebaab87dda --- /dev/null +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/stac/StacScanBuilder.scala @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.spark.sql.sedona_sql.io.stac + +import org.apache.spark.sql.connector.read.{Scan, ScanBuilder} + +/** + * The `StacScanBuilder` class represents the builder for creating a `Scan` instance in the + * SpatioTemporal Asset Catalog (STAC) data source. + * + * This class is responsible for assembling the scan operation for reading STAC data. It acts as a + * bridge between Spark's data source API and the specific implementation of the STAC data read + * operation. + */ +class StacScanBuilder(stacCollectionJson: String, opts: Map[String, String]) extends ScanBuilder { + + /** + * Builds and returns a `Scan` instance. The `Scan` defines the schema and batch reading methods + * for STAC data. + * + * @return + * A `Scan` instance that defines how to read STAC data. 
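+   *
+   * End to end, this builder is exercised by an ordinary DataFrame read (URL
+   * illustrative):
+   * {{{
+   * val items = spark.read.format("stac")
+   *   .load("https://example.org/stac/collection.json")
+   * items.printSchema()
+   * }}}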
+ */ + override def build(): Scan = new StacScan(stacCollectionJson, opts) +} diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/stac/StacTable.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/stac/StacTable.scala new file mode 100644 index 0000000000..bd536f6de6 --- /dev/null +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/stac/StacTable.scala @@ -0,0 +1,261 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.spark.sql.sedona_sql.io.stac + +import org.apache.spark.sql.connector.catalog.{SupportsRead, Table, TableCapability} +import org.apache.spark.sql.connector.read.ScanBuilder +import org.apache.spark.sql.sedona_sql.UDT.GeometryUDT +import org.apache.spark.sql.sedona_sql.io.geojson.GeoJSONUtils +import org.apache.spark.sql.sedona_sql.io.stac.StacUtils.{inferStacSchema, updatePropertiesPromotedSchema} +import org.apache.spark.sql.types._ +import org.apache.spark.sql.util.CaseInsensitiveStringMap + +import java.util.concurrent.ConcurrentHashMap + +/** + * The `StacTable` class represents a table in the SpatioTemporal Asset Catalog (STAC) data + * source. + * + * This class implements the `Table` and `SupportsRead` interfaces to integrate with Apache + * Spark's data source API, providing support for reading data from STAC. + * + * @constructor + * Creates a new instance of the `StacTable` class. + */ +class StacTable(stacCollectionJson: String, opts: Map[String, String]) + extends Table + with SupportsRead { + + // Cache to store inferred schemas + private val schemaCache = new ConcurrentHashMap[Map[String, String], StructType]() + + /** + * Returns the name of the table. + * + * @return + * The name of the table as a string. + */ + override def name(): String = "stac" + + /** + * Defines the schema of the STAC table. + * + * @return + * The schema as a StructType. + */ + override def schema(): StructType = { + // Check if the schema is already cached + val fullSchema = schemaCache.computeIfAbsent(opts, _ => inferStacSchema(opts)) + val updatedGeometrySchema = GeoJSONUtils.updateGeometrySchema(fullSchema, GeometryUDT) + updatePropertiesPromotedSchema(updatedGeometrySchema) + } + + /** + * Indicates the capabilities supported by the STAC table, specifically batch read. + * + * @return + * A set of table capabilities. + */ + override def capabilities(): java.util.Set[TableCapability] = + java.util.EnumSet.of(TableCapability.BATCH_READ) + + /** + * Creates a new scan builder for reading data from the STAC table. + * + * @param options + * The configuration options for the scan. + * @return + * A new instance of ScanBuilder. 
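+   *
+   * Note that the scan is built from the options captured when the table was created
+   * (`opts`); the per-scan `options` argument passed here is not consulted.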
+ */ + override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = + new StacScanBuilder(stacCollectionJson, opts) +} + +object StacTable { + + /** + * Defines the schema of the STAC table, which supports various fields including collection + * information, asset details, geometries, and more. The schema is based on the STAC + * specification version 1.1.0. + */ + val SCHEMA_V1_1_0: StructType = StructType( + Seq( + StructField("stac_version", StringType, nullable = false), + StructField("stac_extensions", ArrayType(StringType), nullable = true), + StructField("type", StringType, nullable = false), + StructField("id", StringType, nullable = false), + StructField("bbox", ArrayType(DoubleType), nullable = true), + StructField( + "geometry", + StructType( + Seq( + StructField("type", StringType, nullable = true), + StructField("coordinates", ArrayType(ArrayType(DoubleType)), nullable = true))), + nullable = true), + StructField( + "properties", + StructType(Seq( + StructField("title", StringType, nullable = true), + StructField("description", StringType, nullable = true), + StructField("datetime", TimestampType, nullable = true), + StructField("start_datetime", TimestampType, nullable = true), + StructField("end_datetime", TimestampType, nullable = true), + StructField("created", TimestampType, nullable = true), + StructField("updated", TimestampType, nullable = true), + StructField("platform", StringType, nullable = true), + StructField("instruments", ArrayType(StringType), nullable = true), + StructField("constellation", StringType, nullable = true), + StructField("mission", StringType, nullable = true), + StructField("gsd", DoubleType, nullable = true))), + nullable = false), + StructField("collection", StringType, nullable = true), + StructField( + "links", + ArrayType(StructType(Seq( + StructField("rel", StringType, nullable = true), + StructField("href", StringType, nullable = true), + StructField("type", StringType, nullable = true), + StructField("title", StringType, nullable = true)))), + nullable = false), + StructField( + "assets", + MapType( + StringType, + StructType(Seq( + StructField("href", StringType, nullable = true), + StructField("type", StringType, nullable = true), + StructField("title", StringType, nullable = true), + StructField("roles", ArrayType(StringType), nullable = true)))), + nullable = false))) + + /** + * Defines the schema of the STAC table, which supports various fields including collection + * information, asset details, geometries, and more. The schema is based on the STAC + * specification version 1.0.0. 
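+   *
+   * Unlike `SCHEMA_V1_1_0`, the `properties`, `links`, and `assets` fields are declared
+   * nullable here, which is the main structural difference between the two schema versions.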
+ */ + val SCHEMA_V1_0_0: StructType = StructType( + Seq( + StructField("stac_version", StringType, nullable = false), + StructField("stac_extensions", ArrayType(StringType), nullable = true), + StructField("type", StringType, nullable = false), + StructField("id", StringType, nullable = false), + StructField("bbox", ArrayType(DoubleType), nullable = true), + StructField( + "geometry", + StructType( + Seq( + StructField("type", StringType, nullable = true), + StructField("coordinates", ArrayType(ArrayType(DoubleType)), nullable = true))), + nullable = true), + StructField( + "properties", + StructType(Seq( + StructField("title", StringType, nullable = true), + StructField("description", StringType, nullable = true), + StructField("datetime", TimestampType, nullable = true), + StructField("start_datetime", TimestampType, nullable = true), + StructField("end_datetime", TimestampType, nullable = true), + StructField("created", TimestampType, nullable = true), + StructField("updated", TimestampType, nullable = true), + StructField("platform", StringType, nullable = true), + StructField("instruments", ArrayType(StringType), nullable = true), + StructField("constellation", StringType, nullable = true), + StructField("mission", StringType, nullable = true), + StructField("gsd", DoubleType, nullable = true))), + nullable = true), + StructField("collection", StringType, nullable = true), + StructField( + "links", + ArrayType(StructType(Seq( + StructField("rel", StringType, nullable = true), + StructField("href", StringType, nullable = true), + StructField("type", StringType, nullable = true), + StructField("title", StringType, nullable = true)))), + nullable = true), + StructField( + "assets", + MapType( + StringType, + StructType(Seq( + StructField("href", StringType, nullable = true), + StructField("type", StringType, nullable = true), + StructField("title", StringType, nullable = true), + StructField("roles", ArrayType(StringType), nullable = true)))), + nullable = true))) + + val SCHEMA_GEOPARQUET: StructType = StructType( + Seq( + StructField("stac_version", StringType, nullable = false), + StructField("stac_extensions", ArrayType(StringType), nullable = true), + StructField("type", StringType, nullable = false), + StructField("id", StringType, nullable = false), + StructField("bbox", ArrayType(DoubleType), nullable = true), + StructField( + "geometry", + StructType( + Seq( + StructField("type", StringType, nullable = true), + StructField("coordinates", ArrayType(ArrayType(DoubleType)), nullable = true))), + nullable = true), + StructField("datetime", TimestampType, nullable = true), + StructField("collection", StringType, nullable = true), + StructField( + "links", + ArrayType(StructType(Seq( + StructField("rel", StringType, nullable = true), + StructField("href", StringType, nullable = true), + StructField("type", StringType, nullable = true), + StructField("title", StringType, nullable = true)))), + nullable = false))) + + def addAssetStruct(schema: StructType, name: String): StructType = { + val assetStruct = StructType( + Seq( + StructField("href", StringType, nullable = true), + StructField("roles", ArrayType(StringType), nullable = true), + StructField("title", StringType, nullable = true), + StructField("type", StringType, nullable = true))) + + val updatedFields = schema.fields.map { + case StructField("assets", existingStruct: StructType, nullable, metadata) => + StructField( + "assets", + StructType(existingStruct.fields :+ StructField(name, assetStruct, nullable = true)), + 
nullable, + metadata) + case other => other + } + + if (!schema.fieldNames.contains("assets")) { + StructType( + updatedFields :+ StructField( + "assets", + StructType(Seq(StructField(name, assetStruct, nullable = true))), + nullable = true)) + } else { + StructType(updatedFields) + } + } + + def addAssetsStruct(schema: StructType, names: Array[String]): StructType = { + names.foldLeft(schema) { (currentSchema, name) => + addAssetStruct(currentSchema, name) + } + } +} diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/stac/StacUtils.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/stac/StacUtils.scala new file mode 100644 index 0000000000..4e148422bf --- /dev/null +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/stac/StacUtils.scala @@ -0,0 +1,225 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.spark.sql.sedona_sql.io.stac + +import com.fasterxml.jackson.databind.ObjectMapper +import com.fasterxml.jackson.module.scala.DefaultScalaModule +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.util.ArrayBasedMapData +import org.apache.spark.sql.types.{MapType, StringType, StructField, StructType} + +import scala.io.Source + +object StacUtils { + + // Function to load JSON from URL or service + def loadStacCollectionToJson(opts: Map[String, String]): String = { + val urlFull: String = getFullCollectionUrl(opts) + + loadStacCollectionToJson(urlFull) + } + + def getFullCollectionUrl(opts: Map[String, String]) = { + val url = opts.getOrElse( + "path", + opts.getOrElse( + "service", + throw new IllegalArgumentException("Either 'path' or 'service' must be provided"))) + val urlFinal = if (url.matches("^[a-zA-Z][a-zA-Z0-9+.-]*://.*")) url else s"file://$url" + urlFinal + } + + // Function to load JSON from URL or service + def loadStacCollectionToJson(url: String): String = { + if (url.startsWith("s3://") || url.startsWith("s3a://")) { + SparkSession.active.read.textFile(url).collect().mkString("\n") + } else { + Source.fromURL(url).mkString + } + } + + // Function to get the base URL from the collection URL or service + def getStacCollectionBasePath(opts: Map[String, String]): String = { + val ref = opts.getOrElse( + "path", + opts.getOrElse( + "service", + throw new IllegalArgumentException("Either 'path' or 'service' must be provided"))) + getStacCollectionBasePath(ref) + } + + // Function to get the base URL from the collection URL or service + def getStacCollectionBasePath(collectionUrl: String): String = { + val urlPattern = "(https?://[^/]+/|http://[^/]+/).*".r + val filePattern = "(file:///.*/|/.*/).*".r + + collectionUrl match { + case urlPattern(baseUrl) => baseUrl + 
case filePattern(basePath) =>
+        if (basePath.startsWith("file://")) basePath else s"file://$basePath"
+      case _ => throw new IllegalArgumentException(s"Invalid URL or file path: $collectionUrl")
+    }
+  }
+
+  /**
+   * Infer the schema of the STAC data source table.
+   *
+   * The collection JSON is loaded from the configured location, its `stac_version` field is
+   * inspected, and the predefined schema matching that version is returned.
+   *
+   * @param opts
+   *   Mapping of data source options, which should include either 'path' or 'service'.
+   * @return
+   *   The inferred schema of the STAC data source table.
+   * @throws IllegalArgumentException
+   *   If neither 'path' nor 'service' is provided, or if the STAC version is unsupported.
+   */
+  def inferStacSchema(opts: Map[String, String]): StructType = {
+    val stacCollectionJsonString = loadStacCollectionToJson(opts)
+
+    // Create the ObjectMapper
+    val mapper = new ObjectMapper()
+    mapper.registerModule(DefaultScalaModule)
+
+    // Parse the STAC collection JSON
+    val collection = mapper.readTree(stacCollectionJsonString)
+
+    // Extract the stac_version
+    val stacVersion = collection.get("stac_version").asText()
+
+    // Return the corresponding schema based on the stac_version
+    stacVersion match {
+      case "1.0.0" => StacTable.SCHEMA_V1_0_0
+      case version if version.matches("1\\.[1-9]\\d*\\.\\d*") => StacTable.SCHEMA_V1_1_0
+      // Add more cases here for other versions if needed
+      case _ => throw new IllegalArgumentException(s"Unsupported STAC version: $stacVersion")
+    }
+  }
+
+  /**
+   * Promote the properties field to the top level of the row.
+   */
+  def promotePropertiesToTop(row: InternalRow, schema: StructType): InternalRow = {
+    val propertiesIndex = schema.fieldIndex("properties")
+    val propertiesStruct = schema("properties").dataType.asInstanceOf[StructType]
+    val propertiesRow = row.getStruct(propertiesIndex, propertiesStruct.fields.length)
+
+    val newValues = schema.fields.zipWithIndex.foldLeft(Seq.empty[Any]) {
+      case (acc, (field, index)) if field.name == "properties" =>
+        acc ++ propertiesStruct.fields.zipWithIndex.map { case (propField, propIndex) =>
+          propertiesRow.get(propIndex, propField.dataType)
+        }
+      case (acc, (_, index)) =>
+        acc :+ row.get(index, schema(index).dataType)
+    }
+
+    InternalRow.fromSeq(newValues)
+  }
+
+  def updatePropertiesPromotedSchema(schema: StructType): StructType = {
+    val propertiesIndex = schema.fieldIndex("properties")
+    val propertiesStruct = schema("properties").dataType.asInstanceOf[StructType]
+
+    val newFields = schema.fields.foldLeft(Seq.empty[StructField]) {
+      case (acc, StructField("properties", _, _, _)) =>
+        acc ++ propertiesStruct.fields
+      case (acc, other) =>
+        acc :+ other
+    }
+
+    StructType(newFields)
+  }
+
+  /**
+   * Builds the output row with the raster field in the assets map.
+   *
+   * @param row
+   *   The input row.
+   * @param schema
+   *   The schema of the input row.
+   * @return
+   *   The output row with the raster field in the assets map.
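+   *
+   * The assets map is rebuilt entry by entry so the row can be reconstructed with the same
+   * shape; the entries themselves are passed through unchanged (null assets stay null).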
+   */
+  def buildOutDbRasterFields(row: InternalRow, schema: StructType): InternalRow = {
+    val newValues = new Array[Any](schema.fields.length)
+
+    schema.fields.zipWithIndex.foreach {
+      case (StructField("assets", MapType(StringType, valueType: StructType, _), _, _), index) =>
+        val assetsMap = row.getMap(index)
+        if (assetsMap != null) {
+          // rebuild the assets map entry by entry; a null asset struct stays null
+          val updatedAssets = assetsMap
+            .keyArray()
+            .array
+            .zip(assetsMap.valueArray().array)
+            .map { case (key, value) => key -> value.asInstanceOf[InternalRow] }
+            .toMap
+          newValues(index) = ArrayBasedMapData(updatedAssets)
+        } else {
+          newValues(index) = null
+        }
+      case (_, index) =>
+        newValues(index) = row.get(index, schema.fields(index).dataType)
+    }
+
+    InternalRow.fromSeq(newValues)
+  }
+
+  /**
+   * Returns the number of partitions to use for reading the data.
+   *
+   * The number of partitions is determined based on the number of items, the number of
+   * partitions requested, the maximum number of item files per partition, and the default
+   * parallelism.
+   *
+   * @param itemCount
+   *   The number of items in the collection.
+   * @param numPartitions
+   *   The number of partitions requested.
+   * @param maxPartitionItemFiles
+   *   The maximum number of item files per partition.
+   * @param defaultParallelism
+   *   The default parallelism.
+   * @return
+   *   The number of partitions to use for reading the data.
+   */
+  def getNumPartitions(
+      itemCount: Int,
+      numPartitions: Int,
+      maxPartitionItemFiles: Int,
+      defaultParallelism: Int): Int = {
+    if (numPartitions > 0) {
+      numPartitions
+    } else {
+      // e.g. itemCount = 100, defaultParallelism = 8, maxPartitionItemFiles = 5:
+      // maxSplitFiles = min(5, ceil(100 / 8) = 13) = 5, so ceil(100 / 5) = 20 partitions
+      val maxSplitFiles = if (maxPartitionItemFiles > 0) {
+        Math.min(maxPartitionItemFiles, Math.ceil(itemCount.toDouble / defaultParallelism).toInt)
+      } else {
+        Math.ceil(itemCount.toDouble / defaultParallelism).toInt
+      }
+      Math.max(1, Math.ceil(itemCount.toDouble / maxSplitFiles).toInt)
+    }
+  }
+}
diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/optimization/ExtractGeoStatsFunctions.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/optimization/ExtractGeoStatsFunctions.scala
new file mode 100644
index 0000000000..6b4cf9ccea
--- /dev/null
+++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/optimization/ExtractGeoStatsFunctions.scala
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.sedona_sql.optimization
+
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.plans.logical._
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.sedona_sql.expressions.ST_GeoStatsFunction
+import org.apache.spark.sql.sedona_sql.plans.logical.EvalGeoStatsFunction
+
+import scala.collection.mutable
+
+/**
+ * Extracts GeoStats functions from operators, rewriting the query plan so that each geo-stats
+ * function can be evaluated in its own physical operator.
+ */
+object ExtractGeoStatsFunctions extends Rule[LogicalPlan] {
+  var geoStatsResultCount = 0
+
+  private def collectGeoStatsFunctionsFromExpressions(
+      expressions: Seq[Expression]): Seq[ST_GeoStatsFunction] = {
+    def collectGeoStatsFunctions(expr: Expression): Seq[ST_GeoStatsFunction] = expr match {
+      case expr: ST_GeoStatsFunction => Seq(expr)
+      case e => e.children.flatMap(collectGeoStatsFunctions)
+    }
+    expressions.flatMap(collectGeoStatsFunctions)
+  }
+
+  def apply(plan: LogicalPlan): LogicalPlan = plan match {
+    // SPARK-26293: A subquery will be rewritten into join later, and will go through this rule
+    // eventually. Here we skip subquery, as geo-stats functions only need to be extracted once.
+    case s: Subquery if s.correlated => plan
+    case _ =>
+      plan.transformUp {
+        case p: EvalGeoStatsFunction => p
+        case plan: LogicalPlan => extract(plan)
+      }
+  }
+
+  private def canonicalizeDeterministic(u: ST_GeoStatsFunction) = {
+    if (u.deterministic) {
+      u.canonicalized.asInstanceOf[ST_GeoStatsFunction]
+    } else {
+      u
+    }
+  }
+
+  /**
+   * Extract all the geo-stats functions from the current operator and evaluate them before the
+   * operator.
+   */
+  private def extract(plan: LogicalPlan): LogicalPlan = {
+    val geoStatsFuncs = plan match {
+      case e: EvalGeoStatsFunction =>
+        collectGeoStatsFunctionsFromExpressions(e.function.children)
+      case _ =>
+        ExpressionSet(collectGeoStatsFunctionsFromExpressions(plan.expressions))
+          // ignore ST_GeoStatsFunctions that come from the second/third aggregate, which are not used
+          .filter(func => func.references.subsetOf(plan.inputSet))
+          .filter(func =>
+            plan.children.exists(child => func.references.subsetOf(child.outputSet)))
+          .toSeq
+          .asInstanceOf[Seq[ST_GeoStatsFunction]]
+    }
+
+    if (geoStatsFuncs.isEmpty) {
+      // If there aren't any, we are done.
+      plan
+    } else {
+      // Transform the first geo-stats function we have found. We'll call extract recursively later
+      // to transform the rest.
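+      // As an illustrative sketch (attribute names are hypothetical), a plan like
+      //   Project [someGeoStatsFunc(a, b) AS result]
+      //     +- child
+      // is rewritten into
+      //   Project [geoStatsResult1 AS result]
+      //     +- EvalGeoStatsFunction(someGeoStatsFunc(a, b), [geoStatsResult1], child)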
+      val geoStatsFunc = geoStatsFuncs.head
+
+      val attributeMap = mutable.HashMap[ST_GeoStatsFunction, Expression]()
+      // Rewrite the child that has the input required for the geo-stats function
+      val newChildren = plan.children.map { child =>
+        if (geoStatsFunc.references.subsetOf(child.outputSet)) {
+          geoStatsResultCount += 1
+          val resultAttr =
+            AttributeReference(f"geoStatsResult$geoStatsResultCount", geoStatsFunc.dataType)()
+          val evaluation = EvalGeoStatsFunction(geoStatsFunc, Seq(resultAttr), child)
+          attributeMap += (canonicalizeDeterministic(geoStatsFunc) -> resultAttr)
+          extract(evaluation) // handle nested geo-stats functions
+        } else {
+          child
+        }
+      }
+
+      // Replace the geo-stats function call with the newly created geoStatsResult attribute
+      val rewritten = plan.withNewChildren(newChildren).transformExpressions {
+        case p: ST_GeoStatsFunction => attributeMap.getOrElse(canonicalizeDeterministic(p), p)
+      }
+
+      // extract remaining geo-stats functions recursively
+      val newPlan = extract(rewritten)
+      if (newPlan.output != plan.output) {
+        // Trim away the new result attribute if it was only used for filtering.
+        Project(plan.output, newPlan)
+      } else {
+        newPlan
+      }
+    }
+  }
+}
diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/optimization/SpatialFilterPushDownForGeoParquet.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/optimization/SpatialFilterPushDownForGeoParquet.scala
index ba0ecf8a40..7ef96ac970 100644
--- a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/optimization/SpatialFilterPushDownForGeoParquet.scala
+++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/optimization/SpatialFilterPushDownForGeoParquet.scala
@@ -92,7 +92,7 @@ class SpatialFilterPushDownForGeoParquet(sparkSession: SparkSession) extends Rul
     lr.relation.isInstanceOf[HadoopFsRelation] &&
       lr.relation.asInstanceOf[HadoopFsRelation].fileFormat.isInstanceOf[GeoParquetFileFormatBase]
 
-  private def translateToGeoParquetSpatialFilters(
+  def translateToGeoParquetSpatialFilters(
       predicates: Seq[Expression]): Seq[GeoParquetSpatialFilter] = {
     val pushableColumn = PushableColumn(nestedPredicatePushdownEnabled = false)
     predicates.flatMap { predicate =>
diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/optimization/SpatialTemporalFilterPushDownForStacScan.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/optimization/SpatialTemporalFilterPushDownForStacScan.scala
new file mode 100644
index 0000000000..566d368d69
--- /dev/null
+++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/optimization/SpatialTemporalFilterPushDownForStacScan.scala
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.sedona_sql.optimization
+
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.expressions.{And, EqualTo, Expression, GreaterThan, GreaterThanOrEqual, LessThan, LessThanOrEqual, Literal, Or, SubqueryExpression}
+import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan}
+import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.parseColumnPath
+import org.apache.spark.sql.execution.datasource.stac.TemporalFilter
+import org.apache.spark.sql.execution.datasource.stac.TemporalFilter.{AndFilter => TemporalAndFilter}
+import org.apache.spark.sql.execution.datasources.parquet.GeoParquetSpatialFilter.{AndFilter => SpatialAndFilter}
+import org.apache.spark.sql.execution.datasources.v2.DataSourceV2ScanRelation
+import org.apache.spark.sql.execution.datasources.{DataSourceStrategy, PushableColumn, PushableColumnBase}
+import org.apache.spark.sql.sedona_sql.io.stac.StacScan
+import org.apache.spark.sql.sedona_sql.optimization.ExpressionUtils.splitConjunctivePredicates
+import org.apache.spark.sql.types.TimestampType
+
+import java.time.{Instant, LocalDateTime, ZoneOffset}
+
+/*
+ * This class is responsible for pushing down spatial and temporal filters to the STAC data
+ * source. It extends and reuses the `SpatialFilterPushDownForGeoParquet` class, which is
+ * responsible for pushing down spatial filters for GeoParquet relations.
+ */
+class SpatialTemporalFilterPushDownForStacScan(sparkSession: SparkSession)
+    extends SpatialFilterPushDownForGeoParquet(sparkSession) {
+
+  /**
+   * Pushes down spatial and temporal filters to the STAC data source.
+   *
+   * @param plan
+   *   The logical plan to optimize.
+   * @return
+   *   The optimized logical plan with the filters pushed down to the STAC data source.
+   */
+  override def apply(plan: LogicalPlan): LogicalPlan = {
+    val enableSpatialFilterPushDown =
+      sparkSession.conf.get("spark.sedona.stac.spatialFilterPushDown", "true").toBoolean
+    if (!enableSpatialFilterPushDown) plan
+    else {
+      plan transform {
+        case filter @ Filter(condition, lr: DataSourceV2ScanRelation) if isStacScanRelation(lr) =>
+          val filters = splitConjunctivePredicates(condition)
+          val normalizedFilters = DataSourceStrategy.normalizeExprs(filters, lr.output)
+          val (_, normalizedFiltersWithoutSubquery) =
+            normalizedFilters.partition(SubqueryExpression.hasSubquery)
+          // reuse the `translateToGeoParquetSpatialFilters` method from the `SpatialFilterPushDownForGeoParquet` class
+          val spatialFilters =
+            translateToGeoParquetSpatialFilters(normalizedFiltersWithoutSubquery)
+          if (spatialFilters.nonEmpty) {
+            val combinedSpatialFilter = spatialFilters.reduce(SpatialAndFilter)
+            val scan = lr.scan.asInstanceOf[StacScan]
+            // set the spatial predicates in the STAC scan
+            scan.setSpatialPredicates(combinedSpatialFilter)
+          }
+          val temporalFilters =
+            translateToTemporalFilters(normalizedFiltersWithoutSubquery)
+          if (temporalFilters.nonEmpty) {
+            val combinedTemporalFilter = temporalFilters.reduce(TemporalAndFilter)
+            val scan = lr.scan.asInstanceOf[StacScan]
+            // set the temporal predicates in the STAC scan
+            scan.setTemporalPredicates(combinedTemporalFilter)
+          }
+          filter.copy()
+      }
+    }
+  }
+
+  private def isStacScanRelation(lr: DataSourceV2ScanRelation): Boolean =
+    lr.scan.isInstanceOf[StacScan]
+
+  def translateToTemporalFilters(predicates: Seq[Expression]): Seq[TemporalFilter] = {
+    val pushableColumn = PushableColumn(nestedPredicatePushdownEnabled = true)
+    predicates.flatMap { predicate =>
+      translateToTemporalFilter(predicate, pushableColumn)
+    }
+  }
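+
+  // Illustrative example: with the translation below, a predicate such as
+  //   datetime >= TIMESTAMP '2025-01-01 00:00:00' AND datetime < TIMESTAMP '2025-02-01 00:00:00'
+  // becomes
+  //   AndFilter(GreaterThanFilter("datetime", 2025-01-01T00:00),
+  //             LessThanFilter("datetime", 2025-02-01T00:00))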
+
+  private def translateToTemporalFilter(
+      predicate: Expression,
+      pushableColumn: PushableColumnBase): Option[TemporalFilter] = {
+    // Spark stores timestamp literals as microseconds since the epoch; convert to UTC LocalDateTime.
+    def toLocalDateTime(v: Any): LocalDateTime =
+      LocalDateTime.ofInstant(Instant.ofEpochMilli(v.asInstanceOf[Long] / 1000), ZoneOffset.UTC)
+
+    predicate match {
+      case And(left, right) =>
+        val temporalFilterLeft = translateToTemporalFilter(left, pushableColumn)
+        val temporalFilterRight = translateToTemporalFilter(right, pushableColumn)
+        (temporalFilterLeft, temporalFilterRight) match {
+          case (Some(l), Some(r)) => Some(TemporalFilter.AndFilter(l, r))
+          case (Some(l), None) => Some(l)
+          case (None, Some(r)) => Some(r)
+          case _ => None
+        }
+
+      case Or(left, right) =>
+        for {
+          temporalFilterLeft <- translateToTemporalFilter(left, pushableColumn)
+          temporalFilterRight <- translateToTemporalFilter(right, pushableColumn)
+        } yield TemporalFilter.OrFilter(temporalFilterLeft, temporalFilterRight)
+
+      // <= and >= map to the strict less-than/greater-than filters
+      case LessThan(pushableColumn(name), Literal(v, TimestampType)) =>
+        Some(TemporalFilter.LessThanFilter(unquote(name), toLocalDateTime(v)))
+
+      case LessThanOrEqual(pushableColumn(name), Literal(v, TimestampType)) =>
+        Some(TemporalFilter.LessThanFilter(unquote(name), toLocalDateTime(v)))
+
+      case GreaterThan(pushableColumn(name), Literal(v, TimestampType)) =>
+        Some(TemporalFilter.GreaterThanFilter(unquote(name), toLocalDateTime(v)))
+
+      case GreaterThanOrEqual(pushableColumn(name), Literal(v, TimestampType)) =>
+        Some(TemporalFilter.GreaterThanFilter(unquote(name), toLocalDateTime(v)))
+
+      case EqualTo(pushableColumn(name), Literal(v, TimestampType)) =>
+        Some(TemporalFilter.EqualFilter(unquote(name), toLocalDateTime(v)))
+
+      case _ => None
+    }
+  }
+
+  private def unquote(name: String): String = {
+    parseColumnPath(name).mkString(".")
+  }
+}
diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/plans/logical/EvalGeoStatsFunction.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/plans/logical/EvalGeoStatsFunction.scala
new file mode 100644
index 0000000000..8daeb0c304
--- /dev/null
+++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/plans/logical/EvalGeoStatsFunction.scala
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */ +package org.apache.spark.sql.sedona_sql.plans.logical + +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.catalyst.expressions.AttributeSet +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.plans.logical.UnaryNode + +case class EvalGeoStatsFunction( + function: Expression, + resultAttrs: Seq[Attribute], + child: LogicalPlan) + extends UnaryNode { + + override def output: Seq[Attribute] = child.output ++ resultAttrs + + override def producedAttributes: AttributeSet = AttributeSet(resultAttrs) + + override protected def withNewChildInternal(newChild: LogicalPlan): LogicalPlan = + copy(child = newChild) +} diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/strategy/geostats/EvalGeoStatsFunctionExec.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/strategy/geostats/EvalGeoStatsFunctionExec.scala new file mode 100644 index 0000000000..fbecb69ec4 --- /dev/null +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/strategy/geostats/EvalGeoStatsFunctionExec.scala @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.spark.sql.sedona_sql.strategy.geostats + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet} +import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} +import org.apache.spark.sql.sedona_sql.expressions.ST_GeoStatsFunction + +case class EvalGeoStatsFunctionExec( + function: ST_GeoStatsFunction, + child: SparkPlan, + resultAttrs: Seq[Attribute]) + extends UnaryExecNode { + + override protected def doExecute(): RDD[InternalRow] = function.execute(child, resultAttrs) + + override def output: Seq[Attribute] = child.output ++ resultAttrs + + override def producedAttributes: AttributeSet = AttributeSet(resultAttrs) + + override protected def withNewChildInternal(newChild: SparkPlan): SparkPlan = + copy(child = newChild) +} diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/strategy/geostats/EvalGeoStatsFunctionStrategy.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/strategy/geostats/EvalGeoStatsFunctionStrategy.scala new file mode 100644 index 0000000000..4c10b747a6 --- /dev/null +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/strategy/geostats/EvalGeoStatsFunctionStrategy.scala @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.sedona_sql.strategy.geostats
+
+import org.apache.spark.sql.Strategy
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.execution.SparkPlan
+import org.apache.spark.sql.sedona_sql.plans.logical.EvalGeoStatsFunction
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.sedona_sql.expressions.ST_GeoStatsFunction
+
+class EvalGeoStatsFunctionStrategy(spark: SparkSession) extends Strategy {
+
+  override def apply(plan: LogicalPlan): Seq[SparkPlan] = {
+    plan match {
+      case EvalGeoStatsFunction(function: ST_GeoStatsFunction, resultAttrs, child) =>
+        EvalGeoStatsFunctionExec(function, planLater(child), resultAttrs) :: Nil
+      case _ => Nil
+    }
+  }
+}
diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/strategy/join/BroadcastObjectSideKNNJoinExec.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/strategy/join/BroadcastObjectSideKNNJoinExec.scala
index 1b21c79e7c..c5777be3c1 100644
--- a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/strategy/join/BroadcastObjectSideKNNJoinExec.scala
+++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/strategy/join/BroadcastObjectSideKNNJoinExec.scala
@@ -120,7 +120,7 @@ case class BroadcastObjectSideKNNJoinExec(
       sedonaConf: SedonaConf): Unit = {
     require(numPartitions > 0, "The number of partitions must be greater than 0.")
     val kValue: Int = this.k.eval().asInstanceOf[Int]
-    require(kValue > 0, "The number of neighbors must be greater than 0.")
+    require(kValue >= 1, "The number of neighbors (k) must be greater than or equal to 1.")
     objectsShapes.setNeighborSampleNumber(kValue)
     broadcastJoin = true
   }
diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/strategy/join/BroadcastQuerySideKNNJoinExec.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/strategy/join/BroadcastQuerySideKNNJoinExec.scala
index 812bc6e6d6..9ce40c6d42 100644
--- a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/strategy/join/BroadcastQuerySideKNNJoinExec.scala
+++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/strategy/join/BroadcastQuerySideKNNJoinExec.scala
@@ -127,22 +127,13 @@ case class BroadcastQuerySideKNNJoinExec(
       sedonaConf: SedonaConf): Unit = {
     require(numPartitions > 0, "The number of partitions must be greater than 0.")
     val kValue: Int = this.k.eval().asInstanceOf[Int]
-    require(kValue > 0, "The number of neighbors must be greater than 0.")
+    require(kValue >= 1, "The number of neighbors (k) must be greater than or equal to 1.")
     objectsShapes.setNeighborSampleNumber(kValue)
-    val joinPartitions: Integer = numPartitions
-    broadcastJoin = false
-
-    // expand the boundary for partition to include both RDDs
-    objectsShapes.analyze()
-    queryShapes.analyze()
-    objectsShapes.boundaryEnvelope.expandToInclude(queryShapes.boundaryEnvelope)
-
-    objectsShapes.spatialPartitioning(GridType.QUADTREE_RTREE, joinPartitions)
-    queryShapes.spatialPartitioning(
-      objectsShapes.getPartitioner.asInstanceOf[QuadTreeRTPartitioner].nonOverlappedPartitioner())
-
-    objectsShapes.buildIndex(IndexType.RTREE, true)
+    // index the objects on regular partitions (not spatial partitions)
+    // this avoids the cost of spatial partitioning
+    objectsShapes.buildIndex(IndexType.RTREE, false)
+    broadcastJoin = true
   }
 
   /**
diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/strategy/join/JoinQueryDetector.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/strategy/join/JoinQueryDetector.scala
index 825855b88c..b89b1adeda 100644
--- a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/strategy/join/JoinQueryDetector.scala
+++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/strategy/join/JoinQueryDetector.scala
@@ -582,10 +582,21 @@ class JoinQueryDetector(sparkSession: SparkSession) extends Strategy {
       return Nil
     }
 
+    // validate the k value
+    val kValue: Int = distance.eval().asInstanceOf[Int]
+    require(kValue >= 1, "The number of neighbors (k) must be greater than or equal to 1.")
+
     val leftShape = children.head
     val rightShape = children.tail.head
-    val querySide = getKNNQuerySide(left, leftShape)
+    val querySide = matchExpressionsToPlans(leftShape, rightShape, left, right) match {
+      case Some((_, _, false)) =>
+        LeftSide
+      case Some((_, _, true)) =>
+        RightSide
+      case None =>
+        Nil
+    }
     val objectSidePlan = if (querySide == LeftSide) right else left
     checkObjectPlanFilterPushdown(objectSidePlan)
@@ -711,10 +722,21 @@ class JoinQueryDetector(sparkSession: SparkSession) extends Strategy {
 
     if (spatialPredicate == SpatialPredicate.KNN) {
       {
+        // validate the k value for KNN join
+        val kValue: Int = distance.get.eval().asInstanceOf[Int]
+        require(kValue >= 1, "The number of neighbors (k) must be greater than or equal to 1.")
+
         val leftShape = children.head
         val rightShape = children.tail.head
-        val querySide = getKNNQuerySide(left, leftShape)
+        val querySide = matchExpressionsToPlans(leftShape, rightShape, left, right) match {
+          case Some((_, _, false)) =>
+            LeftSide
+          case Some((_, _, true)) =>
+            RightSide
+          case None =>
+            Nil
+        }
         val objectSidePlan = if (querySide == LeftSide) right else left
         checkObjectPlanFilterPushdown(objectSidePlan)
@@ -731,7 +753,7 @@ class JoinQueryDetector(sparkSession: SparkSession) extends Strategy {
             k = distance.get,
             useApproximate = false,
             spatialPredicate,
-            isGeography = false,
+            isGeography,
             condition = null,
             extraCondition = None) :: Nil
         } else {
@@ -746,7 +768,7 @@ class JoinQueryDetector(sparkSession: SparkSession) extends Strategy {
             k = distance.get,
             useApproximate = false,
             spatialPredicate,
-            isGeography = false,
+            isGeography,
             condition = null,
             extraCondition = None) :: Nil
         }
@@ -857,27 +879,6 @@ class JoinQueryDetector(sparkSession: SparkSession) extends Strategy {
     }
   }
 
-  /**
-   * Gets the query and object plans based on the left shape.
-   *
-   * This method checks if the left shape is part of the left or right plan and returns the query
-   * and object plans accordingly.
-   *
-   * @param leftShape
-   *   The left shape expression.
-   * @return
-   *   The join side where the left shape is located.
- */
-  private def getKNNQuerySide(left: LogicalPlan, leftShape: Expression) = {
-    val isLeftQuerySide =
-      left.toString().toLowerCase().contains(leftShape.toString().toLowerCase())
-    if (isLeftQuerySide) {
-      LeftSide
-    } else {
-      RightSide
-    }
-  }
-
   /**
    * Check if the given condition is an equi-join between the given plans. This method basically
    * replicates the logic of
diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/strategy/join/KNNJoinExec.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/strategy/join/KNNJoinExec.scala
index 2b9bbfb50b..fdc53d13ce 100644
--- a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/strategy/join/KNNJoinExec.scala
+++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/strategy/join/KNNJoinExec.scala
@@ -162,7 +162,7 @@ case class KNNJoinExec(
       sedonaConf: SedonaConf): Unit = {
     require(numPartitions > 0, "The number of partitions must be greater than 0.")
     val kValue: Int = this.k.eval().asInstanceOf[Int]
-    require(kValue > 0, "The number of neighbors must be greater than 0.")
+    require(kValue >= 1, "The number of neighbors (k) must be greater than or equal to 1.")
     objectsShapes.setNeighborSampleNumber(kValue)
 
     exactSpatialPartitioning(objectsShapes, queryShapes, numPartitions)
diff --git a/spark/common/src/test/java/org/apache/sedona/core/spatialOperator/JoinQueryDeduplicationTest.java b/spark/common/src/test/java/org/apache/sedona/core/spatialOperator/JoinQueryDeduplicationTest.java
index 14b3af49b0..919d8245ac 100644
--- a/spark/common/src/test/java/org/apache/sedona/core/spatialOperator/JoinQueryDeduplicationTest.java
+++ b/spark/common/src/test/java/org/apache/sedona/core/spatialOperator/JoinQueryDeduplicationTest.java
@@ -43,7 +43,10 @@ public static void teardown() {
     sc.stop();
   }
 
-  /** See https://issues.apache.org/jira/browse/SEDONA-233 */
+  /**
+   * See https://issues.apache.org/jira/browse/SEDONA-233
+   */
   @Test
   public void testDeduplication() throws Exception {
     SpatialRDD<Geometry> leftRDD = new SpatialRDD<>();
diff --git a/spark/common/src/test/java/org/apache/sedona/core/spatialPartitioning/GenericUniquePartitionerTest.java b/spark/common/src/test/java/org/apache/sedona/core/spatialPartitioning/GenericUniquePartitionerTest.java
new file mode 100644
index 0000000000..1df270c0a0
--- /dev/null
+++ b/spark/common/src/test/java/org/apache/sedona/core/spatialPartitioning/GenericUniquePartitionerTest.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sedona.core.spatialPartitioning;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import org.apache.commons.collections.IteratorUtils;
+import org.junit.Test;
+import org.locationtech.jts.geom.Envelope;
+import org.locationtech.jts.geom.Geometry;
+import org.locationtech.jts.geom.GeometryFactory;
+import scala.Tuple2;
+
+public class GenericUniquePartitionerTest {
+  private final GeometryFactory factory = new GeometryFactory();
+
+  @Test
+  public void testUniquePartition() throws Exception {
+    ArrayList<Envelope> grids = new ArrayList<>();
+    grids.add(new Envelope(0, 10, 0, 10));
+    grids.add(new Envelope(10, 20, 0, 10));
+    grids.add(new Envelope(0, 10, 10, 20));
+    grids.add(new Envelope(10, 20, 10, 20));
+
+    FlatGridPartitioner partitioner = new FlatGridPartitioner(grids);
+    GenericUniquePartitioner uniquePartitioner = new GenericUniquePartitioner(partitioner);
+
+    assertEquals(partitioner.getGridType(), uniquePartitioner.getGridType());
+    assertEquals(partitioner.getGrids(), uniquePartitioner.getGrids());
+
+    Envelope definitelyHasMultiplePartitions = new Envelope(5, 15, 5, 15);
+
+    Iterator<Tuple2<Integer, Geometry>> placedWithDuplicates =
+        partitioner.placeObject(factory.toGeometry(definitelyHasMultiplePartitions));
+    // Because the geometry is not completely contained by any of the partitions,
+    // it also gets placed in the overflow partition (hence 5, not 4)
+    assertEquals(5, IteratorUtils.toList(placedWithDuplicates).size());
+
+    Iterator<Tuple2<Integer, Geometry>> placedWithoutDuplicates =
+        uniquePartitioner.placeObject(factory.toGeometry(definitelyHasMultiplePartitions));
+    assertEquals(1, IteratorUtils.toList(placedWithoutDuplicates).size());
+  }
+}
diff --git a/spark/common/src/test/java/org/apache/sedona/core/spatialPartitioning/IndexedGridPartitionerTest.java b/spark/common/src/test/java/org/apache/sedona/core/spatialPartitioning/IndexedGridPartitionerTest.java
new file mode 100644
index 0000000000..cedd94eadb
--- /dev/null
+++ b/spark/common/src/test/java/org/apache/sedona/core/spatialPartitioning/IndexedGridPartitionerTest.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sedona.core.spatialPartitioning;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import junit.framework.TestCase;
+import org.junit.Assert;
+import org.junit.Test;
+import org.locationtech.jts.geom.Coordinate;
+import org.locationtech.jts.geom.Envelope;
+import org.locationtech.jts.geom.Geometry;
+import org.locationtech.jts.geom.GeometryFactory;
+import scala.Tuple2;
+
+public class IndexedGridPartitionerTest extends TestCase {
+
+  private List<Envelope> getGrids() {
+    List<Envelope> grids = new ArrayList<>();
+    grids.add(new Envelope(0, 50, 0, 50));
+    grids.add(new Envelope(50, 100, 0, 50));
+    grids.add(new Envelope(0, 50, 50, 100));
+    grids.add(new Envelope(50, 100, 50, 100));
+    return grids;
+  }
+
+  private IndexedGridPartitioner getPartitioner(Boolean preserveUncontainedGeometries) {
+    return new IndexedGridPartitioner(getGrids(), preserveUncontainedGeometries);
+  }
+
+  public void testPlaceObjectPreserveContainedGeometries() throws Exception {
+    IndexedGridPartitioner partitioner = getPartitioner(true);
+    GeometryFactory geometryFactory = new GeometryFactory();
+    Geometry spatialObject = geometryFactory.createPoint(new Coordinate(25, 25));
+    Iterator<Tuple2<Integer, Geometry>> result = partitioner.placeObject(spatialObject);
+
+    List<Tuple2<Integer, Geometry>> resultList = new ArrayList<>();
+    result.forEachRemaining(resultList::add);
+
+    Assert.assertFalse(resultList.isEmpty());
+    Assert.assertEquals(1, resultList.size());
+    Assert.assertEquals(0, (int) resultList.get(0)._1());
+  }
+
+  public void testPlaceObjectDoesntPreserveUncontainedGeometries() throws Exception {
+    IndexedGridPartitioner partitioner = getPartitioner(false);
+    GeometryFactory geometryFactory = new GeometryFactory();
+    Geometry spatialObject = geometryFactory.createPoint(new Coordinate(-25, -25));
+    Iterator<Tuple2<Integer, Geometry>> result = partitioner.placeObject(spatialObject);
+    Assert.assertFalse(result.hasNext());
+  }
+
+  @Test
+  public void testGetGrids() {
+    IndexedGridPartitioner partitioner = getPartitioner(true);
+    Assert.assertEquals(getGrids(), partitioner.getGrids());
+  }
+
+  @Test
+  public void testNumPartitions() {
+    IndexedGridPartitioner partitioner = getPartitioner(true);
+    Assert.assertEquals(5, partitioner.numPartitions());
+
+    partitioner = getPartitioner(false);
+    Assert.assertEquals(4, partitioner.numPartitions());
+  }
+
+  @Test
+  public void testEquals() {
+    IndexedGridPartitioner partitioner = getPartitioner(true);
+    List<Envelope> grids = new ArrayList<>();
+    grids.add(new Envelope(0, 50, 0, 50));
+    grids.add(new Envelope(50, 100, 0, 50));
+    grids.add(new Envelope(0, 50, 50, 100));
+    grids.add(new Envelope(50, 100, 50, 100));
+    IndexedGridPartitioner otherPartitioner = new IndexedGridPartitioner(grids, true);
+    Assert.assertTrue(partitioner.equals(otherPartitioner));
+  }
+}
diff --git a/spark/common/src/test/resources/.gitignore b/spark/common/src/test/resources/.gitignore
index 764e830895..958c6de423 100644
--- a/spark/common/src/test/resources/.gitignore
+++ b/spark/common/src/test/resources/.gitignore
@@ -1,2 +1,3 @@
 *.DS_Store
 real-*
+wkb/testSaveAs*
diff --git a/spark/common/src/test/resources/datasource_stac/collection-items.json b/spark/common/src/test/resources/datasource_stac/collection-items.json
new file mode 100644
index 0000000000..fe42d2d4e2
--- /dev/null
+++ b/spark/common/src/test/resources/datasource_stac/collection-items.json
@@ -0,0 +1,7204 @@
+{
+  "type": "FeatureCollection",
+  "stac_version": "1.0.0",
+  "stac_extensions": [],
+  "context": {
+    "limit": 10,
+    "matched": 21436719,
+    "returned": 10
+  },
+  
"numberMatched": 21436719, + "numberReturned": 10, + "features": [ + { + "type": "Feature", + "stac_version": "1.0.0", + "id": "S2B_T28UEC_20250110T120355_L2A", + "properties": { + "created": "2025-01-10T16:22:52.618Z", + "platform": "sentinel-2b", + "constellation": "sentinel-2", + "instruments": [ + "msi" + ], + "eo:cloud_cover": 26.967385, + "proj:epsg": 32628, + "proj:centroid": { + "lat": 52.31849, + "lon": -13.53342 + }, + "mgrs:utm_zone": 28, + "mgrs:latitude_band": "U", + "mgrs:grid_square": "EC", + "grid:code": "MGRS-28UEC", + "view:azimuth": 118.315984493439, + "view:incidence_angle": 3.15096590200837, + "view:sun_azimuth": 166.250646673544, + "view:sun_elevation": 15.2125487342296, + "s2:tile_id": "S2B_OPER_MSI_L2A_TL_2BPS_20250110T154043_A040990_T28UEC_N05.11", + "s2:degraded_msi_data_percentage": 0.0045, + "s2:nodata_pixel_percentage": 99.030858, + "s2:saturated_defective_pixel_percentage": 0, + "s2:cloud_shadow_percentage": 0, + "s2:vegetation_percentage": 0.000342, + "s2:not_vegetated_percentage": 0.000342, + "s2:water_percentage": 73.031932, + "s2:unclassified_percentage": 0, + "s2:medium_proba_clouds_percentage": 13.981807, + "s2:high_proba_clouds_percentage": 12.049599, + "s2:thin_cirrus_percentage": 0.935978, + "s2:snow_ice_percentage": 0, + "s2:product_type": "S2MSI2A", + "s2:processing_baseline": "05.11", + "s2:product_uri": "S2B_MSIL2A_20250110T120359_N0511_R066_T28UEC_20250110T154043.SAFE", + "s2:generation_time": "2025-01-10T15:40:43.000000Z", + "s2:datatake_id": "GS2B_20250110T120359_040990_N05.11", + "s2:datatake_type": "INS-NOBS", + "s2:datastrip_id": "S2B_OPER_MSI_L2A_DS_2BPS_20250110T154043_S20250110T120355_N05.11", + "s2:reflectance_conversion_factor": 1.03425891111326, + "datetime": "2025-01-10T12:07:02.966000Z", + "earthsearch:payload_id": "roda-sentinel-2-c1-l2a/workflow-sentinel-2-c1-l2a-to-stac/212b523557c5336ebe1c078fc29a2069", + "storage:platform": "AWS", + "storage:region": "us-west-2", + "storage:requester_pays": false, + "processing:software": { + "sentinel-2-c1-l2a-to-stac": "v2024.02.01" + }, + "updated": "2025-01-10T16:22:52.618Z" + }, + "geometry": { + "type": "Polygon", + "coordinates": [ + [ + [-13.8282447208263, 52.3446542644649], + [-13.7600637715014, 52.3345986073346], + [-13.3911071072422, 52.2712256131467], + [-13.3886306359678, 52.3394760649279], + [-13.8282447208263, 52.3446542644649] + ] + ] + }, + "links": [ + { + "rel": "self", + "type": "application/geo+json", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a/items/S2B_T28UEC_20250110T120355_L2A" + }, + { + "rel": "canonical", + "href": "s3://e84-earth-search-sentinel-data/sentinel-2-c1-l2a/28/U/EC/2025/1/S2B_T28UEC_20250110T120355_L2A/S2B_T28UEC_20250110T120355_L2A.json", + "type": "application/json" + }, + { + "rel": "via", + "href": "s3://sentinel-s2-l2a/tiles/28/U/EC/2025/1/10/0/metadata.xml", + "type": "application/xml", + "title": "Granule Metadata in Sinergize RODA Archive" + }, + { + "rel": "parent", + "type": "application/json", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a" + }, + { + "rel": "collection", + "type": "application/json", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a" + }, + { + "rel": "root", + "type": "application/json", + "href": "https://earth-search.aws.element84.com/v1" + }, + { + "rel": "thumbnail", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a/items/S2B_T28UEC_20250110T120355_L2A/thumbnail" + } + ], + "assets": 
{ + "red": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EC/2025/1/S2B_T28UEC_20250110T120355_L2A/B04.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red - 10m", + "eo:bands": [ + { + "name": "B04", + "common_name": "red", + "center_wavelength": 0.665, + "full_width_half_max": 0.038 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 499980, 0, -10, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "122032a6cbfd123d742d90e53162ae4660586361a5ef85e13429946aa77953065cf5", + "file:size": 2357747, + "roles": [ + "data", + "reflectance" + ] + }, + "green": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EC/2025/1/S2B_T28UEC_20250110T120355_L2A/B03.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Green - 10m", + "eo:bands": [ + { + "name": "B03", + "common_name": "green", + "center_wavelength": 0.56, + "full_width_half_max": 0.045 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 499980, 0, -10, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "122033b3a77113b9fa9931115939f8c3738d00a9b73b0e499d23e21da0bef4696be7", + "file:size": 2371586, + "roles": [ + "data", + "reflectance" + ] + }, + "blue": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EC/2025/1/S2B_T28UEC_20250110T120355_L2A/B02.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Blue - 10m", + "eo:bands": [ + { + "name": "B02", + "common_name": "blue", + "center_wavelength": 0.49, + "full_width_half_max": 0.098 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 499980, 0, -10, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220f68f82bf174b37fc0e5c195c2bb19ed0a2e7d4aa1117636eace4199f49a83845", + "file:size": 2396700, + "roles": [ + "data", + "reflectance" + ] + }, + "visual": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EC/2025/1/S2B_T28UEC_20250110T120355_L2A/TCI.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "True color image", + "eo:bands": [ + { + "name": "B04", + "common_name": "red", + "center_wavelength": 0.665, + "full_width_half_max": 0.038 + }, + { + "name": "B03", + "common_name": "green", + "center_wavelength": 0.56, + "full_width_half_max": 0.045 + }, + { + "name": "B02", + "common_name": "blue", + "center_wavelength": 0.49, + "full_width_half_max": 0.098 + } + ], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 10 + }, + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 10 + }, + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 10 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 499980, 0, -10, 5800020], + "file:checksum": "122083fa02d60e11ee92c40028343320794d6b341bfeb88a50b47b317226c3cd45dc", + "file:size": 3097977, + "roles": [ + "visual" + ] + }, + "nir": { + "href": 
"https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EC/2025/1/S2B_T28UEC_20250110T120355_L2A/B08.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "NIR 1 - 10m", + "eo:bands": [ + { + "name": "B08", + "common_name": "nir", + "center_wavelength": 0.842, + "full_width_half_max": 0.145 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 499980, 0, -10, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "122063993cadf2b23b6b877c29387a416c9400d2cbe94092c66dfeea6d6f3851f99c", + "file:size": 2368644, + "roles": [ + "data", + "reflectance" + ] + }, + "swir22": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EC/2025/1/S2B_T28UEC_20250110T120355_L2A/B12.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "SWIR 2.2μm - 20m", + "eo:bands": [ + { + "name": "B12", + "common_name": "swir22", + "center_wavelength": 2.19, + "full_width_half_max": 0.242 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12208c9c4478d4bbb29ce1fd8b233fec7262d91de3535ab7623dca8821d17ce2b92c", + "file:size": 629724, + "roles": [ + "data", + "reflectance" + ] + }, + "rededge2": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EC/2025/1/S2B_T28UEC_20250110T120355_L2A/B06.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red Edge 2 - 20m", + "eo:bands": [ + { + "name": "B06", + "common_name": "rededge", + "center_wavelength": 0.74, + "full_width_half_max": 0.018 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12202676f8075b28a063e89a9d461f31f8ac6e9a53e4c54091e218b681c8e74fa41f", + "file:size": 638352, + "roles": [ + "data", + "reflectance" + ] + }, + "rededge3": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EC/2025/1/S2B_T28UEC_20250110T120355_L2A/B07.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red Edge 3 - 20m", + "eo:bands": [ + { + "name": "B07", + "common_name": "rededge", + "center_wavelength": 0.783, + "full_width_half_max": 0.028 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220d44985e336e4999bc99a3d0d6affd8ea0ce1cf89698fe5727a4495dd43a0664c", + "file:size": 639056, + "roles": [ + "data", + "reflectance" + ] + }, + "rededge1": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EC/2025/1/S2B_T28UEC_20250110T120355_L2A/B05.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red Edge 1 - 20m", + "eo:bands": [ + { + "name": "B05", + "common_name": "rededge", + "center_wavelength": 0.704, + "full_width_half_max": 0.019 + 
} + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220f4c994b1ae1c5c8c46d95e862e3888199f703615e01f69b86063885f00bfda19", + "file:size": 638484, + "roles": [ + "data", + "reflectance" + ] + }, + "swir16": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EC/2025/1/S2B_T28UEC_20250110T120355_L2A/B11.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "SWIR 1.6μm - 20m", + "eo:bands": [ + { + "name": "B11", + "common_name": "swir16", + "center_wavelength": 1.61, + "full_width_half_max": 0.143 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12204553e83bb568a301ed89a19976ce706da96e62da25cd98a30bf47c788c63fdff", + "file:size": 626909, + "roles": [ + "data", + "reflectance" + ] + }, + "wvp": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EC/2025/1/S2B_T28UEC_20250110T120355_L2A/WVP.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Water Vapour (WVP)", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "unit": "cm", + "scale": 0.001, + "offset": 0 + } + ], + "file:checksum": "12204701f12f0a40fbe11da4652e3c66a17355edbe9ede6d7cfb6ccbbf2db9aa95cd", + "file:size": 98743, + "roles": [ + "data" + ] + }, + "nir08": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EC/2025/1/S2B_T28UEC_20250110T120355_L2A/B8A.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "NIR 2 - 20m", + "eo:bands": [ + { + "name": "B8A", + "common_name": "nir08", + "center_wavelength": 0.865, + "full_width_half_max": 0.033 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220bf5827bf5efdb2e488c95d707d2f3f56ac524578e6f8b7e0bbff01c2bd051cd0", + "file:size": 639283, + "roles": [ + "data", + "reflectance" + ] + }, + "scl": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EC/2025/1/S2B_T28UEC_20250110T120355_L2A/SCL.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Scene classification map (SCL)", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 20 + } + ], + "file:checksum": "1220cfbbc94375ef0b9d9c2844eb52cae66a47b996aa960e104edfe170fa07d70897", + "file:size": 78184, + "roles": [ + "data" + ] + }, + "aot": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EC/2025/1/S2B_T28UEC_20250110T120355_L2A/AOT.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Aerosol optical thickness (AOT)", + "gsd": 20, + 
"proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.001, + "offset": 0 + } + ], + "file:checksum": "122075bb84c0519c39937be4e010d502a7973f907792da5c78f0aa0f19e920ae871b", + "file:size": 99487, + "roles": [ + "data" + ] + }, + "coastal": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EC/2025/1/S2B_T28UEC_20250110T120355_L2A/B01.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Coastal - 60m", + "eo:bands": [ + { + "name": "B01", + "common_name": "coastal", + "center_wavelength": 0.443, + "full_width_half_max": 0.027 + } + ], + "gsd": 60, + "proj:shape": [1830, 1830], + "proj:transform": [60, 0, 499980, 0, -60, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 60, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12205a2e22580d4ee310b6a7ec8b72b7a75b0bfa3b4c46806a7338b3ad03ccdea04a", + "file:size": 82752, + "roles": [ + "data", + "reflectance" + ] + }, + "nir09": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EC/2025/1/S2B_T28UEC_20250110T120355_L2A/B09.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "NIR 3 - 60m", + "eo:bands": [ + { + "name": "B09", + "common_name": "nir09", + "center_wavelength": 0.945, + "full_width_half_max": 0.026 + } + ], + "gsd": 60, + "proj:shape": [1830, 1830], + "proj:transform": [60, 0, 499980, 0, -60, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 60, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12203b837e2b511157325bc14d889e3826661156f93ac206408d0b0e442f17cc388e", + "file:size": 84972, + "roles": [ + "data", + "reflectance" + ] + }, + "cloud": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EC/2025/1/S2B_T28UEC_20250110T120355_L2A/CLD_20m.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Cloud Probabilities", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 20 + } + ], + "file:checksum": "12206f6d9989064b0614512729c79f4292df9c9355550924b42a34703756474e1cf1", + "file:size": 120955, + "roles": [ + "data", + "cloud" + ] + }, + "snow": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EC/2025/1/S2B_T28UEC_20250110T120355_L2A/SNW_20m.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Snow Probabilities", + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 20 + } + ], + "file:checksum": "1220646fd31cc898948d16da2d3fa3672ec809f579a0181870342d5a73f19cf1db37", + "file:size": 53931, + "roles": [ + "data", + "snow-ice" + ] + }, + "preview": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EC/2025/1/S2B_T28UEC_20250110T120355_L2A/L2A_PVI.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "True color preview", + "eo:bands": [ + { + "name": "B04", + "common_name": "red", + "center_wavelength": 0.665, + 
"full_width_half_max": 0.038 + }, + { + "name": "B03", + "common_name": "green", + "center_wavelength": 0.56, + "full_width_half_max": 0.045 + }, + { + "name": "B02", + "common_name": "blue", + "center_wavelength": 0.49, + "full_width_half_max": 0.098 + } + ], + "file:checksum": "1220eb97564652f39ae6ab53224b027c7eb549c7de4692cd76cb0805b6c854b28839", + "file:size": 8059, + "roles": [ + "overview" + ] + }, + "granule_metadata": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EC/2025/1/S2B_T28UEC_20250110T120355_L2A/metadata.xml", + "type": "application/xml", + "file:checksum": "122054cadc787eb9b6700b86b7275aa389d20aa862537769fcd7bd54be08556d3d01", + "file:size": 257414, + "roles": [ + "metadata" + ] + }, + "tileinfo_metadata": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EC/2025/1/S2B_T28UEC_20250110T120355_L2A/tileInfo.json", + "type": "application/json", + "file:checksum": "12200096a9fdb71ede656661a29442dec78f1269d0c233f5bd8ac1a74f14f3e0e484", + "file:size": 1581, + "roles": [ + "metadata" + ] + }, + "product_metadata": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EC/2025/1/S2B_T28UEC_20250110T120355_L2A/product_metadata.xml", + "type": "application/xml", + "file:checksum": "1220edf7135e20824122846daebe157b450efaef6c8d92cf2c9a6300f312cd787cca", + "file:size": 54686, + "roles": [ + "metadata" + ] + }, + "thumbnail": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EC/2025/1/S2B_T28UEC_20250110T120355_L2A/L2A_PVI.jpg", + "type": "image/jpeg", + "title": "Thumbnail of preview image", + "file:checksum": "1220b306fe70acae95183a812673cf9e80ca7056bf626f21271dd471e9cda23dcf26", + "file:size": 3156, + "roles": [ + "thumbnail" + ] + } + }, + "bbox": [-13.828245, 52.271226, -13.388631, 52.344654], + "stac_extensions": [ + "https://stac-extensions.github.io/eo/v1.1.0/schema.json", + "https://stac-extensions.github.io/file/v2.1.0/schema.json", + "https://stac-extensions.github.io/grid/v1.1.0/schema.json", + "https://stac-extensions.github.io/mgrs/v1.0.0/schema.json", + "https://stac-extensions.github.io/processing/v1.1.0/schema.json", + "https://stac-extensions.github.io/projection/v1.1.0/schema.json", + "https://stac-extensions.github.io/raster/v1.1.0/schema.json", + "https://stac-extensions.github.io/sentinel-2/v1.0.0/schema.json", + "https://stac-extensions.github.io/storage/v1.0.0/schema.json", + "https://stac-extensions.github.io/view/v1.0.0/schema.json" + ], + "collection": "sentinel-2-c1-l2a" + }, + { + "type": "Feature", + "stac_version": "1.0.0", + "id": "S2B_T28UFC_20250110T120355_L2A", + "properties": { + "created": "2025-01-10T16:24:04.511Z", + "platform": "sentinel-2b", + "constellation": "sentinel-2", + "instruments": [ + "msi" + ], + "eo:cloud_cover": 41.692412, + "proj:epsg": 32628, + "proj:centroid": { + "lat": 52.21432, + "lon": -12.53367 + }, + "mgrs:utm_zone": 28, + "mgrs:latitude_band": "U", + "mgrs:grid_square": "FC", + "grid:code": "MGRS-28UFC", + "view:azimuth": 245.523494629745, + "view:incidence_angle": 3.70536620607632, + "view:sun_azimuth": 167.617359597172, + "view:sun_elevation": 15.4317023114617, + "s2:tile_id": "S2B_OPER_MSI_L2A_TL_2BPS_20250110T154043_A040990_T28UFC_N05.11", + "s2:degraded_msi_data_percentage": 0, + "s2:nodata_pixel_percentage": 81.846112, + "s2:saturated_defective_pixel_percentage": 0, + "s2:cloud_shadow_percentage": 0, + 
"s2:vegetation_percentage": 0.000018, + "s2:not_vegetated_percentage": 0, + "s2:water_percentage": 58.30757, + "s2:unclassified_percentage": 0, + "s2:medium_proba_clouds_percentage": 17.302617, + "s2:high_proba_clouds_percentage": 22.928596, + "s2:thin_cirrus_percentage": 1.4612, + "s2:snow_ice_percentage": 0, + "s2:product_type": "S2MSI2A", + "s2:processing_baseline": "05.11", + "s2:product_uri": "S2B_MSIL2A_20250110T120359_N0511_R066_T28UFC_20250110T154043.SAFE", + "s2:generation_time": "2025-01-10T15:40:43.000000Z", + "s2:datatake_id": "GS2B_20250110T120359_040990_N05.11", + "s2:datatake_type": "INS-NOBS", + "s2:datastrip_id": "S2B_OPER_MSI_L2A_DS_2BPS_20250110T154043_S20250110T120355_N05.11", + "s2:reflectance_conversion_factor": 1.03425891111326, + "datetime": "2025-01-10T12:07:01.074000Z", + "earthsearch:payload_id": "roda-sentinel-2-c1-l2a/workflow-sentinel-2-c1-l2a-to-stac/8e97d11f28adbcdd059cd9db6b9de885", + "storage:platform": "AWS", + "storage:region": "us-west-2", + "storage:requester_pays": false, + "processing:software": { + "sentinel-2-c1-l2a-to-stac": "v2024.02.01" + }, + "updated": "2025-01-10T16:24:04.511Z" + }, + "geometry": { + "type": "Polygon", + "coordinates": [ + [ + [-13.5321014711154, 52.3413459269826], + [-13.5336220434666, 52.2953748622493], + [-11.9440745611458, 51.9931444540663], + [-11.9222573449227, 52.3103583189958], + [-13.5321014711154, 52.3413459269826] + ] + ] + }, + "links": [ + { + "rel": "self", + "type": "application/geo+json", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a/items/S2B_T28UFC_20250110T120355_L2A" + }, + { + "rel": "canonical", + "href": "s3://e84-earth-search-sentinel-data/sentinel-2-c1-l2a/28/U/FC/2025/1/S2B_T28UFC_20250110T120355_L2A/S2B_T28UFC_20250110T120355_L2A.json", + "type": "application/json" + }, + { + "rel": "via", + "href": "s3://sentinel-s2-l2a/tiles/28/U/FC/2025/1/10/0/metadata.xml", + "type": "application/xml", + "title": "Granule Metadata in Sinergize RODA Archive" + }, + { + "rel": "parent", + "type": "application/json", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a" + }, + { + "rel": "collection", + "type": "application/json", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a" + }, + { + "rel": "root", + "type": "application/json", + "href": "https://earth-search.aws.element84.com/v1" + }, + { + "rel": "thumbnail", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a/items/S2B_T28UFC_20250110T120355_L2A/thumbnail" + } + ], + "assets": { + "red": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/FC/2025/1/S2B_T28UFC_20250110T120355_L2A/B04.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red - 10m", + "eo:bands": [ + { + "name": "B04", + "common_name": "red", + "center_wavelength": 0.665, + "full_width_half_max": 0.038 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 600000, 0, -10, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "122035c6d5cbc12a70c69fc6525d5fca8037dff9239727a85394e2f6e31cb50e3cef", + "file:size": 37808553, + "roles": [ + "data", + "reflectance" + ] + }, + "green": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/FC/2025/1/S2B_T28UFC_20250110T120355_L2A/B03.tif", + "type": 
"image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Green - 10m", + "eo:bands": [ + { + "name": "B03", + "common_name": "green", + "center_wavelength": 0.56, + "full_width_half_max": 0.045 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 600000, 0, -10, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12201bc6e8ad7c6cbf3ed5ff4b3e6d2e63b98cd5940ff0484c7999003388ecdaeeeb", + "file:size": 38001328, + "roles": [ + "data", + "reflectance" + ] + }, + "blue": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/FC/2025/1/S2B_T28UFC_20250110T120355_L2A/B02.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Blue - 10m", + "eo:bands": [ + { + "name": "B02", + "common_name": "blue", + "center_wavelength": 0.49, + "full_width_half_max": 0.098 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 600000, 0, -10, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220b3c02b381f6250bde6d1c40512fb5c248739fab60714b0ed457adfe8bc4b2129", + "file:size": 38281234, + "roles": [ + "data", + "reflectance" + ] + }, + "visual": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/FC/2025/1/S2B_T28UFC_20250110T120355_L2A/TCI.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "True color image", + "eo:bands": [ + { + "name": "B04", + "common_name": "red", + "center_wavelength": 0.665, + "full_width_half_max": 0.038 + }, + { + "name": "B03", + "common_name": "green", + "center_wavelength": 0.56, + "full_width_half_max": 0.045 + }, + { + "name": "B02", + "common_name": "blue", + "center_wavelength": 0.49, + "full_width_half_max": 0.098 + } + ], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 10 + }, + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 10 + }, + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 10 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 600000, 0, -10, 5800020], + "file:checksum": "12204ff1bdc12c9c16461e1e3a7e17ed3a47bbb0465a1006d2473c4eb341e4232f37", + "file:size": 43688315, + "roles": [ + "visual" + ] + }, + "nir": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/FC/2025/1/S2B_T28UFC_20250110T120355_L2A/B08.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "NIR 1 - 10m", + "eo:bands": [ + { + "name": "B08", + "common_name": "nir", + "center_wavelength": 0.842, + "full_width_half_max": 0.145 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 600000, 0, -10, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220efaaa2b724427b0d9d2a36fde08e20abd94f0dde2c92ebc3cc54b8b818fb3fbd", + "file:size": 38093188, + "roles": [ + "data", + "reflectance" + ] + }, + "swir22": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/FC/2025/1/S2B_T28UFC_20250110T120355_L2A/B12.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + 
"title": "SWIR 2.2μm - 20m", + "eo:bands": [ + { + "name": "B12", + "common_name": "swir22", + "center_wavelength": 2.19, + "full_width_half_max": 0.242 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 600000, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12208c918c9e07cde5d1a0f7582e66329ed068927ecd5f87359efe22f278a578ecb5", + "file:size": 10084992, + "roles": [ + "data", + "reflectance" + ] + }, + "rededge2": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/FC/2025/1/S2B_T28UFC_20250110T120355_L2A/B06.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red Edge 2 - 20m", + "eo:bands": [ + { + "name": "B06", + "common_name": "rededge", + "center_wavelength": 0.74, + "full_width_half_max": 0.018 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 600000, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220d6ab4c1b664e7e40d8dbc0b589f967c807f9fdc57a26466472f528976c085874", + "file:size": 10346975, + "roles": [ + "data", + "reflectance" + ] + }, + "rededge3": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/FC/2025/1/S2B_T28UFC_20250110T120355_L2A/B07.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red Edge 3 - 20m", + "eo:bands": [ + { + "name": "B07", + "common_name": "rededge", + "center_wavelength": 0.783, + "full_width_half_max": 0.028 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 600000, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220c8ca8579f2636deafb9de4798957c6c29d0fa9f7e27161f9e193082951c88c5a", + "file:size": 10344767, + "roles": [ + "data", + "reflectance" + ] + }, + "rededge1": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/FC/2025/1/S2B_T28UFC_20250110T120355_L2A/B05.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red Edge 1 - 20m", + "eo:bands": [ + { + "name": "B05", + "common_name": "rededge", + "center_wavelength": 0.704, + "full_width_half_max": 0.019 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 600000, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12208ed42d02383e816d27d156887fa8ed671c2da73acc32fb622cc81ef767a5850a", + "file:size": 10361731, + "roles": [ + "data", + "reflectance" + ] + }, + "swir16": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/FC/2025/1/S2B_T28UFC_20250110T120355_L2A/B11.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "SWIR 1.6μm - 20m", + "eo:bands": [ + { + "name": "B11", + "common_name": "swir16", + "center_wavelength": 1.61, + "full_width_half_max": 0.143 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 600000, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + 
"scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12200ea27f0c2a0293ca9ba18cf9065ed2a3bebe106370a083d0fd5867bbf8069346", + "file:size": 10076269, + "roles": [ + "data", + "reflectance" + ] + }, + "wvp": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/FC/2025/1/S2B_T28UFC_20250110T120355_L2A/WVP.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Water Vapour (WVP)", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 600000, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "unit": "cm", + "scale": 0.001, + "offset": 0 + } + ], + "file:checksum": "1220e988a55b07c65489fb136ac2e87c754824341aa39435abf2162336def50a3463", + "file:size": 120608, + "roles": [ + "data" + ] + }, + "nir08": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/FC/2025/1/S2B_T28UFC_20250110T120355_L2A/B8A.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "NIR 2 - 20m", + "eo:bands": [ + { + "name": "B8A", + "common_name": "nir08", + "center_wavelength": 0.865, + "full_width_half_max": 0.033 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 600000, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "122044588afb918873ed2c6a1a7fc302c6227579cec72e71c72444e51ff34089da56", + "file:size": 10362876, + "roles": [ + "data", + "reflectance" + ] + }, + "scl": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/FC/2025/1/S2B_T28UFC_20250110T120355_L2A/SCL.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Scene classification map (SCL)", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 600000, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 20 + } + ], + "file:checksum": "12206ad27bd37d2e7386ba6d0c2d6b5de04cbb9ad06000cff2a8fa81ce353f0b24c4", + "file:size": 420519, + "roles": [ + "data" + ] + }, + "aot": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/FC/2025/1/S2B_T28UFC_20250110T120355_L2A/AOT.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Aerosol optical thickness (AOT)", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 600000, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.001, + "offset": 0 + } + ], + "file:checksum": "1220db700e3012812070a263beea2838d3647844e4bd4234420280942c68b847e492", + "file:size": 142297, + "roles": [ + "data" + ] + }, + "coastal": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/FC/2025/1/S2B_T28UFC_20250110T120355_L2A/B01.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Coastal - 60m", + "eo:bands": [ + { + "name": "B01", + "common_name": "coastal", + "center_wavelength": 0.443, + "full_width_half_max": 0.027 + } + ], + "gsd": 60, + "proj:shape": [1830, 1830], + "proj:transform": [60, 0, 600000, 0, -60, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 60, + "scale": 0.0001, + "offset": -0.1 + } + 
], + "file:checksum": "1220758926402c8bc702c574e19a3edbdc2a9711017c13e52075afc6cfdddaf2a52e", + "file:size": 1206892, + "roles": [ + "data", + "reflectance" + ] + }, + "nir09": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/FC/2025/1/S2B_T28UFC_20250110T120355_L2A/B09.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "NIR 3 - 60m", + "eo:bands": [ + { + "name": "B09", + "common_name": "nir09", + "center_wavelength": 0.945, + "full_width_half_max": 0.026 + } + ], + "gsd": 60, + "proj:shape": [1830, 1830], + "proj:transform": [60, 0, 600000, 0, -60, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 60, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12202747d0e7b18642c93285c3acdd13780ba63fc3937f3ba1456ed9b857566f7181", + "file:size": 1261903, + "roles": [ + "data", + "reflectance" + ] + }, + "cloud": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/FC/2025/1/S2B_T28UFC_20250110T120355_L2A/CLD_20m.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Cloud Probabilities", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 600000, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 20 + } + ], + "file:checksum": "12205d6e64c391bbe597e37aa0c9b3d5ab81d80390a2fd66ff8d36fc906069df0984", + "file:size": 1501376, + "roles": [ + "data", + "cloud" + ] + }, + "snow": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/FC/2025/1/S2B_T28UFC_20250110T120355_L2A/SNW_20m.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Snow Probabilities", + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 600000, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 20 + } + ], + "file:checksum": "12202a673ea9c389b334d6a5bbcf6700845107fe73852bf22833c50b53642b9dd49c", + "file:size": 53931, + "roles": [ + "data", + "snow-ice" + ] + }, + "preview": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/FC/2025/1/S2B_T28UFC_20250110T120355_L2A/L2A_PVI.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "True color preview", + "eo:bands": [ + { + "name": "B04", + "common_name": "red", + "center_wavelength": 0.665, + "full_width_half_max": 0.038 + }, + { + "name": "B03", + "common_name": "green", + "center_wavelength": 0.56, + "full_width_half_max": 0.045 + }, + { + "name": "B02", + "common_name": "blue", + "center_wavelength": 0.49, + "full_width_half_max": 0.098 + } + ], + "file:checksum": "1220132208c1e4422dfdde88a04d6f52425c455bef66af0351605e2c05521d1c1b30", + "file:size": 57507, + "roles": [ + "overview" + ] + }, + "granule_metadata": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/FC/2025/1/S2B_T28UFC_20250110T120355_L2A/metadata.xml", + "type": "application/xml", + "file:checksum": "1220306690a016b2b6caed07114c113a9b989cb29efb92bda3e31e7ff29c51799fcd", + "file:size": 502351, + "roles": [ + "metadata" + ] + }, + "tileinfo_metadata": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/FC/2025/1/S2B_T28UFC_20250110T120355_L2A/tileInfo.json", + "type": "application/json", + 
"file:checksum": "122051ce84f2407d3a34458f82d94356c968afd65153d5e826278221962d8bbe9b3d", + "file:size": 1674, + "roles": [ + "metadata" + ] + }, + "product_metadata": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/FC/2025/1/S2B_T28UFC_20250110T120355_L2A/product_metadata.xml", + "type": "application/xml", + "file:checksum": "1220dce37ab88abf5d29cd31f1753966c4e7bda509523d5ce5cc5e15878cea96c75f", + "file:size": 55550, + "roles": [ + "metadata" + ] + }, + "thumbnail": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/FC/2025/1/S2B_T28UFC_20250110T120355_L2A/L2A_PVI.jpg", + "type": "image/jpeg", + "title": "Thumbnail of preview image", + "file:checksum": "122094252c5c1668db3d539b946f87167780a67844fe61f32c83baf41c53fae255c8", + "file:size": 11406, + "roles": [ + "thumbnail" + ] + } + }, + "bbox": [-13.533622, 51.993144, -11.922257, 52.341346], + "stac_extensions": [ + "https://stac-extensions.github.io/eo/v1.1.0/schema.json", + "https://stac-extensions.github.io/file/v2.1.0/schema.json", + "https://stac-extensions.github.io/grid/v1.1.0/schema.json", + "https://stac-extensions.github.io/mgrs/v1.0.0/schema.json", + "https://stac-extensions.github.io/processing/v1.1.0/schema.json", + "https://stac-extensions.github.io/projection/v1.1.0/schema.json", + "https://stac-extensions.github.io/raster/v1.1.0/schema.json", + "https://stac-extensions.github.io/sentinel-2/v1.0.0/schema.json", + "https://stac-extensions.github.io/storage/v1.0.0/schema.json", + "https://stac-extensions.github.io/view/v1.0.0/schema.json" + ], + "collection": "sentinel-2-c1-l2a" + }, + { + "type": "Feature", + "stac_version": "1.0.0", + "id": "S2B_T28UDD_20250110T120355_L2A", + "properties": { + "created": "2025-01-10T16:23:17.223Z", + "platform": "sentinel-2b", + "constellation": "sentinel-2", + "instruments": [ + "msi" + ], + "eo:cloud_cover": 45.573676, + "proj:epsg": 32628, + "proj:centroid": { + "lat": 52.78244, + "lon": -14.96778 + }, + "mgrs:utm_zone": 28, + "mgrs:latitude_band": "U", + "mgrs:grid_square": "DD", + "grid:code": "MGRS-28UDD", + "view:azimuth": 111.363270762582, + "view:incidence_angle": 11.2176659476937, + "view:sun_azimuth": 164.877643157552, + "view:sun_elevation": 14.1102990134553, + "s2:tile_id": "S2B_OPER_MSI_L2A_TL_2BPS_20250110T154043_A040990_T28UDD_N05.11", + "s2:degraded_msi_data_percentage": 0, + "s2:nodata_pixel_percentage": 91.37699, + "s2:saturated_defective_pixel_percentage": 0, + "s2:cloud_shadow_percentage": 0, + "s2:vegetation_percentage": 0.000846, + "s2:not_vegetated_percentage": 0, + "s2:water_percentage": 54.425478, + "s2:unclassified_percentage": 0, + "s2:medium_proba_clouds_percentage": 12.064364, + "s2:high_proba_clouds_percentage": 32.520658, + "s2:thin_cirrus_percentage": 0.988655, + "s2:snow_ice_percentage": 0, + "s2:product_type": "S2MSI2A", + "s2:processing_baseline": "05.11", + "s2:product_uri": "S2B_MSIL2A_20250110T120359_N0511_R066_T28UDD_20250110T154043.SAFE", + "s2:generation_time": "2025-01-10T15:40:43.000000Z", + "s2:datatake_id": "GS2B_20250110T120359_040990_N05.11", + "s2:datatake_type": "INS-NOBS", + "s2:datastrip_id": "S2B_OPER_MSI_L2A_DS_2BPS_20250110T154043_S20250110T120355_N05.11", + "s2:reflectance_conversion_factor": 1.03425891111326, + "datetime": "2025-01-10T12:06:59.248000Z", + "earthsearch:payload_id": "roda-sentinel-2-c1-l2a/workflow-sentinel-2-c1-l2a-to-stac/af0f2bf0eea44a4ea693a69087331d8e", + "storage:platform": "AWS", + "storage:region": "us-west-2", 
+ "storage:requester_pays": false, + "processing:software": { + "sentinel-2-c1-l2a-to-stac": "v2024.02.01" + }, + "updated": "2025-01-10T16:23:17.223Z" + }, + "geometry": { + "type": "Polygon", + "coordinates": [ + [ + [-14.883934957153, 53.249560984131], + [-15.191183625645, 52.5424009271242], + [-14.8562491296129, 52.4966018865447], + [-14.8537381455861, 53.2495277635177], + [-14.883934957153, 53.249560984131] + ] + ] + }, + "links": [ + { + "rel": "self", + "type": "application/geo+json", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a/items/S2B_T28UDD_20250110T120355_L2A" + }, + { + "rel": "canonical", + "href": "s3://e84-earth-search-sentinel-data/sentinel-2-c1-l2a/28/U/DD/2025/1/S2B_T28UDD_20250110T120355_L2A/S2B_T28UDD_20250110T120355_L2A.json", + "type": "application/json" + }, + { + "rel": "via", + "href": "s3://sentinel-s2-l2a/tiles/28/U/DD/2025/1/10/0/metadata.xml", + "type": "application/xml", + "title": "Granule Metadata in Sinergize RODA Archive" + }, + { + "rel": "parent", + "type": "application/json", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a" + }, + { + "rel": "collection", + "type": "application/json", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a" + }, + { + "rel": "root", + "type": "application/json", + "href": "https://earth-search.aws.element84.com/v1" + }, + { + "rel": "thumbnail", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a/items/S2B_T28UDD_20250110T120355_L2A/thumbnail" + } + ], + "assets": { + "red": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/DD/2025/1/S2B_T28UDD_20250110T120355_L2A/B04.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red - 10m", + "eo:bands": [ + { + "name": "B04", + "common_name": "red", + "center_wavelength": 0.665, + "full_width_half_max": 0.038 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 399960, 0, -10, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "122054a5a778414a5dcf84cedb25f35f7d742e6f7a3a90a39f9f0efb8aa73997ee3d", + "file:size": 17308393, + "roles": [ + "data", + "reflectance" + ] + }, + "green": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/DD/2025/1/S2B_T28UDD_20250110T120355_L2A/B03.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Green - 10m", + "eo:bands": [ + { + "name": "B03", + "common_name": "green", + "center_wavelength": 0.56, + "full_width_half_max": 0.045 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 399960, 0, -10, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "122057746c62fa7de4da7c715ed1f9a32daf2a756aeffadb091cd885884fb4bbaa2d", + "file:size": 17453898, + "roles": [ + "data", + "reflectance" + ] + }, + "blue": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/DD/2025/1/S2B_T28UDD_20250110T120355_L2A/B02.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Blue - 10m", + "eo:bands": [ + { + "name": "B02", + "common_name": "blue", + "center_wavelength": 0.49, + "full_width_half_max": 0.098 + 
} + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 399960, 0, -10, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220f66c150b7d0072381550f6daf7bbd2845705d3b7459db07f4c99359e35561598", + "file:size": 17687244, + "roles": [ + "data", + "reflectance" + ] + }, + "visual": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/DD/2025/1/S2B_T28UDD_20250110T120355_L2A/TCI.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "True color image", + "eo:bands": [ + { + "name": "B04", + "common_name": "red", + "center_wavelength": 0.665, + "full_width_half_max": 0.038 + }, + { + "name": "B03", + "common_name": "green", + "center_wavelength": 0.56, + "full_width_half_max": 0.045 + }, + { + "name": "B02", + "common_name": "blue", + "center_wavelength": 0.49, + "full_width_half_max": 0.098 + } + ], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 10 + }, + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 10 + }, + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 10 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 399960, 0, -10, 5900040], + "file:checksum": "12201b16ef75a69922ab923ef55e87362bc0e8e12c25aa7131d1db37d20c12978c16", + "file:size": 17154534, + "roles": [ + "visual" + ] + }, + "nir": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/DD/2025/1/S2B_T28UDD_20250110T120355_L2A/B08.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "NIR 1 - 10m", + "eo:bands": [ + { + "name": "B08", + "common_name": "nir", + "center_wavelength": 0.842, + "full_width_half_max": 0.145 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 399960, 0, -10, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220d6c16cc1335ba85c3cde38b65161d43120f6f210a29e5ac3bd29992f10db0061", + "file:size": 17347684, + "roles": [ + "data", + "reflectance" + ] + }, + "swir22": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/DD/2025/1/S2B_T28UDD_20250110T120355_L2A/B12.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "SWIR 2.2μm - 20m", + "eo:bands": [ + { + "name": "B12", + "common_name": "swir22", + "center_wavelength": 2.19, + "full_width_half_max": 0.242 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 399960, 0, -20, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220fadc97c30a1a1471f272b5eb1b62bca1c09a010cc7bf2218a782df05f7844592", + "file:size": 4577358, + "roles": [ + "data", + "reflectance" + ] + }, + "rededge2": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/DD/2025/1/S2B_T28UDD_20250110T120355_L2A/B06.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red Edge 2 - 20m", + "eo:bands": [ + { + "name": "B06", + "common_name": "rededge", + "center_wavelength": 0.74, + "full_width_half_max": 0.018 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], 
+ "proj:transform": [20, 0, 399960, 0, -20, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220a79316d379277821cc95628beee1bb62d33db354f26aa7693126d19237dfbc99", + "file:size": 4788245, + "roles": [ + "data", + "reflectance" + ] + }, + "rededge3": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/DD/2025/1/S2B_T28UDD_20250110T120355_L2A/B07.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red Edge 3 - 20m", + "eo:bands": [ + { + "name": "B07", + "common_name": "rededge", + "center_wavelength": 0.783, + "full_width_half_max": 0.028 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 399960, 0, -20, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220cad9f61dc28368c2db5efb069e2948bdd433476a4ba1e8744035abe07910d4b7", + "file:size": 4801599, + "roles": [ + "data", + "reflectance" + ] + }, + "rededge1": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/DD/2025/1/S2B_T28UDD_20250110T120355_L2A/B05.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red Edge 1 - 20m", + "eo:bands": [ + { + "name": "B05", + "common_name": "rededge", + "center_wavelength": 0.704, + "full_width_half_max": 0.019 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 399960, 0, -20, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220a6c66252a5437bb34de51dcee0552c8c102cbe62839037b7b509bd3d53cb6476", + "file:size": 4819787, + "roles": [ + "data", + "reflectance" + ] + }, + "swir16": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/DD/2025/1/S2B_T28UDD_20250110T120355_L2A/B11.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "SWIR 1.6μm - 20m", + "eo:bands": [ + { + "name": "B11", + "common_name": "swir16", + "center_wavelength": 1.61, + "full_width_half_max": 0.143 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 399960, 0, -20, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12209d4fb63d60fdbb1a60cbc1e7489affee9c45556fc06cebbf01bf9c82bfd98f34", + "file:size": 4580867, + "roles": [ + "data", + "reflectance" + ] + }, + "wvp": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/DD/2025/1/S2B_T28UDD_20250110T120355_L2A/WVP.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Water Vapour (WVP)", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 399960, 0, -20, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "unit": "cm", + "scale": 0.001, + "offset": 0 + } + ], + "file:checksum": "122023974cded5bb589228df181ca430c8c9873a298e40e6abeda080b716042c135c", + "file:size": 120033, + "roles": [ + "data" + ] + }, + "nir08": { + "href": 
"https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/DD/2025/1/S2B_T28UDD_20250110T120355_L2A/B8A.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "NIR 2 - 20m", + "eo:bands": [ + { + "name": "B8A", + "common_name": "nir08", + "center_wavelength": 0.865, + "full_width_half_max": 0.033 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 399960, 0, -20, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220952b926daeefadbbf02d6abb78c13adb3cb6daf998d0ce3ee3932c51fdfa3c07", + "file:size": 4791768, + "roles": [ + "data", + "reflectance" + ] + }, + "scl": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/DD/2025/1/S2B_T28UDD_20250110T120355_L2A/SCL.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Scene classification map (SCL)", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 399960, 0, -20, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 20 + } + ], + "file:checksum": "1220a6ef3402bad56ffe7bcfe50f81b1ada6ce752623c3b2437896c5f7abaf52f82a", + "file:size": 184466, + "roles": [ + "data" + ] + }, + "aot": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/DD/2025/1/S2B_T28UDD_20250110T120355_L2A/AOT.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Aerosol optical thickness (AOT)", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 399960, 0, -20, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.001, + "offset": 0 + } + ], + "file:checksum": "12204118b1e0a9d86bfe943e2af1de7e7cbfdde5bce35cd19597d3cad795906ff15a", + "file:size": 133584, + "roles": [ + "data" + ] + }, + "coastal": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/DD/2025/1/S2B_T28UDD_20250110T120355_L2A/B01.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Coastal - 60m", + "eo:bands": [ + { + "name": "B01", + "common_name": "coastal", + "center_wavelength": 0.443, + "full_width_half_max": 0.027 + } + ], + "gsd": 60, + "proj:shape": [1830, 1830], + "proj:transform": [60, 0, 399960, 0, -60, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 60, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220c57b36366bb26cff43ba58f29db4d2e7da27ad9793fe9fb1c37c25f1af4af5e4", + "file:size": 562940, + "roles": [ + "data", + "reflectance" + ] + }, + "nir09": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/DD/2025/1/S2B_T28UDD_20250110T120355_L2A/B09.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "NIR 3 - 60m", + "eo:bands": [ + { + "name": "B09", + "common_name": "nir09", + "center_wavelength": 0.945, + "full_width_half_max": 0.026 + } + ], + "gsd": 60, + "proj:shape": [1830, 1830], + "proj:transform": [60, 0, 399960, 0, -60, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 60, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220eeed3e0cebbb75aa936744142ace9c12dd6a82f5ba3acebcde9c97ddbb9f932a", + 
"file:size": 577302, + "roles": [ + "data", + "reflectance" + ] + }, + "cloud": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/DD/2025/1/S2B_T28UDD_20250110T120355_L2A/CLD_20m.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Cloud Probabilities", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 399960, 0, -20, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 20 + } + ], + "file:checksum": "1220410f661fbe8e0a4220db2d2e14d9e8b0d148466c6e42745218e1a82561af8d60", + "file:size": 523213, + "roles": [ + "data", + "cloud" + ] + }, + "snow": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/DD/2025/1/S2B_T28UDD_20250110T120355_L2A/SNW_20m.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Snow Probabilities", + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 399960, 0, -20, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 20 + } + ], + "file:checksum": "1220b71d1bd3c5b571cbded0862e4f4785ad1517371019d7a93177f3156c2ecebea7", + "file:size": 53931, + "roles": [ + "data", + "snow-ice" + ] + }, + "preview": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/DD/2025/1/S2B_T28UDD_20250110T120355_L2A/L2A_PVI.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "True color preview", + "eo:bands": [ + { + "name": "B04", + "common_name": "red", + "center_wavelength": 0.665, + "full_width_half_max": 0.038 + }, + { + "name": "B03", + "common_name": "green", + "center_wavelength": 0.56, + "full_width_half_max": 0.045 + }, + { + "name": "B02", + "common_name": "blue", + "center_wavelength": 0.49, + "full_width_half_max": 0.098 + } + ], + "file:checksum": "1220bfa6a79b1279b8892a2a9297157d630645b78a29e97179329fb723a4c9f47654", + "file:size": 25349, + "roles": [ + "overview" + ] + }, + "granule_metadata": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/DD/2025/1/S2B_T28UDD_20250110T120355_L2A/metadata.xml", + "type": "application/xml", + "file:checksum": "122011626ced3f3137673cce60b87bcf1a209ad4ac5db04feb55806e90e3ea5f5701", + "file:size": 105054, + "roles": [ + "metadata" + ] + }, + "tileinfo_metadata": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/DD/2025/1/S2B_T28UDD_20250110T120355_L2A/tileInfo.json", + "type": "application/json", + "file:checksum": "122083498a89e0c8f200b5222497bad130a56c06ef24ef7c3ba3e236267aa963011b", + "file:size": 1535, + "roles": [ + "metadata" + ] + }, + "product_metadata": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/DD/2025/1/S2B_T28UDD_20250110T120355_L2A/product_metadata.xml", + "type": "application/xml", + "file:checksum": "1220007c793944fcc6a5856a609c3d7a90a7981441c0c68bcc8e42d6da314802db15", + "file:size": 54651, + "roles": [ + "metadata" + ] + }, + "thumbnail": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/DD/2025/1/S2B_T28UDD_20250110T120355_L2A/L2A_PVI.jpg", + "type": "image/jpeg", + "title": "Thumbnail of preview image", + "file:checksum": "1220e9c0f4836898f69022ec8dc67ae68ac26e772723680385d5606261fcff8d3b9a", + "file:size": 5672, + "roles": [ + "thumbnail" + ] + } + }, + 
"bbox": [-15.191184, 52.496602, -14.853738, 53.249561], + "stac_extensions": [ + "https://stac-extensions.github.io/eo/v1.1.0/schema.json", + "https://stac-extensions.github.io/file/v2.1.0/schema.json", + "https://stac-extensions.github.io/grid/v1.1.0/schema.json", + "https://stac-extensions.github.io/mgrs/v1.0.0/schema.json", + "https://stac-extensions.github.io/processing/v1.1.0/schema.json", + "https://stac-extensions.github.io/projection/v1.1.0/schema.json", + "https://stac-extensions.github.io/raster/v1.1.0/schema.json", + "https://stac-extensions.github.io/sentinel-2/v1.0.0/schema.json", + "https://stac-extensions.github.io/storage/v1.0.0/schema.json", + "https://stac-extensions.github.io/view/v1.0.0/schema.json" + ], + "collection": "sentinel-2-c1-l2a" + }, + { + "type": "Feature", + "stac_version": "1.0.0", + "id": "S2B_T28UGC_20250110T120355_L2A", + "properties": { + "created": "2025-01-10T16:24:15.582Z", + "platform": "sentinel-2b", + "constellation": "sentinel-2", + "instruments": [ + "msi" + ], + "eo:cloud_cover": 70.551813, + "proj:epsg": 32628, + "proj:centroid": { + "lat": 52.10021, + "lon": -11.44977 + }, + "mgrs:utm_zone": 28, + "mgrs:latitude_band": "U", + "mgrs:grid_square": "GC", + "grid:code": "MGRS-28UGC", + "view:azimuth": 288.991145822068, + "view:incidence_angle": 8.88645376535856, + "view:sun_azimuth": 168.984702494377, + "view:sun_elevation": 15.6474414959517, + "s2:tile_id": "S2B_OPER_MSI_L2A_TL_2BPS_20250110T154043_A040990_T28UGC_N05.11", + "s2:degraded_msi_data_percentage": 0.0257, + "s2:nodata_pixel_percentage": 70.794016, + "s2:saturated_defective_pixel_percentage": 0, + "s2:cloud_shadow_percentage": 0, + "s2:vegetation_percentage": 0.000011, + "s2:not_vegetated_percentage": 0.000125, + "s2:water_percentage": 29.44805, + "s2:unclassified_percentage": 0, + "s2:medium_proba_clouds_percentage": 19.48228, + "s2:high_proba_clouds_percentage": 50.357151, + "s2:thin_cirrus_percentage": 0.712383, + "s2:snow_ice_percentage": 0, + "s2:product_type": "S2MSI2A", + "s2:processing_baseline": "05.11", + "s2:product_uri": "S2B_MSIL2A_20250110T120359_N0511_R066_T28UGC_20250110T154043.SAFE", + "s2:generation_time": "2025-01-10T15:40:43.000000Z", + "s2:datatake_id": "GS2B_20250110T120359_040990_N05.11", + "s2:datatake_type": "INS-NOBS", + "s2:datastrip_id": "S2B_OPER_MSI_L2A_DS_2BPS_20250110T154043_S20250110T120355_N05.11", + "s2:reflectance_conversion_factor": 1.03425891111326, + "datetime": "2025-01-10T12:06:59.032000Z", + "earthsearch:payload_id": "roda-sentinel-2-c1-l2a/workflow-sentinel-2-c1-l2a-to-stac/b262a7540f8c30c340c7c4d8c2b08a7b", + "storage:platform": "AWS", + "storage:region": "us-west-2", + "storage:requester_pays": false, + "processing:software": { + "sentinel-2-c1-l2a-to-stac": "v2024.02.01" + }, + "updated": "2025-01-10T16:24:15.582Z" + }, + "geometry": { + "type": "Polygon", + "coordinates": [ + [ + [-12.0663701781865, 52.3140284614992], + [-12.0854674480038, 52.0229565808749], + [-11.0571384840821, 51.7940576782695], + [-10.8034742021281, 52.2758584353317], + [-12.0663701781865, 52.3140284614992] + ] + ] + }, + "links": [ + { + "rel": "self", + "type": "application/geo+json", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a/items/S2B_T28UGC_20250110T120355_L2A" + }, + { + "rel": "canonical", + "href": "s3://e84-earth-search-sentinel-data/sentinel-2-c1-l2a/28/U/GC/2025/1/S2B_T28UGC_20250110T120355_L2A/S2B_T28UGC_20250110T120355_L2A.json", + "type": "application/json" + }, + { + "rel": "via", + "href": 
"s3://sentinel-s2-l2a/tiles/28/U/GC/2025/1/10/0/metadata.xml", + "type": "application/xml", + "title": "Granule Metadata in Sinergize RODA Archive" + }, + { + "rel": "parent", + "type": "application/json", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a" + }, + { + "rel": "collection", + "type": "application/json", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a" + }, + { + "rel": "root", + "type": "application/json", + "href": "https://earth-search.aws.element84.com/v1" + }, + { + "rel": "thumbnail", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a/items/S2B_T28UGC_20250110T120355_L2A/thumbnail" + } + ], + "assets": { + "red": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/GC/2025/1/S2B_T28UGC_20250110T120355_L2A/B04.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red - 10m", + "eo:bands": [ + { + "name": "B04", + "common_name": "red", + "center_wavelength": 0.665, + "full_width_half_max": 0.038 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 699960, 0, -10, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12201cd9c10ddd829fac7b1648dd387ad4fbb1bca16f50c5e3c4fefe23407a9c4011", + "file:size": 58207551, + "roles": [ + "data", + "reflectance" + ] + }, + "green": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/GC/2025/1/S2B_T28UGC_20250110T120355_L2A/B03.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Green - 10m", + "eo:bands": [ + { + "name": "B03", + "common_name": "green", + "center_wavelength": 0.56, + "full_width_half_max": 0.045 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 699960, 0, -10, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220451b51412527a211547b7f1fb65618ab70152c0b5876a28710d82c7afb07228e", + "file:size": 58118240, + "roles": [ + "data", + "reflectance" + ] + }, + "blue": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/GC/2025/1/S2B_T28UGC_20250110T120355_L2A/B02.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Blue - 10m", + "eo:bands": [ + { + "name": "B02", + "common_name": "blue", + "center_wavelength": 0.49, + "full_width_half_max": 0.098 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 699960, 0, -10, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "122029c3cfa284c85df6ebfaa6b0225ca33c147c2666175b9faab17d726468c25684", + "file:size": 58042746, + "roles": [ + "data", + "reflectance" + ] + }, + "visual": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/GC/2025/1/S2B_T28UGC_20250110T120355_L2A/TCI.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "True color image", + "eo:bands": [ + { + "name": "B04", + "common_name": "red", + "center_wavelength": 0.665, + "full_width_half_max": 0.038 + }, + { + "name": "B03", + "common_name": "green", + 
"center_wavelength": 0.56, + "full_width_half_max": 0.045 + }, + { + "name": "B02", + "common_name": "blue", + "center_wavelength": 0.49, + "full_width_half_max": 0.098 + } + ], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 10 + }, + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 10 + }, + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 10 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 699960, 0, -10, 5800020], + "file:checksum": "1220569a22ecc77cb50b68399d0587f0d4b6d7912694d646593166e712e03088fe8a", + "file:size": 42064723, + "roles": [ + "visual" + ] + }, + "nir": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/GC/2025/1/S2B_T28UGC_20250110T120355_L2A/B08.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "NIR 1 - 10m", + "eo:bands": [ + { + "name": "B08", + "common_name": "nir", + "center_wavelength": 0.842, + "full_width_half_max": 0.145 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 699960, 0, -10, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "122086b8c7c05c080220c58c7837aa5cb177148c998ed3e20bcbc82b5f4483c08de2", + "file:size": 58822385, + "roles": [ + "data", + "reflectance" + ] + }, + "swir22": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/GC/2025/1/S2B_T28UGC_20250110T120355_L2A/B12.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "SWIR 2.2μm - 20m", + "eo:bands": [ + { + "name": "B12", + "common_name": "swir22", + "center_wavelength": 2.19, + "full_width_half_max": 0.242 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 699960, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220923638898bfc9c9f7146c955a4a57585779d374044ba21a7158e9810629894c4", + "file:size": 16095523, + "roles": [ + "data", + "reflectance" + ] + }, + "rededge2": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/GC/2025/1/S2B_T28UGC_20250110T120355_L2A/B06.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red Edge 2 - 20m", + "eo:bands": [ + { + "name": "B06", + "common_name": "rededge", + "center_wavelength": 0.74, + "full_width_half_max": 0.018 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 699960, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12200d043c3f0dcbee9be9b5e6c1bc5885ecaebb32730b97758c7a058f824b30ae00", + "file:size": 17051737, + "roles": [ + "data", + "reflectance" + ] + }, + "rededge3": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/GC/2025/1/S2B_T28UGC_20250110T120355_L2A/B07.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red Edge 3 - 20m", + "eo:bands": [ + { + "name": "B07", + "common_name": "rededge", + "center_wavelength": 0.783, + "full_width_half_max": 0.028 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 699960, 0, -20, 
5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12201496c6d8c6a667ce1068dcb750ce1ac590966a54524102be8fcfa41c1c97b95d", + "file:size": 17021702, + "roles": [ + "data", + "reflectance" + ] + }, + "rededge1": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/GC/2025/1/S2B_T28UGC_20250110T120355_L2A/B05.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red Edge 1 - 20m", + "eo:bands": [ + { + "name": "B05", + "common_name": "rededge", + "center_wavelength": 0.704, + "full_width_half_max": 0.019 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 699960, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220943299479e9fc7167e046c58922a0cd43468693b4e14a8086ebba2643c7501a1", + "file:size": 17061778, + "roles": [ + "data", + "reflectance" + ] + }, + "swir16": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/GC/2025/1/S2B_T28UGC_20250110T120355_L2A/B11.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "SWIR 1.6μm - 20m", + "eo:bands": [ + { + "name": "B11", + "common_name": "swir16", + "center_wavelength": 1.61, + "full_width_half_max": 0.143 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 699960, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220fa82c7aa49224096f09453a532e579f12fb83c7b57eb5efb3e01d7f86d87309f", + "file:size": 16432588, + "roles": [ + "data", + "reflectance" + ] + }, + "wvp": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/GC/2025/1/S2B_T28UGC_20250110T120355_L2A/WVP.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Water Vapour (WVP)", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 699960, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "unit": "cm", + "scale": 0.001, + "offset": 0 + } + ], + "file:checksum": "1220819c7233d673f986b8d77c116ec375ed68ac1f9d11665a3edfe513c73d26805e", + "file:size": 133428, + "roles": [ + "data" + ] + }, + "nir08": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/GC/2025/1/S2B_T28UGC_20250110T120355_L2A/B8A.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "NIR 2 - 20m", + "eo:bands": [ + { + "name": "B8A", + "common_name": "nir08", + "center_wavelength": 0.865, + "full_width_half_max": 0.033 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 699960, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "122008817be7d0e498c3ca35dc04ec5299db2f686150c71ac4e4d6c2219896656f68", + "file:size": 17052752, + "roles": [ + "data", + "reflectance" + ] + }, + "scl": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/GC/2025/1/S2B_T28UGC_20250110T120355_L2A/SCL.tif", + "type": "image/tiff; 
application=geotiff; profile=cloud-optimized", + "title": "Scene classification map (SCL)", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 699960, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 20 + } + ], + "file:checksum": "12201be991eef66b3ee9fb9993040702d74dc890ee3bc5f13767c779dfbbe49f3004", + "file:size": 645587, + "roles": [ + "data" + ] + }, + "aot": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/GC/2025/1/S2B_T28UGC_20250110T120355_L2A/AOT.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Aerosol optical thickness (AOT)", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 699960, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.001, + "offset": 0 + } + ], + "file:checksum": "1220633bfaa1c34b320443dbb216a3479c1e75b634b9785145005de6cf6130d864b9", + "file:size": 201548, + "roles": [ + "data" + ] + }, + "coastal": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/GC/2025/1/S2B_T28UGC_20250110T120355_L2A/B01.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Coastal - 60m", + "eo:bands": [ + { + "name": "B01", + "common_name": "coastal", + "center_wavelength": 0.443, + "full_width_half_max": 0.027 + } + ], + "gsd": 60, + "proj:shape": [1830, 1830], + "proj:transform": [60, 0, 699960, 0, -60, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 60, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12207141b1f48d8faf012d0acb1beebbf23b5c3df7c4b9a167d8d73ef7d15ebff353", + "file:size": 2012692, + "roles": [ + "data", + "reflectance" + ] + }, + "nir09": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/GC/2025/1/S2B_T28UGC_20250110T120355_L2A/B09.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "NIR 3 - 60m", + "eo:bands": [ + { + "name": "B09", + "common_name": "nir09", + "center_wavelength": 0.945, + "full_width_half_max": 0.026 + } + ], + "gsd": 60, + "proj:shape": [1830, 1830], + "proj:transform": [60, 0, 699960, 0, -60, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 60, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12202e70675c4b9ab5a47692cd176acea1f68e5d1c435edda8e5c97c66bb775ddb0d", + "file:size": 2032167, + "roles": [ + "data", + "reflectance" + ] + }, + "cloud": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/GC/2025/1/S2B_T28UGC_20250110T120355_L2A/CLD_20m.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Cloud Probabilities", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 699960, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 20 + } + ], + "file:checksum": "1220f14c816e5e28cba274f817a6a1a078ca93a33b79f8c6b5d2805f13f302df01a6", + "file:size": 2676520, + "roles": [ + "data", + "cloud" + ] + }, + "snow": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/GC/2025/1/S2B_T28UGC_20250110T120355_L2A/SNW_20m.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Snow 
Probabilities", + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 699960, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 20 + } + ], + "file:checksum": "122074b684a20156d8958e2cc54b9b3740f5217f825466c66d10e8df379e946e5a96", + "file:size": 53931, + "roles": [ + "data", + "snow-ice" + ] + }, + "preview": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/GC/2025/1/S2B_T28UGC_20250110T120355_L2A/L2A_PVI.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "True color preview", + "eo:bands": [ + { + "name": "B04", + "common_name": "red", + "center_wavelength": 0.665, + "full_width_half_max": 0.038 + }, + { + "name": "B03", + "common_name": "green", + "center_wavelength": 0.56, + "full_width_half_max": 0.045 + }, + { + "name": "B02", + "common_name": "blue", + "center_wavelength": 0.49, + "full_width_half_max": 0.098 + } + ], + "file:checksum": "1220f1f49f585c5c9d162e3fde0c6524393af24d0f98bd94008bf153e0d1f0d84545", + "file:size": 66287, + "roles": [ + "overview" + ] + }, + "granule_metadata": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/GC/2025/1/S2B_T28UGC_20250110T120355_L2A/metadata.xml", + "type": "application/xml", + "file:checksum": "1220cd75a8a624f29c0f31c90df9f791b916a3e495ea7c087b579e2e09770559a655", + "file:size": 351063, + "roles": [ + "metadata" + ] + }, + "tileinfo_metadata": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/GC/2025/1/S2B_T28UGC_20250110T120355_L2A/tileInfo.json", + "type": "application/json", + "file:checksum": "1220d1c9a251b350d7c950c3be70977893a5b2c10c2e124fb0b66093ed644cab670e", + "file:size": 1607, + "roles": [ + "metadata" + ] + }, + "product_metadata": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/GC/2025/1/S2B_T28UGC_20250110T120355_L2A/product_metadata.xml", + "type": "application/xml", + "file:checksum": "12208208152cf7b8ff0461a8f98b958c6b601c8d7e2b0daf2a6da54fcee0188d250d", + "file:size": 55288, + "roles": [ + "metadata" + ] + }, + "thumbnail": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/GC/2025/1/S2B_T28UGC_20250110T120355_L2A/L2A_PVI.jpg", + "type": "image/jpeg", + "title": "Thumbnail of preview image", + "file:checksum": "12204514017e6fc351907beb978f8bf0d1e782b2f3f48280443825827d0e5218ff1f", + "file:size": 13689, + "roles": [ + "thumbnail" + ] + } + }, + "bbox": [-12.085467, 51.794058, -10.803474, 52.314028], + "stac_extensions": [ + "https://stac-extensions.github.io/eo/v1.1.0/schema.json", + "https://stac-extensions.github.io/file/v2.1.0/schema.json", + "https://stac-extensions.github.io/grid/v1.1.0/schema.json", + "https://stac-extensions.github.io/mgrs/v1.0.0/schema.json", + "https://stac-extensions.github.io/processing/v1.1.0/schema.json", + "https://stac-extensions.github.io/projection/v1.1.0/schema.json", + "https://stac-extensions.github.io/raster/v1.1.0/schema.json", + "https://stac-extensions.github.io/sentinel-2/v1.0.0/schema.json", + "https://stac-extensions.github.io/storage/v1.0.0/schema.json", + "https://stac-extensions.github.io/view/v1.0.0/schema.json" + ], + "collection": "sentinel-2-c1-l2a" + }, + { + "type": "Feature", + "stac_version": "1.0.0", + "id": "S2B_T29ULT_20250110T120355_L2A", + "properties": { + "created": "2025-01-10T16:24:35.264Z", + "platform": 
"sentinel-2b", + "constellation": "sentinel-2", + "instruments": [ + "msi" + ], + "eo:cloud_cover": 76.681787, + "proj:epsg": 32629, + "proj:centroid": { + "lat": 52.11257, + "lon": -11.37077 + }, + "mgrs:utm_zone": 29, + "mgrs:latitude_band": "U", + "mgrs:grid_square": "LT", + "grid:code": "MGRS-29ULT", + "view:azimuth": 289.739549819918, + "view:incidence_angle": 9.25309699449269, + "view:sun_azimuth": 169.170970252047, + "view:sun_elevation": 15.6313798177967, + "s2:tile_id": "S2B_OPER_MSI_L2A_TL_2BPS_20250110T154043_A040990_T29ULT_N05.11", + "s2:degraded_msi_data_percentage": 0.0281, + "s2:nodata_pixel_percentage": 71.433568, + "s2:saturated_defective_pixel_percentage": 0, + "s2:cloud_shadow_percentage": 0, + "s2:vegetation_percentage": 0.000012, + "s2:not_vegetated_percentage": 0.000186, + "s2:water_percentage": 23.318014, + "s2:unclassified_percentage": 0, + "s2:medium_proba_clouds_percentage": 35.262308, + "s2:high_proba_clouds_percentage": 41.088226, + "s2:thin_cirrus_percentage": 0.331256, + "s2:snow_ice_percentage": 0, + "s2:product_type": "S2MSI2A", + "s2:processing_baseline": "05.11", + "s2:product_uri": "S2B_MSIL2A_20250110T120359_N0511_R066_T29ULT_20250110T154043.SAFE", + "s2:generation_time": "2025-01-10T15:40:43.000000Z", + "s2:datatake_id": "GS2B_20250110T120359_040990_N05.11", + "s2:datatake_type": "INS-NOBS", + "s2:datastrip_id": "S2B_OPER_MSI_L2A_DS_2BPS_20250110T154043_S20250110T120355_N05.11", + "s2:reflectance_conversion_factor": 1.03425891111326, + "datetime": "2025-01-10T12:06:58.560000Z", + "earthsearch:payload_id": "roda-sentinel-2-c1-l2a/workflow-sentinel-2-c1-l2a-to-stac/cc5f14a543c20f0fbb2d8a793a17b244", + "storage:platform": "AWS", + "storage:region": "us-west-2", + "storage:requester_pays": false, + "processing:software": { + "sentinel-2-c1-l2a-to-stac": "v2024.02.01" + }, + "updated": "2025-01-10T16:24:35.264Z" + }, + "geometry": { + "type": "Polygon", + "coordinates": [ + [ + [-11.9341865273641, 52.3140146232143], + [-11.9127080253672, 51.9863614962068], + [-11.0571957464118, 51.7939912466001], + [-10.7711148716862, 52.3371889236931], + [-11.9341865273641, 52.3140146232143] + ] + ] + }, + "links": [ + { + "rel": "self", + "type": "application/geo+json", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a/items/S2B_T29ULT_20250110T120355_L2A" + }, + { + "rel": "canonical", + "href": "s3://e84-earth-search-sentinel-data/sentinel-2-c1-l2a/29/U/LT/2025/1/S2B_T29ULT_20250110T120355_L2A/S2B_T29ULT_20250110T120355_L2A.json", + "type": "application/json" + }, + { + "rel": "via", + "href": "s3://sentinel-s2-l2a/tiles/29/U/LT/2025/1/10/0/metadata.xml", + "type": "application/xml", + "title": "Granule Metadata in Sinergize RODA Archive" + }, + { + "rel": "parent", + "type": "application/json", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a" + }, + { + "rel": "collection", + "type": "application/json", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a" + }, + { + "rel": "root", + "type": "application/json", + "href": "https://earth-search.aws.element84.com/v1" + }, + { + "rel": "thumbnail", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a/items/S2B_T29ULT_20250110T120355_L2A/thumbnail" + } + ], + "assets": { + "red": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/LT/2025/1/S2B_T29ULT_20250110T120355_L2A/B04.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", 
+ "title": "Red - 10m", + "eo:bands": [ + { + "name": "B04", + "common_name": "red", + "center_wavelength": 0.665, + "full_width_half_max": 0.038 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 300000, 0, -10, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12203e227546657330900c61d6557c4750c3262eb5eeb70edd161878f0db44cd933f", + "file:size": 56941462, + "roles": [ + "data", + "reflectance" + ] + }, + "green": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/LT/2025/1/S2B_T29ULT_20250110T120355_L2A/B03.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Green - 10m", + "eo:bands": [ + { + "name": "B03", + "common_name": "green", + "center_wavelength": 0.56, + "full_width_half_max": 0.045 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 300000, 0, -10, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "122025246795e56b26d4f7d9fc31b995680db8d55ba9ad507ebc922be4509769f6c9", + "file:size": 56786069, + "roles": [ + "data", + "reflectance" + ] + }, + "blue": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/LT/2025/1/S2B_T29ULT_20250110T120355_L2A/B02.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Blue - 10m", + "eo:bands": [ + { + "name": "B02", + "common_name": "blue", + "center_wavelength": 0.49, + "full_width_half_max": 0.098 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 300000, 0, -10, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "122039d4d6a228946d25de5f8480487f6c492398ed7d8a2c991648d04d3cd85a9935", + "file:size": 56623575, + "roles": [ + "data", + "reflectance" + ] + }, + "visual": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/LT/2025/1/S2B_T29ULT_20250110T120355_L2A/TCI.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "True color image", + "eo:bands": [ + { + "name": "B04", + "common_name": "red", + "center_wavelength": 0.665, + "full_width_half_max": 0.038 + }, + { + "name": "B03", + "common_name": "green", + "center_wavelength": 0.56, + "full_width_half_max": 0.045 + }, + { + "name": "B02", + "common_name": "blue", + "center_wavelength": 0.49, + "full_width_half_max": 0.098 + } + ], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 10 + }, + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 10 + }, + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 10 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 300000, 0, -10, 5800020], + "file:checksum": "122010e77d3a354175a4f00e427377412b14cb594c0ef55f2744eee10062fc94494d", + "file:size": 36438161, + "roles": [ + "visual" + ] + }, + "nir": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/LT/2025/1/S2B_T29ULT_20250110T120355_L2A/B08.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "NIR 1 - 10m", + "eo:bands": [ + { + "name": "B08", + 
"common_name": "nir", + "center_wavelength": 0.842, + "full_width_half_max": 0.145 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 300000, 0, -10, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12203493df70a2489aae8e8c51256ab1b149136d0399aeff8bb521944d0a373ffb8e", + "file:size": 57557157, + "roles": [ + "data", + "reflectance" + ] + }, + "swir22": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/LT/2025/1/S2B_T29ULT_20250110T120355_L2A/B12.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "SWIR 2.2μm - 20m", + "eo:bands": [ + { + "name": "B12", + "common_name": "swir22", + "center_wavelength": 2.19, + "full_width_half_max": 0.242 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 300000, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12207a1e16f696777c391dc53358b0a0892061539000855005f8cffb5ef1a00659c4", + "file:size": 15781560, + "roles": [ + "data", + "reflectance" + ] + }, + "rededge2": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/LT/2025/1/S2B_T29ULT_20250110T120355_L2A/B06.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red Edge 2 - 20m", + "eo:bands": [ + { + "name": "B06", + "common_name": "rededge", + "center_wavelength": 0.74, + "full_width_half_max": 0.018 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 300000, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12203277ff67ac82a6260dd05378d0880c66af3280495382175f61b0b152cb72e671", + "file:size": 16820810, + "roles": [ + "data", + "reflectance" + ] + }, + "rededge3": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/LT/2025/1/S2B_T29ULT_20250110T120355_L2A/B07.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red Edge 3 - 20m", + "eo:bands": [ + { + "name": "B07", + "common_name": "rededge", + "center_wavelength": 0.783, + "full_width_half_max": 0.028 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 300000, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220ce612d9136bc91eb41bc1ac94486ce6c46c6fd80f6a3c2370bb165b54e22d264", + "file:size": 16799086, + "roles": [ + "data", + "reflectance" + ] + }, + "rededge1": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/LT/2025/1/S2B_T29ULT_20250110T120355_L2A/B05.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red Edge 1 - 20m", + "eo:bands": [ + { + "name": "B05", + "common_name": "rededge", + "center_wavelength": 0.704, + "full_width_half_max": 0.019 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 300000, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": 
"1220f90ef2d43a3d5a38ccfa2dc7bbcebef1241ee73f8cd34f016f9fd1dd4195050c", + "file:size": 16823874, + "roles": [ + "data", + "reflectance" + ] + }, + "swir16": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/LT/2025/1/S2B_T29ULT_20250110T120355_L2A/B11.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "SWIR 1.6μm - 20m", + "eo:bands": [ + { + "name": "B11", + "common_name": "swir16", + "center_wavelength": 1.61, + "full_width_half_max": 0.143 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 300000, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12206eaa8e87752b95598c4ad90eaead667c44881553ded86b19479eda4aa18b47fc", + "file:size": 16192304, + "roles": [ + "data", + "reflectance" + ] + }, + "wvp": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/LT/2025/1/S2B_T29ULT_20250110T120355_L2A/WVP.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Water Vapour (WVP)", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 300000, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "unit": "cm", + "scale": 0.001, + "offset": 0 + } + ], + "file:checksum": "1220549c7f1f9556aa87f2e9e2306077af27a4871edd6f23a7b7483a945def3a3eb9", + "file:size": 134178, + "roles": [ + "data" + ] + }, + "nir08": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/LT/2025/1/S2B_T29ULT_20250110T120355_L2A/B8A.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "NIR 2 - 20m", + "eo:bands": [ + { + "name": "B8A", + "common_name": "nir08", + "center_wavelength": 0.865, + "full_width_half_max": 0.033 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 300000, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12207d766a2efd3de74d5956711abca5765dd5734ca28672459eb4deb45421228eb7", + "file:size": 16836512, + "roles": [ + "data", + "reflectance" + ] + }, + "scl": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/LT/2025/1/S2B_T29ULT_20250110T120355_L2A/SCL.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Scene classification map (SCL)", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 300000, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 20 + } + ], + "file:checksum": "1220cef0ac90be0d43aa4a3a7f4e32f86304227b58c4e1d6f12398f02ede7d0f1916", + "file:size": 661235, + "roles": [ + "data" + ] + }, + "aot": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/LT/2025/1/S2B_T29ULT_20250110T120355_L2A/AOT.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Aerosol optical thickness (AOT)", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 300000, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.001, + "offset": 0 + } + ], + "file:checksum": 
"12206d45dd57516e4491befa01b0fdc88fb8b4ac6672e9b61e302b6f8f42bc5979de", + "file:size": 198595, + "roles": [ + "data" + ] + }, + "coastal": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/LT/2025/1/S2B_T29ULT_20250110T120355_L2A/B01.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Coastal - 60m", + "eo:bands": [ + { + "name": "B01", + "common_name": "coastal", + "center_wavelength": 0.443, + "full_width_half_max": 0.027 + } + ], + "gsd": 60, + "proj:shape": [1830, 1830], + "proj:transform": [60, 0, 300000, 0, -60, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 60, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220ddc842e5e2933d01235e0bb88ea63c867ce228316a0c5277f7cbf6f1b3221769", + "file:size": 1991573, + "roles": [ + "data", + "reflectance" + ] + }, + "nir09": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/LT/2025/1/S2B_T29ULT_20250110T120355_L2A/B09.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "NIR 3 - 60m", + "eo:bands": [ + { + "name": "B09", + "common_name": "nir09", + "center_wavelength": 0.945, + "full_width_half_max": 0.026 + } + ], + "gsd": 60, + "proj:shape": [1830, 1830], + "proj:transform": [60, 0, 300000, 0, -60, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 60, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220225bec7e6941aba12271c19bbe8d9c5c14ea3fc71262b2d4b75880c53ac1ee39", + "file:size": 2016382, + "roles": [ + "data", + "reflectance" + ] + }, + "cloud": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/LT/2025/1/S2B_T29ULT_20250110T120355_L2A/CLD_20m.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Cloud Probabilities", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 300000, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 20 + } + ], + "file:checksum": "1220a755be67c7393e95e506909fdcfd568f6d43a6fa42cbccfc5232584d02d1eaa4", + "file:size": 2162795, + "roles": [ + "data", + "cloud" + ] + }, + "snow": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/LT/2025/1/S2B_T29ULT_20250110T120355_L2A/SNW_20m.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Snow Probabilities", + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 300000, 0, -20, 5800020], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 20 + } + ], + "file:checksum": "122099280d6b49f45f813f5c9a54c08d31cae51069d59d9558d551878d41e7dab826", + "file:size": 370705, + "roles": [ + "data", + "snow-ice" + ] + }, + "preview": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/LT/2025/1/S2B_T29ULT_20250110T120355_L2A/L2A_PVI.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "True color preview", + "eo:bands": [ + { + "name": "B04", + "common_name": "red", + "center_wavelength": 0.665, + "full_width_half_max": 0.038 + }, + { + "name": "B03", + "common_name": "green", + "center_wavelength": 0.56, + "full_width_half_max": 0.045 + }, + { + "name": "B02", + "common_name": "blue", + "center_wavelength": 0.49, + 
"full_width_half_max": 0.098 + } + ], + "file:checksum": "12204c088e206ddaf24ab7fcba7afb6b533fab045b82c4272f76aa21f56f3c4af391", + "file:size": 60549, + "roles": [ + "overview" + ] + }, + "granule_metadata": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/LT/2025/1/S2B_T29ULT_20250110T120355_L2A/metadata.xml", + "type": "application/xml", + "file:checksum": "12201c67cbc583f92626907c855af5e64967e8c9137a7c1c633f260c38900e5b8f6e", + "file:size": 350629, + "roles": [ + "metadata" + ] + }, + "tileinfo_metadata": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/LT/2025/1/S2B_T29ULT_20250110T120355_L2A/tileInfo.json", + "type": "application/json", + "file:checksum": "12205a7be274328fa33fbbb3aa6c6c482371a65b51770862aaafe3c67fcc4860ab32", + "file:size": 1568, + "roles": [ + "metadata" + ] + }, + "product_metadata": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/LT/2025/1/S2B_T29ULT_20250110T120355_L2A/product_metadata.xml", + "type": "application/xml", + "file:checksum": "1220d52627f80e6fe4575c7128c6971309760e7f6097a1c1965ffb298cc4bebefb83", + "file:size": 55102, + "roles": [ + "metadata" + ] + }, + "thumbnail": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/LT/2025/1/S2B_T29ULT_20250110T120355_L2A/L2A_PVI.jpg", + "type": "image/jpeg", + "title": "Thumbnail of preview image", + "file:checksum": "1220360fbeaa6ce13b08877966ad223cdaf36a8bae092899879eba5da47e1dcf3523", + "file:size": 13517, + "roles": [ + "thumbnail" + ] + } + }, + "bbox": [-11.934187, 51.793991, -10.771115, 52.337189], + "stac_extensions": [ + "https://stac-extensions.github.io/eo/v1.1.0/schema.json", + "https://stac-extensions.github.io/file/v2.1.0/schema.json", + "https://stac-extensions.github.io/grid/v1.1.0/schema.json", + "https://stac-extensions.github.io/mgrs/v1.0.0/schema.json", + "https://stac-extensions.github.io/processing/v1.1.0/schema.json", + "https://stac-extensions.github.io/projection/v1.1.0/schema.json", + "https://stac-extensions.github.io/raster/v1.1.0/schema.json", + "https://stac-extensions.github.io/sentinel-2/v1.0.0/schema.json", + "https://stac-extensions.github.io/storage/v1.0.0/schema.json", + "https://stac-extensions.github.io/view/v1.0.0/schema.json" + ], + "collection": "sentinel-2-c1-l2a" + }, + { + "type": "Feature", + "stac_version": "1.0.0", + "id": "S2B_T29UMU_20250110T120355_L2A", + "properties": { + "created": "2025-01-10T16:22:10.669Z", + "platform": "sentinel-2b", + "constellation": "sentinel-2", + "instruments": [ + "msi" + ], + "eo:cloud_cover": 99.851912, + "proj:epsg": 32629, + "proj:centroid": { + "lat": 53.11473, + "lon": -10.41947 + }, + "mgrs:utm_zone": 29, + "mgrs:latitude_band": "U", + "mgrs:grid_square": "MU", + "grid:code": "MGRS-29UMU", + "view:azimuth": 295.133344244557, + "view:incidence_angle": 11.4258216022895, + "view:sun_azimuth": 170.514168318609, + "view:sun_elevation": 14.876218695379, + "s2:tile_id": "S2B_OPER_MSI_L2A_TL_2BPS_20250110T154043_A040990_T29UMU_N05.11", + "s2:degraded_msi_data_percentage": 0, + "s2:nodata_pixel_percentage": 97.318125, + "s2:saturated_defective_pixel_percentage": 0, + "s2:cloud_shadow_percentage": 0, + "s2:vegetation_percentage": 0, + "s2:not_vegetated_percentage": 0.001979, + "s2:water_percentage": 0.146106, + "s2:unclassified_percentage": 0, + "s2:medium_proba_clouds_percentage": 9.566632, + "s2:high_proba_clouds_percentage": 
90.11963, + "s2:thin_cirrus_percentage": 0.165652, + "s2:snow_ice_percentage": 0, + "s2:product_type": "S2MSI2A", + "s2:processing_baseline": "05.11", + "s2:product_uri": "S2B_MSIL2A_20250110T120359_N0511_R066_T29UMU_20250110T154043.SAFE", + "s2:generation_time": "2025-01-10T15:40:43.000000Z", + "s2:datatake_id": "GS2B_20250110T120359_040990_N05.11", + "s2:datatake_type": "INS-NOBS", + "s2:datastrip_id": "S2B_OPER_MSI_L2A_DS_2BPS_20250110T154043_S20250110T120355_N05.11", + "s2:reflectance_conversion_factor": 1.03425891111326, + "datetime": "2025-01-10T12:06:38.977000Z", + "earthsearch:payload_id": "roda-sentinel-2-c1-l2a/workflow-sentinel-2-c1-l2a-to-stac/d7d4c025f2d4f3b3e51a14d9d98f1575", + "storage:platform": "AWS", + "storage:region": "us-west-2", + "storage:requester_pays": false, + "processing:software": { + "sentinel-2-c1-l2a-to-stac": "v2024.02.01" + }, + "updated": "2025-01-10T16:22:10.669Z" + }, + "geometry": { + "type": "Polygon", + "coordinates": [ + [ + [-10.4990442948644, 53.2401920169908], + [-10.4859475114454, 52.8611868644894], + [-10.2734217675276, 53.242816076468], + [-10.4990442948644, 53.2401920169908] + ] + ] + }, + "links": [ + { + "rel": "self", + "type": "application/geo+json", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a/items/S2B_T29UMU_20250110T120355_L2A" + }, + { + "rel": "canonical", + "href": "s3://e84-earth-search-sentinel-data/sentinel-2-c1-l2a/29/U/MU/2025/1/S2B_T29UMU_20250110T120355_L2A/S2B_T29UMU_20250110T120355_L2A.json", + "type": "application/json" + }, + { + "rel": "via", + "href": "s3://sentinel-s2-l2a/tiles/29/U/MU/2025/1/10/0/metadata.xml", + "type": "application/xml", + "title": "Granule Metadata in Sinergize RODA Archive" + }, + { + "rel": "parent", + "type": "application/json", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a" + }, + { + "rel": "collection", + "type": "application/json", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a" + }, + { + "rel": "root", + "type": "application/json", + "href": "https://earth-search.aws.element84.com/v1" + }, + { + "rel": "thumbnail", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a/items/S2B_T29UMU_20250110T120355_L2A/thumbnail" + } + ], + "assets": { + "red": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/MU/2025/1/S2B_T29UMU_20250110T120355_L2A/B04.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red - 10m", + "eo:bands": [ + { + "name": "B04", + "common_name": "red", + "center_wavelength": 0.665, + "full_width_half_max": 0.038 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 399960, 0, -10, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220d309c47c8b7acb6ae4c1e17ec394013535cc1013c9c9735d1f83e257272bb3aa", + "file:size": 5111486, + "roles": [ + "data", + "reflectance" + ] + }, + "green": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/MU/2025/1/S2B_T29UMU_20250110T120355_L2A/B03.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Green - 10m", + "eo:bands": [ + { + "name": "B03", + "common_name": "green", + "center_wavelength": 0.56, + "full_width_half_max": 0.045 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + 
"proj:transform": [10, 0, 399960, 0, -10, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12207891e5d0515df9add88bbe4bd1fe100fbcaf720e34972134abc213b78c02b076", + "file:size": 5100143, + "roles": [ + "data", + "reflectance" + ] + }, + "blue": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/MU/2025/1/S2B_T29UMU_20250110T120355_L2A/B02.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Blue - 10m", + "eo:bands": [ + { + "name": "B02", + "common_name": "blue", + "center_wavelength": 0.49, + "full_width_half_max": 0.098 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 399960, 0, -10, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12202be80f386072c1c9af35f23f2dad5abb0ea0670c275a50bee9fa47a65a97437b", + "file:size": 5092633, + "roles": [ + "data", + "reflectance" + ] + }, + "visual": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/MU/2025/1/S2B_T29UMU_20250110T120355_L2A/TCI.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "True color image", + "eo:bands": [ + { + "name": "B04", + "common_name": "red", + "center_wavelength": 0.665, + "full_width_half_max": 0.038 + }, + { + "name": "B03", + "common_name": "green", + "center_wavelength": 0.56, + "full_width_half_max": 0.045 + }, + { + "name": "B02", + "common_name": "blue", + "center_wavelength": 0.49, + "full_width_half_max": 0.098 + } + ], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 10 + }, + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 10 + }, + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 10 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 399960, 0, -10, 5900040], + "file:checksum": "1220101e3fd7208a1cb7deec2cb6d8d27db55beeefe9e348787feb9cf0d8ddc67da8", + "file:size": 562065, + "roles": [ + "visual" + ] + }, + "nir": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/MU/2025/1/S2B_T29UMU_20250110T120355_L2A/B08.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "NIR 1 - 10m", + "eo:bands": [ + { + "name": "B08", + "common_name": "nir", + "center_wavelength": 0.842, + "full_width_half_max": 0.145 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 399960, 0, -10, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "122012b1bd512d34f36373af0856da95c7f8a4bd87e3608736484c5244e4159d3012", + "file:size": 5187449, + "roles": [ + "data", + "reflectance" + ] + }, + "swir22": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/MU/2025/1/S2B_T29UMU_20250110T120355_L2A/B12.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "SWIR 2.2μm - 20m", + "eo:bands": [ + { + "name": "B12", + "common_name": "swir22", + "center_wavelength": 2.19, + "full_width_half_max": 0.242 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 399960, 0, -20, 5900040], + 
"raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12208f7d6fe1f69da7d1e94610dece34be45ea32a49e47577c644090eaf0bcb22611", + "file:size": 1596928, + "roles": [ + "data", + "reflectance" + ] + }, + "rededge2": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/MU/2025/1/S2B_T29UMU_20250110T120355_L2A/B06.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red Edge 2 - 20m", + "eo:bands": [ + { + "name": "B06", + "common_name": "rededge", + "center_wavelength": 0.74, + "full_width_half_max": 0.018 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 399960, 0, -20, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220ca6df763e9ae2c033cb78bdcea27ad9d8885ff4bede1413576bb66ff28de366b", + "file:size": 1669812, + "roles": [ + "data", + "reflectance" + ] + }, + "rededge3": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/MU/2025/1/S2B_T29UMU_20250110T120355_L2A/B07.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red Edge 3 - 20m", + "eo:bands": [ + { + "name": "B07", + "common_name": "rededge", + "center_wavelength": 0.783, + "full_width_half_max": 0.028 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 399960, 0, -20, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "122050956255cbee8d0801242c316be3c8dca2018b9be3c555a49df3919a991396c4", + "file:size": 1666645, + "roles": [ + "data", + "reflectance" + ] + }, + "rededge1": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/MU/2025/1/S2B_T29UMU_20250110T120355_L2A/B05.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red Edge 1 - 20m", + "eo:bands": [ + { + "name": "B05", + "common_name": "rededge", + "center_wavelength": 0.704, + "full_width_half_max": 0.019 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 399960, 0, -20, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220a29376f79d03afef311addd6ae68b447fec13192e4c96dc19cb6875bf47abc43", + "file:size": 1670391, + "roles": [ + "data", + "reflectance" + ] + }, + "swir16": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/MU/2025/1/S2B_T29UMU_20250110T120355_L2A/B11.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "SWIR 1.6μm - 20m", + "eo:bands": [ + { + "name": "B11", + "common_name": "swir16", + "center_wavelength": 1.61, + "full_width_half_max": 0.143 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 399960, 0, -20, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220792a6aa646da0a6da7185b9571185672519b6c5619d0184c2c7f33a512cf58fd", + "file:size": 1619071, + "roles": [ + "data", + "reflectance" + ] + }, + "wvp": { + "href": 
"https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/MU/2025/1/S2B_T29UMU_20250110T120355_L2A/WVP.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Water Vapour (WVP)", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 399960, 0, -20, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "unit": "cm", + "scale": 0.001, + "offset": 0 + } + ], + "file:checksum": "122096a7ec83f826c2d64ddfffdf0de254ab329380fe27a62a5316684905d3d2fb57", + "file:size": 105171, + "roles": [ + "data" + ] + }, + "nir08": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/MU/2025/1/S2B_T29UMU_20250110T120355_L2A/B8A.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "NIR 2 - 20m", + "eo:bands": [ + { + "name": "B8A", + "common_name": "nir08", + "center_wavelength": 0.865, + "full_width_half_max": 0.033 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 399960, 0, -20, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12200d824dc3b6cc3b2a25a1f4418439599f9855bf56a45e0743366b5bf79fe6e430", + "file:size": 1673352, + "roles": [ + "data", + "reflectance" + ] + }, + "scl": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/MU/2025/1/S2B_T29UMU_20250110T120355_L2A/SCL.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Scene classification map (SCL)", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 399960, 0, -20, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 20 + } + ], + "file:checksum": "1220b1b8d5243264af9a9a6c20e3c60bbc1cf405376b735d78cf80aa23b526237b68", + "file:size": 81560, + "roles": [ + "data" + ] + }, + "aot": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/MU/2025/1/S2B_T29UMU_20250110T120355_L2A/AOT.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Aerosol optical thickness (AOT)", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 399960, 0, -20, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.001, + "offset": 0 + } + ], + "file:checksum": "12204f80ce4edb419f177e533dba4ac913145de59cc8e384be1455217e2fed4e25e0", + "file:size": 113778, + "roles": [ + "data" + ] + }, + "coastal": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/MU/2025/1/S2B_T29UMU_20250110T120355_L2A/B01.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Coastal - 60m", + "eo:bands": [ + { + "name": "B01", + "common_name": "coastal", + "center_wavelength": 0.443, + "full_width_half_max": 0.027 + } + ], + "gsd": 60, + "proj:shape": [1830, 1830], + "proj:transform": [60, 0, 399960, 0, -60, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 60, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220708a0c79160b88446637eeeee0bb16b66b7553f6472b29b65275dd5c0944ff14", + "file:size": 204521, + "roles": [ + "data", + "reflectance" + ] + }, + "nir09": { + "href": 
"https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/MU/2025/1/S2B_T29UMU_20250110T120355_L2A/B09.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "NIR 3 - 60m", + "eo:bands": [ + { + "name": "B09", + "common_name": "nir09", + "center_wavelength": 0.945, + "full_width_half_max": 0.026 + } + ], + "gsd": 60, + "proj:shape": [1830, 1830], + "proj:transform": [60, 0, 399960, 0, -60, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 60, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12209c228a60e2ae567cfe920c25213db4ea1d78a90de769f0e8dc1f1d946db2538a", + "file:size": 212847, + "roles": [ + "data", + "reflectance" + ] + }, + "cloud": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/MU/2025/1/S2B_T29UMU_20250110T120355_L2A/CLD_20m.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Cloud Probabilities", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 399960, 0, -20, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 20 + } + ], + "file:checksum": "1220a3f9f026bd5025f82f3adf0f6c2b419771977a89dd84da2726802de5492beb6f", + "file:size": 90369, + "roles": [ + "data", + "cloud" + ] + }, + "snow": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/MU/2025/1/S2B_T29UMU_20250110T120355_L2A/SNW_20m.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Snow Probabilities", + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 399960, 0, -20, 5900040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 20 + } + ], + "file:checksum": "1220a94513ceac58211fe15df962a282ba81af3b7a6c86448fdd158af7e5a72571d7", + "file:size": 77916, + "roles": [ + "data", + "snow-ice" + ] + }, + "preview": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/MU/2025/1/S2B_T29UMU_20250110T120355_L2A/L2A_PVI.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "True color preview", + "eo:bands": [ + { + "name": "B04", + "common_name": "red", + "center_wavelength": 0.665, + "full_width_half_max": 0.038 + }, + { + "name": "B03", + "common_name": "green", + "center_wavelength": 0.56, + "full_width_half_max": 0.045 + }, + { + "name": "B02", + "common_name": "blue", + "center_wavelength": 0.49, + "full_width_half_max": 0.098 + } + ], + "file:checksum": "1220a918fb0a9713c7662bea5d722e61ea704c8f72df11172c45b6faa37709bf0ba7", + "file:size": 5132, + "roles": [ + "overview" + ] + }, + "granule_metadata": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/MU/2025/1/S2B_T29UMU_20250110T120355_L2A/metadata.xml", + "type": "application/xml", + "file:checksum": "1220688e99c289650d08535e4526f0b30b9f827f967e201a550181d5ba95cbd06706", + "file:size": 101884, + "roles": [ + "metadata" + ] + }, + "tileinfo_metadata": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/MU/2025/1/S2B_T29UMU_20250110T120355_L2A/tileInfo.json", + "type": "application/json", + "file:checksum": "12209891f3af981af6621dedc4fb99d9b1e69346d48bb303bd5dd0d3c3c38798b74d", + "file:size": 1482, + "roles": [ + "metadata" + ] + }, + "product_metadata": { + "href": 
"https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/MU/2025/1/S2B_T29UMU_20250110T120355_L2A/product_metadata.xml", + "type": "application/xml", + "file:checksum": "1220b390185e418500d3d7c46dfd44f6f490cb6771c5c2b11c8ddd909345efa0b488", + "file:size": 54497, + "roles": [ + "metadata" + ] + }, + "thumbnail": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/MU/2025/1/S2B_T29UMU_20250110T120355_L2A/L2A_PVI.jpg", + "type": "image/jpeg", + "title": "Thumbnail of preview image", + "file:checksum": "1220ad716bb6e2c9c8aeec0e64984b21d504b7fcad9c6fb68ee4c7fbd1e86a76ca48", + "file:size": 3202, + "roles": [ + "thumbnail" + ] + } + }, + "bbox": [-10.499044, 52.861187, -10.273422, 53.242816], + "stac_extensions": [ + "https://stac-extensions.github.io/eo/v1.1.0/schema.json", + "https://stac-extensions.github.io/file/v2.1.0/schema.json", + "https://stac-extensions.github.io/grid/v1.1.0/schema.json", + "https://stac-extensions.github.io/mgrs/v1.0.0/schema.json", + "https://stac-extensions.github.io/processing/v1.1.0/schema.json", + "https://stac-extensions.github.io/projection/v1.1.0/schema.json", + "https://stac-extensions.github.io/raster/v1.1.0/schema.json", + "https://stac-extensions.github.io/sentinel-2/v1.0.0/schema.json", + "https://stac-extensions.github.io/storage/v1.0.0/schema.json", + "https://stac-extensions.github.io/view/v1.0.0/schema.json" + ], + "collection": "sentinel-2-c1-l2a" + }, + { + "type": "Feature", + "stac_version": "1.0.0", + "id": "S2B_T28UEG_20250110T120355_L2A", + "properties": { + "created": "2025-01-10T16:24:42.505Z", + "platform": "sentinel-2b", + "constellation": "sentinel-2", + "instruments": [ + "msi" + ], + "eo:cloud_cover": 100, + "proj:epsg": 32628, + "proj:centroid": { + "lat": 55.38441, + "lon": -13.58014 + }, + "mgrs:utm_zone": 28, + "mgrs:latitude_band": "U", + "mgrs:grid_square": "EG", + "grid:code": "MGRS-28UEG", + "view:azimuth": 107.206073425202, + "view:incidence_angle": 10.2725275481198, + "view:sun_azimuth": 166.291906822544, + "view:sun_elevation": 11.6956433007712, + "s2:tile_id": "S2B_OPER_MSI_L2A_TL_2BPS_20250110T154043_A040990_T28UEG_N05.11", + "s2:degraded_msi_data_percentage": 0, + "s2:nodata_pixel_percentage": 65.152574, + "s2:saturated_defective_pixel_percentage": 0, + "s2:cloud_shadow_percentage": 0, + "s2:vegetation_percentage": 0, + "s2:not_vegetated_percentage": 0, + "s2:water_percentage": 0, + "s2:unclassified_percentage": 0, + "s2:medium_proba_clouds_percentage": 1.233756, + "s2:high_proba_clouds_percentage": 98.766243, + "s2:thin_cirrus_percentage": 0, + "s2:snow_ice_percentage": 0, + "s2:product_type": "S2MSI2A", + "s2:processing_baseline": "05.11", + "s2:product_uri": "S2B_MSIL2A_20250110T120359_N0511_R066_T28UEG_20250110T154043.SAFE", + "s2:generation_time": "2025-01-10T15:40:43.000000Z", + "s2:datatake_id": "GS2B_20250110T120359_040990_N05.11", + "s2:datatake_type": "INS-NOBS", + "s2:datastrip_id": "S2B_OPER_MSI_L2A_DS_2BPS_20250110T154043_S20250110T120355_N05.11", + "s2:reflectance_conversion_factor": 1.03425891111326, + "datetime": "2025-01-10T12:06:13.388000Z", + "earthsearch:payload_id": "roda-sentinel-2-c1-l2a/workflow-sentinel-2-c1-l2a-to-stac/3f433637e07c4b2600b19a2e7741e644", + "storage:platform": "AWS", + "storage:region": "us-west-2", + "storage:requester_pays": false, + "processing:software": { + "sentinel-2-c1-l2a-to-stac": "v2024.02.01" + }, + "updated": "2025-01-10T16:24:42.505Z" + }, + "geometry": { + "type": "Polygon", + 
"coordinates": [ + [ + [-13.6265322920318, 55.9380706206949], + [-14.1026482225707, 54.9557969721861], + [-13.2860297382675, 54.9470270219076], + [-13.2427407104991, 55.933193717984], + [-13.6265322920318, 55.9380706206949] + ] + ] + }, + "links": [ + { + "rel": "self", + "type": "application/geo+json", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a/items/S2B_T28UEG_20250110T120355_L2A" + }, + { + "rel": "canonical", + "href": "s3://e84-earth-search-sentinel-data/sentinel-2-c1-l2a/28/U/EG/2025/1/S2B_T28UEG_20250110T120355_L2A/S2B_T28UEG_20250110T120355_L2A.json", + "type": "application/json" + }, + { + "rel": "via", + "href": "s3://sentinel-s2-l2a/tiles/28/U/EG/2025/1/10/0/metadata.xml", + "type": "application/xml", + "title": "Granule Metadata in Sinergize RODA Archive" + }, + { + "rel": "parent", + "type": "application/json", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a" + }, + { + "rel": "collection", + "type": "application/json", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a" + }, + { + "rel": "root", + "type": "application/json", + "href": "https://earth-search.aws.element84.com/v1" + }, + { + "rel": "thumbnail", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a/items/S2B_T28UEG_20250110T120355_L2A/thumbnail" + } + ], + "assets": { + "red": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EG/2025/1/S2B_T28UEG_20250110T120355_L2A/B04.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red - 10m", + "eo:bands": [ + { + "name": "B04", + "common_name": "red", + "center_wavelength": 0.665, + "full_width_half_max": 0.038 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 499980, 0, -10, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220b9d353ff3c5b32a6a6b9d7373a4a23e53977ca4a2a53c331a33f42db253bf1d9", + "file:size": 60381704, + "roles": [ + "data", + "reflectance" + ] + }, + "green": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EG/2025/1/S2B_T28UEG_20250110T120355_L2A/B03.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Green - 10m", + "eo:bands": [ + { + "name": "B03", + "common_name": "green", + "center_wavelength": 0.56, + "full_width_half_max": 0.045 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 499980, 0, -10, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "122069bc8ca3f9e02a5d9cc9ccfaa346b82fbc43d49b5733ef8c0c442cd7c1029e95", + "file:size": 60566679, + "roles": [ + "data", + "reflectance" + ] + }, + "blue": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EG/2025/1/S2B_T28UEG_20250110T120355_L2A/B02.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Blue - 10m", + "eo:bands": [ + { + "name": "B02", + "common_name": "blue", + "center_wavelength": 0.49, + "full_width_half_max": 0.098 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 499980, 0, -10, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + 
"spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220e79da24993985cbba76d10c1921e9e623c42efbe0dfeef64e64541abec1cabc3", + "file:size": 61289399, + "roles": [ + "data", + "reflectance" + ] + }, + "visual": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EG/2025/1/S2B_T28UEG_20250110T120355_L2A/TCI.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "True color image", + "eo:bands": [ + { + "name": "B04", + "common_name": "red", + "center_wavelength": 0.665, + "full_width_half_max": 0.038 + }, + { + "name": "B03", + "common_name": "green", + "center_wavelength": 0.56, + "full_width_half_max": 0.045 + }, + { + "name": "B02", + "common_name": "blue", + "center_wavelength": 0.49, + "full_width_half_max": 0.098 + } + ], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 10 + }, + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 10 + }, + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 10 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 499980, 0, -10, 6200040], + "file:checksum": "122088b33cf108ade9b4c26653c73e36729cd15ac949ded66e021c33e23c027e5c0a", + "file:size": 2404937, + "roles": [ + "visual" + ] + }, + "nir": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EG/2025/1/S2B_T28UEG_20250110T120355_L2A/B08.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "NIR 1 - 10m", + "eo:bands": [ + { + "name": "B08", + "common_name": "nir", + "center_wavelength": 0.842, + "full_width_half_max": 0.145 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 499980, 0, -10, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220669b38f781482a19c416b172d75902376262b840cd11d4f2c0485c0483264f1a", + "file:size": 61404018, + "roles": [ + "data", + "reflectance" + ] + }, + "swir22": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EG/2025/1/S2B_T28UEG_20250110T120355_L2A/B12.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "SWIR 2.2μm - 20m", + "eo:bands": [ + { + "name": "B12", + "common_name": "swir22", + "center_wavelength": 2.19, + "full_width_half_max": 0.242 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "122007f0ada73a0dfc1ea5c01b7cef273f3a621e567ff8cac3d230b0f9b98dc7d377", + "file:size": 17601586, + "roles": [ + "data", + "reflectance" + ] + }, + "rededge2": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EG/2025/1/S2B_T28UEG_20250110T120355_L2A/B06.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red Edge 2 - 20m", + "eo:bands": [ + { + "name": "B06", + "common_name": "rededge", + "center_wavelength": 0.74, + "full_width_half_max": 0.018 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + 
"offset": -0.1 + } + ], + "file:checksum": "12200d4476aa9543d440f22ef65eb48baaf15d0246f02061e48ef2bf58b0802ef7ca", + "file:size": 19136948, + "roles": [ + "data", + "reflectance" + ] + }, + "rededge3": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EG/2025/1/S2B_T28UEG_20250110T120355_L2A/B07.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red Edge 3 - 20m", + "eo:bands": [ + { + "name": "B07", + "common_name": "rededge", + "center_wavelength": 0.783, + "full_width_half_max": 0.028 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12202f084ef5aba2af0dd40f435b33d219b893acabbc9e921d86b1a94bf112230669", + "file:size": 19261255, + "roles": [ + "data", + "reflectance" + ] + }, + "rededge1": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EG/2025/1/S2B_T28UEG_20250110T120355_L2A/B05.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red Edge 1 - 20m", + "eo:bands": [ + { + "name": "B05", + "common_name": "rededge", + "center_wavelength": 0.704, + "full_width_half_max": 0.019 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220969140f9d7e9cc58cf32e457f566f4c340273427ffb8dddb2f683d150835ad6c", + "file:size": 19065317, + "roles": [ + "data", + "reflectance" + ] + }, + "swir16": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EG/2025/1/S2B_T28UEG_20250110T120355_L2A/B11.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "SWIR 1.6μm - 20m", + "eo:bands": [ + { + "name": "B11", + "common_name": "swir16", + "center_wavelength": 1.61, + "full_width_half_max": 0.143 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220914df8e59a4a35b02a50c7a491dc3ca9a1cae1f4ae2d84a7e0255f997be7f78f", + "file:size": 18069457, + "roles": [ + "data", + "reflectance" + ] + }, + "wvp": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EG/2025/1/S2B_T28UEG_20250110T120355_L2A/WVP.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Water Vapour (WVP)", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "unit": "cm", + "scale": 0.001, + "offset": 0 + } + ], + "file:checksum": "1220737f0c96ba09ebf59029954c78d1a0a402c1053743ebff2525838d83a1511a38", + "file:size": 95845, + "roles": [ + "data" + ] + }, + "nir08": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EG/2025/1/S2B_T28UEG_20250110T120355_L2A/B8A.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "NIR 2 - 20m", + "eo:bands": [ + { + "name": "B8A", + 
"common_name": "nir08", + "center_wavelength": 0.865, + "full_width_half_max": 0.033 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "122011dd8b6bdb5f241dd736b2d504b45e34c42749a53368505a15fc6ef2affb2090", + "file:size": 19513705, + "roles": [ + "data", + "reflectance" + ] + }, + "scl": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EG/2025/1/S2B_T28UEG_20250110T120355_L2A/SCL.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Scene classification map (SCL)", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 20 + } + ], + "file:checksum": "1220a5d8882157abc3181bb1a6a46d7c64fc4c9d024b75e6d74fc16ab0e83c904c5c", + "file:size": 121275, + "roles": [ + "data" + ] + }, + "aot": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EG/2025/1/S2B_T28UEG_20250110T120355_L2A/AOT.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Aerosol optical thickness (AOT)", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.001, + "offset": 0 + } + ], + "file:checksum": "1220c6e200980d8d0b7d7f2e6646d494b790d40a5cbfe0cdc31df7cf1a4576456fa2", + "file:size": 173930, + "roles": [ + "data" + ] + }, + "coastal": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EG/2025/1/S2B_T28UEG_20250110T120355_L2A/B01.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Coastal - 60m", + "eo:bands": [ + { + "name": "B01", + "common_name": "coastal", + "center_wavelength": 0.443, + "full_width_half_max": 0.027 + } + ], + "gsd": 60, + "proj:shape": [1830, 1830], + "proj:transform": [60, 0, 499980, 0, -60, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 60, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "122086a4aeabc9e17edd3c8951f13de11c1ed3857bb33fa3cb97e68a9d0100147924", + "file:size": 2249862, + "roles": [ + "data", + "reflectance" + ] + }, + "nir09": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EG/2025/1/S2B_T28UEG_20250110T120355_L2A/B09.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "NIR 3 - 60m", + "eo:bands": [ + { + "name": "B09", + "common_name": "nir09", + "center_wavelength": 0.945, + "full_width_half_max": 0.026 + } + ], + "gsd": 60, + "proj:shape": [1830, 1830], + "proj:transform": [60, 0, 499980, 0, -60, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 60, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220199c59b4af3b328f6aeb6d39fc7ceb285748f594cf5c955cd0a8f4942cba3781", + "file:size": 1916080, + "roles": [ + "data", + "reflectance" + ] + }, + "cloud": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EG/2025/1/S2B_T28UEG_20250110T120355_L2A/CLD_20m.tif", + "type": "image/tiff; 
application=geotiff; profile=cloud-optimized", + "title": "Cloud Probabilities", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 20 + } + ], + "file:checksum": "122076cefbddfb96e123b42521059ea8dca0c458e3394688c0f886fb43c33d907402", + "file:size": 231288, + "roles": [ + "data", + "cloud" + ] + }, + "snow": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EG/2025/1/S2B_T28UEG_20250110T120355_L2A/SNW_20m.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Snow Probabilities", + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 20 + } + ], + "file:checksum": "12209a39c5a396b02efa2a98f06213d63ee40227e1d292883e792279df61c431e8ae", + "file:size": 53931, + "roles": [ + "data", + "snow-ice" + ] + }, + "preview": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EG/2025/1/S2B_T28UEG_20250110T120355_L2A/L2A_PVI.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "True color preview", + "eo:bands": [ + { + "name": "B04", + "common_name": "red", + "center_wavelength": 0.665, + "full_width_half_max": 0.038 + }, + { + "name": "B03", + "common_name": "green", + "center_wavelength": 0.56, + "full_width_half_max": 0.045 + }, + { + "name": "B02", + "common_name": "blue", + "center_wavelength": 0.49, + "full_width_half_max": 0.098 + } + ], + "file:checksum": "122075dc35d66b6d9a941a1277a77f576ba3b7bd5c0a325d60641feefa4812bc1124", + "file:size": 8431, + "roles": [ + "overview" + ] + }, + "granule_metadata": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EG/2025/1/S2B_T28UEG_20250110T120355_L2A/metadata.xml", + "type": "application/xml", + "file:checksum": "1220c4312c4f5ad1f1c757186808c99ec2a552cf2874bc7740e635da2b3538bbfbfb", + "file:size": 276748, + "roles": [ + "metadata" + ] + }, + "tileinfo_metadata": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EG/2025/1/S2B_T28UEG_20250110T120355_L2A/tileInfo.json", + "type": "application/json", + "file:checksum": "122071fa608aa4046865213a5eac7e5d35d24204e73c393ef5ddaea518918a91e35f", + "file:size": 1518, + "roles": [ + "metadata" + ] + }, + "product_metadata": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EG/2025/1/S2B_T28UEG_20250110T120355_L2A/product_metadata.xml", + "type": "application/xml", + "file:checksum": "12205fd1335176b336218ed5c4e5f7930ae15d28a2944d592d89b1ca962b84f9b0ee", + "file:size": 54707, + "roles": [ + "metadata" + ] + }, + "thumbnail": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/U/EG/2025/1/S2B_T28UEG_20250110T120355_L2A/L2A_PVI.jpg", + "type": "image/jpeg", + "title": "Thumbnail of preview image", + "file:checksum": "1220659ac78ac2c72c27ef4b3d2f06306fa35ba128b539ce99a12e601e3b5dba16cd", + "file:size": 4633, + "roles": [ + "thumbnail" + ] + } + }, + "bbox": [-14.102648, 54.947027, -13.242741, 55.938071], + "stac_extensions": [ + "https://stac-extensions.github.io/eo/v1.1.0/schema.json", + "https://stac-extensions.github.io/file/v2.1.0/schema.json", + 
"https://stac-extensions.github.io/grid/v1.1.0/schema.json", + "https://stac-extensions.github.io/mgrs/v1.0.0/schema.json", + "https://stac-extensions.github.io/processing/v1.1.0/schema.json", + "https://stac-extensions.github.io/projection/v1.1.0/schema.json", + "https://stac-extensions.github.io/raster/v1.1.0/schema.json", + "https://stac-extensions.github.io/sentinel-2/v1.0.0/schema.json", + "https://stac-extensions.github.io/storage/v1.0.0/schema.json", + "https://stac-extensions.github.io/view/v1.0.0/schema.json" + ], + "collection": "sentinel-2-c1-l2a" + }, + { + "type": "Feature", + "stac_version": "1.0.0", + "id": "S2B_T28VEH_20250110T120355_L2A", + "properties": { + "created": "2025-01-10T16:23:30.675Z", + "platform": "sentinel-2b", + "constellation": "sentinel-2", + "instruments": [ + "msi" + ], + "eo:cloud_cover": 99.301517, + "proj:epsg": 32628, + "proj:centroid": { + "lat": 56.15644, + "lon": -13.37486 + }, + "mgrs:utm_zone": 28, + "mgrs:latitude_band": "V", + "mgrs:grid_square": "EH", + "grid:code": "MGRS-28VEH", + "view:azimuth": 111.841329557221, + "view:incidence_angle": 11.1233547024194, + "view:sun_azimuth": 166.296128019016, + "view:sun_elevation": 10.8177386822595, + "s2:tile_id": "S2B_OPER_MSI_L2A_TL_2BPS_20250110T154043_A040990_T28VEH_N05.11", + "s2:degraded_msi_data_percentage": 0, + "s2:nodata_pixel_percentage": 88.483912, + "s2:saturated_defective_pixel_percentage": 0, + "s2:cloud_shadow_percentage": 0, + "s2:vegetation_percentage": 0, + "s2:not_vegetated_percentage": 0, + "s2:water_percentage": 0.698481, + "s2:unclassified_percentage": 0, + "s2:medium_proba_clouds_percentage": 42.502564, + "s2:high_proba_clouds_percentage": 44.748831, + "s2:thin_cirrus_percentage": 12.050124, + "s2:snow_ice_percentage": 0, + "s2:product_type": "S2MSI2A", + "s2:processing_baseline": "05.11", + "s2:product_uri": "S2B_MSIL2A_20250110T120359_N0511_R066_T28VEH_20250110T154043.SAFE", + "s2:generation_time": "2025-01-10T15:40:43.000000Z", + "s2:datatake_id": "GS2B_20250110T120359_040990_N05.11", + "s2:datatake_type": "INS-NOBS", + "s2:datastrip_id": "S2B_OPER_MSI_L2A_DS_2BPS_20250110T154043_S20250110T120355_N05.11", + "s2:reflectance_conversion_factor": 1.03425891111326, + "datetime": "2025-01-10T12:06:00.902000Z", + "earthsearch:payload_id": "roda-sentinel-2-c1-l2a/workflow-sentinel-2-c1-l2a-to-stac/11834873512e6300f7681a47b13878fd", + "storage:platform": "AWS", + "storage:region": "us-west-2", + "storage:requester_pays": false, + "processing:software": { + "sentinel-2-c1-l2a-to-stac": "v2024.02.01" + }, + "updated": "2025-01-10T16:23:30.675Z" + }, + "geometry": { + "type": "Polygon", + "coordinates": [ + [ + [-13.2483259551958, 55.8448624735234], + [-13.206223668266, 56.7743145359432], + [-13.6700421061974, 55.8501437572837], + [-13.2483259551958, 55.8448624735234] + ] + ] + }, + "links": [ + { + "rel": "self", + "type": "application/geo+json", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a/items/S2B_T28VEH_20250110T120355_L2A" + }, + { + "rel": "canonical", + "href": "s3://e84-earth-search-sentinel-data/sentinel-2-c1-l2a/28/V/EH/2025/1/S2B_T28VEH_20250110T120355_L2A/S2B_T28VEH_20250110T120355_L2A.json", + "type": "application/json" + }, + { + "rel": "via", + "href": "s3://sentinel-s2-l2a/tiles/28/V/EH/2025/1/10/0/metadata.xml", + "type": "application/xml", + "title": "Granule Metadata in Sinergize RODA Archive" + }, + { + "rel": "parent", + "type": "application/json", + "href": 
"https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a" + }, + { + "rel": "collection", + "type": "application/json", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a" + }, + { + "rel": "root", + "type": "application/json", + "href": "https://earth-search.aws.element84.com/v1" + }, + { + "rel": "thumbnail", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a/items/S2B_T28VEH_20250110T120355_L2A/thumbnail" + } + ], + "assets": { + "red": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/V/EH/2025/1/S2B_T28VEH_20250110T120355_L2A/B04.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red - 10m", + "eo:bands": [ + { + "name": "B04", + "common_name": "red", + "center_wavelength": 0.665, + "full_width_half_max": 0.038 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 499980, 0, -10, 6300000], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12209a078d936edd1a7e1bcdf766e8fcf55431b60f65328965b08f0af871af8464eb", + "file:size": 23470320, + "roles": [ + "data", + "reflectance" + ] + }, + "green": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/V/EH/2025/1/S2B_T28VEH_20250110T120355_L2A/B03.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Green - 10m", + "eo:bands": [ + { + "name": "B03", + "common_name": "green", + "center_wavelength": 0.56, + "full_width_half_max": 0.045 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 499980, 0, -10, 6300000], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220d798375df0fc4cd2f16f037408fecb24b72f372f1658c733d3a7d522dd3eac07", + "file:size": 23549627, + "roles": [ + "data", + "reflectance" + ] + }, + "blue": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/V/EH/2025/1/S2B_T28VEH_20250110T120355_L2A/B02.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Blue - 10m", + "eo:bands": [ + { + "name": "B02", + "common_name": "blue", + "center_wavelength": 0.49, + "full_width_half_max": 0.098 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 499980, 0, -10, 6300000], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12207470ebca173e6c56435c6e9a2da0013603294d304b7e2a12803a97029c1269b0", + "file:size": 23446317, + "roles": [ + "data", + "reflectance" + ] + }, + "visual": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/V/EH/2025/1/S2B_T28VEH_20250110T120355_L2A/TCI.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "True color image", + "eo:bands": [ + { + "name": "B04", + "common_name": "red", + "center_wavelength": 0.665, + "full_width_half_max": 0.038 + }, + { + "name": "B03", + "common_name": "green", + "center_wavelength": 0.56, + "full_width_half_max": 0.045 + }, + { + "name": "B02", + "common_name": "blue", + "center_wavelength": 0.49, + "full_width_half_max": 0.098 + } + ], + "raster:bands": [ + { + "nodata": 0, + 
"data_type": "uint8", + "spatial_resolution": 10 + }, + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 10 + }, + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 10 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 499980, 0, -10, 6300000], + "file:checksum": "12207b390449b17d0cebbbd6b2de54203a94e925ac96ce54aa5b34583674c0113e89", + "file:size": 21770593, + "roles": [ + "visual" + ] + }, + "nir": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/V/EH/2025/1/S2B_T28VEH_20250110T120355_L2A/B08.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "NIR 1 - 10m", + "eo:bands": [ + { + "name": "B08", + "common_name": "nir", + "center_wavelength": 0.842, + "full_width_half_max": 0.145 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 499980, 0, -10, 6300000], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12205b740b77c08f48ce13ee908590a25d361e72d47ade71fa055c9d268534e04223", + "file:size": 23641591, + "roles": [ + "data", + "reflectance" + ] + }, + "swir22": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/V/EH/2025/1/S2B_T28VEH_20250110T120355_L2A/B12.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "SWIR 2.2μm - 20m", + "eo:bands": [ + { + "name": "B12", + "common_name": "swir22", + "center_wavelength": 2.19, + "full_width_half_max": 0.242 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 6300000], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "122085f33db31d5fbed707c061c7df0cc929dcc7b861df99561166b57bd6600e84ab", + "file:size": 6241149, + "roles": [ + "data", + "reflectance" + ] + }, + "rededge2": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/V/EH/2025/1/S2B_T28VEH_20250110T120355_L2A/B06.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red Edge 2 - 20m", + "eo:bands": [ + { + "name": "B06", + "common_name": "rededge", + "center_wavelength": 0.74, + "full_width_half_max": 0.018 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 6300000], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220eb01170f6a76786fd032101911244c117c5ca2a43fb2a8162327246afaccfe09", + "file:size": 6636232, + "roles": [ + "data", + "reflectance" + ] + }, + "rededge3": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/V/EH/2025/1/S2B_T28VEH_20250110T120355_L2A/B07.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red Edge 3 - 20m", + "eo:bands": [ + { + "name": "B07", + "common_name": "rededge", + "center_wavelength": 0.783, + "full_width_half_max": 0.028 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 6300000], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": 
"1220577312552117447a506afcc312f45302506b7b184048c7da1c7e4c0dd514db9c", + "file:size": 6676498, + "roles": [ + "data", + "reflectance" + ] + }, + "rededge1": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/V/EH/2025/1/S2B_T28VEH_20250110T120355_L2A/B05.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red Edge 1 - 20m", + "eo:bands": [ + { + "name": "B05", + "common_name": "rededge", + "center_wavelength": 0.704, + "full_width_half_max": 0.019 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 6300000], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220c33490a3ec481d86db0f85ecea1d741794cc12424dba366d2f606f8e5b23588a", + "file:size": 6631667, + "roles": [ + "data", + "reflectance" + ] + }, + "swir16": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/V/EH/2025/1/S2B_T28VEH_20250110T120355_L2A/B11.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "SWIR 1.6μm - 20m", + "eo:bands": [ + { + "name": "B11", + "common_name": "swir16", + "center_wavelength": 1.61, + "full_width_half_max": 0.143 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 6300000], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "122072b1ed47a63d6dc0bf1ac50432268e594262a2d2b451397c0ebbdabbadd88cff", + "file:size": 6317373, + "roles": [ + "data", + "reflectance" + ] + }, + "wvp": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/V/EH/2025/1/S2B_T28VEH_20250110T120355_L2A/WVP.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Water Vapour (WVP)", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 6300000], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "unit": "cm", + "scale": 0.001, + "offset": 0 + } + ], + "file:checksum": "12209a8982ff57262f14ebd82f32666904914fb550aabe6a6ddf554350fac7c2136d", + "file:size": 95845, + "roles": [ + "data" + ] + }, + "nir08": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/V/EH/2025/1/S2B_T28VEH_20250110T120355_L2A/B8A.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "NIR 2 - 20m", + "eo:bands": [ + { + "name": "B8A", + "common_name": "nir08", + "center_wavelength": 0.865, + "full_width_half_max": 0.033 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 6300000], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220efe682a3aa1a20332f77390e4f53fbcd1b16f32f694fb3b352f8410919e5955b", + "file:size": 6716668, + "roles": [ + "data", + "reflectance" + ] + }, + "scl": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/V/EH/2025/1/S2B_T28VEH_20250110T120355_L2A/SCL.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Scene classification map (SCL)", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 
6300000], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 20 + } + ], + "file:checksum": "122017b19cbd43088542594dd2b1d2f70313da1a2ecfb5a4e29f20bd1bc3da1f7815", + "file:size": 222714, + "roles": [ + "data" + ] + }, + "aot": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/V/EH/2025/1/S2B_T28VEH_20250110T120355_L2A/AOT.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Aerosol optical thickness (AOT)", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 6300000], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.001, + "offset": 0 + } + ], + "file:checksum": "1220246803d9849a422405163a49896d7106184d4e9ec6691fc49ac83816517f0509", + "file:size": 138722, + "roles": [ + "data" + ] + }, + "coastal": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/V/EH/2025/1/S2B_T28VEH_20250110T120355_L2A/B01.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Coastal - 60m", + "eo:bands": [ + { + "name": "B01", + "common_name": "coastal", + "center_wavelength": 0.443, + "full_width_half_max": 0.027 + } + ], + "gsd": 60, + "proj:shape": [1830, 1830], + "proj:transform": [60, 0, 499980, 0, -60, 6300000], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 60, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "122090dcd8bdb5db4777e23fbd9a863b3475caccea2632e6f082c1a40e7c0d7c615f", + "file:size": 801606, + "roles": [ + "data", + "reflectance" + ] + }, + "nir09": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/V/EH/2025/1/S2B_T28VEH_20250110T120355_L2A/B09.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "NIR 3 - 60m", + "eo:bands": [ + { + "name": "B09", + "common_name": "nir09", + "center_wavelength": 0.945, + "full_width_half_max": 0.026 + } + ], + "gsd": 60, + "proj:shape": [1830, 1830], + "proj:transform": [60, 0, 499980, 0, -60, 6300000], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 60, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220cb7d707d19dfdd20a6619f08f7f55e7becb8c164c73fc1ba0924ac0f50518bf6", + "file:size": 716309, + "roles": [ + "data", + "reflectance" + ] + }, + "cloud": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/V/EH/2025/1/S2B_T28VEH_20250110T120355_L2A/CLD_20m.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Cloud Probabilities", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 6300000], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 20 + } + ], + "file:checksum": "12201cea1fb6f01083885b59b8c36f635d80e32062c28e214bc65e25e272e2eb73d8", + "file:size": 603026, + "roles": [ + "data", + "cloud" + ] + }, + "snow": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/V/EH/2025/1/S2B_T28VEH_20250110T120355_L2A/SNW_20m.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Snow Probabilities", + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 6300000], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + 
"spatial_resolution": 20 + } + ], + "file:checksum": "12206d5fa18554952bea897b5e8ffe90c64ae6ba98d31744de65c4e269ca0e913bbd", + "file:size": 53931, + "roles": [ + "data", + "snow-ice" + ] + }, + "preview": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/V/EH/2025/1/S2B_T28VEH_20250110T120355_L2A/L2A_PVI.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "True color preview", + "eo:bands": [ + { + "name": "B04", + "common_name": "red", + "center_wavelength": 0.665, + "full_width_half_max": 0.038 + }, + { + "name": "B03", + "common_name": "green", + "center_wavelength": 0.56, + "full_width_half_max": 0.045 + }, + { + "name": "B02", + "common_name": "blue", + "center_wavelength": 0.49, + "full_width_half_max": 0.098 + } + ], + "file:checksum": "12205adc705f4f9892cbca80a870037a9ab732ec8c93ab91639a7c9477e000d1764a", + "file:size": 31910, + "roles": [ + "overview" + ] + }, + "granule_metadata": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/V/EH/2025/1/S2B_T28VEH_20250110T120355_L2A/metadata.xml", + "type": "application/xml", + "file:checksum": "1220e635711170fce5974513ea062f6739b962681828dd8d1db23b026072abb6064e", + "file:size": 185550, + "roles": [ + "metadata" + ] + }, + "tileinfo_metadata": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/V/EH/2025/1/S2B_T28VEH_20250110T120355_L2A/tileInfo.json", + "type": "application/json", + "file:checksum": "1220d7a2e8b29cf19d297e4c685b7cd4a8a82399f0e8d6b063c1d40b9978a734f008", + "file:size": 1510, + "roles": [ + "metadata" + ] + }, + "product_metadata": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/V/EH/2025/1/S2B_T28VEH_20250110T120355_L2A/product_metadata.xml", + "type": "application/xml", + "file:checksum": "12200362284af62323155e23089028e143ef902c4f19f040a4341fb1ab723056e132", + "file:size": 54677, + "roles": [ + "metadata" + ] + }, + "thumbnail": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/28/V/EH/2025/1/S2B_T28VEH_20250110T120355_L2A/L2A_PVI.jpg", + "type": "image/jpeg", + "title": "Thumbnail of preview image", + "file:checksum": "1220d058912bbb7875f8b023701f44852a6a18f09d9062bb2a0476acb44e06e62ab1", + "file:size": 7597, + "roles": [ + "thumbnail" + ] + } + }, + "bbox": [-13.670042, 55.844862, -13.206224, 56.774315], + "stac_extensions": [ + "https://stac-extensions.github.io/eo/v1.1.0/schema.json", + "https://stac-extensions.github.io/file/v2.1.0/schema.json", + "https://stac-extensions.github.io/grid/v1.1.0/schema.json", + "https://stac-extensions.github.io/mgrs/v1.0.0/schema.json", + "https://stac-extensions.github.io/processing/v1.1.0/schema.json", + "https://stac-extensions.github.io/projection/v1.1.0/schema.json", + "https://stac-extensions.github.io/raster/v1.1.0/schema.json", + "https://stac-extensions.github.io/sentinel-2/v1.0.0/schema.json", + "https://stac-extensions.github.io/storage/v1.0.0/schema.json", + "https://stac-extensions.github.io/view/v1.0.0/schema.json" + ], + "collection": "sentinel-2-c1-l2a" + }, + { + "type": "Feature", + "stac_version": "1.0.0", + "id": "S2B_T29UNB_20250110T120355_L2A", + "properties": { + "created": "2025-01-10T16:22:47.191Z", + "platform": "sentinel-2b", + "constellation": "sentinel-2", + "instruments": [ + "msi" + ], + "eo:cloud_cover": 58.027476, + "proj:epsg": 32629, + "proj:centroid": { + "lat": 55.76895, 
+ "lon": -8.89008 + }, + "mgrs:utm_zone": 29, + "mgrs:latitude_band": "U", + "mgrs:grid_square": "NB", + "grid:code": "MGRS-29UNB", + "view:azimuth": 296.213391314926, + "view:incidence_angle": 11.2885323247034, + "view:sun_azimuth": 171.888242896347, + "view:sun_elevation": 12.3330462371241, + "s2:tile_id": "S2B_OPER_MSI_L2A_TL_2BPS_20250110T154043_A040990_T29UNB_N05.11", + "s2:degraded_msi_data_percentage": 0, + "s2:nodata_pixel_percentage": 94.879043, + "s2:saturated_defective_pixel_percentage": 0, + "s2:cloud_shadow_percentage": 0, + "s2:vegetation_percentage": 0, + "s2:not_vegetated_percentage": 0.000454, + "s2:water_percentage": 41.972074, + "s2:unclassified_percentage": 0, + "s2:medium_proba_clouds_percentage": 19.25049, + "s2:high_proba_clouds_percentage": 1.270002, + "s2:thin_cirrus_percentage": 37.50698, + "s2:snow_ice_percentage": 0, + "s2:product_type": "S2MSI2A", + "s2:processing_baseline": "05.11", + "s2:product_uri": "S2B_MSIL2A_20250110T120359_N0511_R066_T29UNB_20250110T154043.SAFE", + "s2:generation_time": "2025-01-10T15:40:43.000000Z", + "s2:datatake_id": "GS2B_20250110T120359_040990_N05.11", + "s2:datatake_type": "INS-NOBS", + "s2:datastrip_id": "S2B_OPER_MSI_L2A_DS_2BPS_20250110T154043_S20250110T120355_N05.11", + "s2:reflectance_conversion_factor": 1.03425891111326, + "datetime": "2025-01-10T12:05:52.609000Z", + "earthsearch:payload_id": "roda-sentinel-2-c1-l2a/workflow-sentinel-2-c1-l2a-to-stac/b708c4badc2109a13b89da40305c2ef8", + "storage:platform": "AWS", + "storage:region": "us-west-2", + "storage:requester_pays": false, + "processing:software": { + "sentinel-2-c1-l2a-to-stac": "v2024.02.01" + }, + "updated": "2025-01-10T16:22:47.191Z" + }, + "geometry": { + "type": "Polygon", + "coordinates": [ + [ + [-9.00030421811058, 55.9457254173272], + [-9.00030013314029, 55.4158504628607], + [-8.66964406830726, 55.9452826258813], + [-9.00030421811058, 55.9457254173272] + ] + ] + }, + "links": [ + { + "rel": "self", + "type": "application/geo+json", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a/items/S2B_T29UNB_20250110T120355_L2A" + }, + { + "rel": "canonical", + "href": "s3://e84-earth-search-sentinel-data/sentinel-2-c1-l2a/29/U/NB/2025/1/S2B_T29UNB_20250110T120355_L2A/S2B_T29UNB_20250110T120355_L2A.json", + "type": "application/json" + }, + { + "rel": "via", + "href": "s3://sentinel-s2-l2a/tiles/29/U/NB/2025/1/10/0/metadata.xml", + "type": "application/xml", + "title": "Granule Metadata in Sinergize RODA Archive" + }, + { + "rel": "parent", + "type": "application/json", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a" + }, + { + "rel": "collection", + "type": "application/json", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a" + }, + { + "rel": "root", + "type": "application/json", + "href": "https://earth-search.aws.element84.com/v1" + }, + { + "rel": "thumbnail", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a/items/S2B_T29UNB_20250110T120355_L2A/thumbnail" + } + ], + "assets": { + "red": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/NB/2025/1/S2B_T29UNB_20250110T120355_L2A/B04.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red - 10m", + "eo:bands": [ + { + "name": "B04", + "common_name": "red", + "center_wavelength": 0.665, + "full_width_half_max": 0.038 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 
499980, 0, -10, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220e789936a2387f8dc7e6aabb0968bae1a6e9f84523186412e9929641effdaa361", + "file:size": 10027685, + "roles": [ + "data", + "reflectance" + ] + }, + "green": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/NB/2025/1/S2B_T29UNB_20250110T120355_L2A/B03.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Green - 10m", + "eo:bands": [ + { + "name": "B03", + "common_name": "green", + "center_wavelength": 0.56, + "full_width_half_max": 0.045 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 499980, 0, -10, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "122063893492f1d7288fc4035b468681f4553f2887f3c74655a4c84aaf6516a37f00", + "file:size": 10190327, + "roles": [ + "data", + "reflectance" + ] + }, + "blue": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/NB/2025/1/S2B_T29UNB_20250110T120355_L2A/B02.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Blue - 10m", + "eo:bands": [ + { + "name": "B02", + "common_name": "blue", + "center_wavelength": 0.49, + "full_width_half_max": 0.098 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 499980, 0, -10, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "122058f51b3925cec6f1e94cb2e1ba55cac8ae0b1c98fe7df627e0f0843da4b1ecd4", + "file:size": 10515581, + "roles": [ + "data", + "reflectance" + ] + }, + "visual": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/NB/2025/1/S2B_T29UNB_20250110T120355_L2A/TCI.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "True color image", + "eo:bands": [ + { + "name": "B04", + "common_name": "red", + "center_wavelength": 0.665, + "full_width_half_max": 0.038 + }, + { + "name": "B03", + "common_name": "green", + "center_wavelength": 0.56, + "full_width_half_max": 0.045 + }, + { + "name": "B02", + "common_name": "blue", + "center_wavelength": 0.49, + "full_width_half_max": 0.098 + } + ], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 10 + }, + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 10 + }, + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 10 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 499980, 0, -10, 6200040], + "file:checksum": "1220bad96fa28f2414eccd6c5b23831d905d7df22fd3bd24267446eaf3630d3972ea", + "file:size": 14154510, + "roles": [ + "visual" + ] + }, + "nir": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/NB/2025/1/S2B_T29UNB_20250110T120355_L2A/B08.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "NIR 1 - 10m", + "eo:bands": [ + { + "name": "B08", + "common_name": "nir", + "center_wavelength": 0.842, + "full_width_half_max": 0.145 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 499980, 0, -10, 6200040], + "raster:bands": [ + { + "nodata": 0, + 
"data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220e5b6a275b9c649ab2288243d5824eeebda0808e51f6a8a40a69903c690b2fbde", + "file:size": 10111553, + "roles": [ + "data", + "reflectance" + ] + }, + "swir22": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/NB/2025/1/S2B_T29UNB_20250110T120355_L2A/B12.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "SWIR 2.2μm - 20m", + "eo:bands": [ + { + "name": "B12", + "common_name": "swir22", + "center_wavelength": 2.19, + "full_width_half_max": 0.242 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "122026ec8ebcfa8c2d83b76bef0e7b3496ed3c1d39f87fab9e11a8ed939b4c01636c", + "file:size": 2437631, + "roles": [ + "data", + "reflectance" + ] + }, + "rededge2": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/NB/2025/1/S2B_T29UNB_20250110T120355_L2A/B06.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red Edge 2 - 20m", + "eo:bands": [ + { + "name": "B06", + "common_name": "rededge", + "center_wavelength": 0.74, + "full_width_half_max": 0.018 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220dd58b9912f585b50907e6526958402a7283bdf9420db9b9efb8407d6b0d36da1", + "file:size": 2593728, + "roles": [ + "data", + "reflectance" + ] + }, + "rededge3": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/NB/2025/1/S2B_T29UNB_20250110T120355_L2A/B07.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red Edge 3 - 20m", + "eo:bands": [ + { + "name": "B07", + "common_name": "rededge", + "center_wavelength": 0.783, + "full_width_half_max": 0.028 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "122009281ca55d8aa3bccfd8fc14d334e3fda9ed2d0083bf9e6e1c9a9799c23ab1eb", + "file:size": 2605950, + "roles": [ + "data", + "reflectance" + ] + }, + "rededge1": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/NB/2025/1/S2B_T29UNB_20250110T120355_L2A/B05.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red Edge 1 - 20m", + "eo:bands": [ + { + "name": "B05", + "common_name": "rededge", + "center_wavelength": 0.704, + "full_width_half_max": 0.019 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220e4b4d1744e36c8113ac5a208d79b122c4d41d6484fe43a060a306c633949d678", + "file:size": 2596301, + "roles": [ + "data", + "reflectance" + ] + }, + "swir16": { + "href": 
"https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/NB/2025/1/S2B_T29UNB_20250110T120355_L2A/B11.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "SWIR 1.6μm - 20m", + "eo:bands": [ + { + "name": "B11", + "common_name": "swir16", + "center_wavelength": 1.61, + "full_width_half_max": 0.143 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12203738cfcacf2d49ff3ee15807d2d85fc84b3074196aa3bd7cd5ff4ec5906143b1", + "file:size": 2403501, + "roles": [ + "data", + "reflectance" + ] + }, + "wvp": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/NB/2025/1/S2B_T29UNB_20250110T120355_L2A/WVP.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Water Vapour (WVP)", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "unit": "cm", + "scale": 0.001, + "offset": 0 + } + ], + "file:checksum": "1220b4f370a63b84c6d603d295d249fcb26089f11d09a85567594f2f12507533daed", + "file:size": 109472, + "roles": [ + "data" + ] + }, + "nir08": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/NB/2025/1/S2B_T29UNB_20250110T120355_L2A/B8A.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "NIR 2 - 20m", + "eo:bands": [ + { + "name": "B8A", + "common_name": "nir08", + "center_wavelength": 0.865, + "full_width_half_max": 0.033 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12208c9a6417984eccfdd754b8dc7d6e5041c475412fa297ce8650d58394cace8190", + "file:size": 2590780, + "roles": [ + "data", + "reflectance" + ] + }, + "scl": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/NB/2025/1/S2B_T29UNB_20250110T120355_L2A/SCL.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Scene classification map (SCL)", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 20 + } + ], + "file:checksum": "1220c31980a5d07ba12b22daec798def163f4c63a5e2ff830a44c4ed49091da75d35", + "file:size": 112713, + "roles": [ + "data" + ] + }, + "aot": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/NB/2025/1/S2B_T29UNB_20250110T120355_L2A/AOT.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Aerosol optical thickness (AOT)", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.001, + "offset": 0 + } + ], + "file:checksum": "1220d0ded0020d1500a07a3ceb31efefb0b7e06474d74223f4a30d714bfdfa26faf7", + "file:size": 123669, + "roles": [ + "data" + ] + }, + "coastal": { + "href": 
"https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/NB/2025/1/S2B_T29UNB_20250110T120355_L2A/B01.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Coastal - 60m", + "eo:bands": [ + { + "name": "B01", + "common_name": "coastal", + "center_wavelength": 0.443, + "full_width_half_max": 0.027 + } + ], + "gsd": 60, + "proj:shape": [1830, 1830], + "proj:transform": [60, 0, 499980, 0, -60, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 60, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "122002f9d96b4938ef7381d908139deffbc922903f7e59cba1c43f746f55dca19b66", + "file:size": 306561, + "roles": [ + "data", + "reflectance" + ] + }, + "nir09": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/NB/2025/1/S2B_T29UNB_20250110T120355_L2A/B09.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "NIR 3 - 60m", + "eo:bands": [ + { + "name": "B09", + "common_name": "nir09", + "center_wavelength": 0.945, + "full_width_half_max": 0.026 + } + ], + "gsd": 60, + "proj:shape": [1830, 1830], + "proj:transform": [60, 0, 499980, 0, -60, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 60, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220df4b77e33fe71a118c587ddde0e8b1db4a2507dc246c3c7678330e2a40b9acd9", + "file:size": 322261, + "roles": [ + "data", + "reflectance" + ] + }, + "cloud": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/NB/2025/1/S2B_T29UNB_20250110T120355_L2A/CLD_20m.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Cloud Probabilities", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 20 + } + ], + "file:checksum": "1220a3d40b111608fa4ca822a1eaf45d94e7c122e942b8378374484cdd907401f755", + "file:size": 116319, + "roles": [ + "data", + "cloud" + ] + }, + "snow": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/NB/2025/1/S2B_T29UNB_20250110T120355_L2A/SNW_20m.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Snow Probabilities", + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 499980, 0, -20, 6200040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 20 + } + ], + "file:checksum": "1220ad48781d7c7fd26e277f85a5cb6e9c786d4f4231a494a77954790b7c993e4a2a", + "file:size": 53931, + "roles": [ + "data", + "snow-ice" + ] + }, + "preview": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/NB/2025/1/S2B_T29UNB_20250110T120355_L2A/L2A_PVI.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "True color preview", + "eo:bands": [ + { + "name": "B04", + "common_name": "red", + "center_wavelength": 0.665, + "full_width_half_max": 0.038 + }, + { + "name": "B03", + "common_name": "green", + "center_wavelength": 0.56, + "full_width_half_max": 0.045 + }, + { + "name": "B02", + "common_name": "blue", + "center_wavelength": 0.49, + "full_width_half_max": 0.098 + } + ], + "file:checksum": "1220fe64e947665671aa18aaaa003c9aee5c29fe8cc23ecb611255003623f6d33aac", + "file:size": 19984, + "roles": [ + 
"overview" + ] + }, + "granule_metadata": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/NB/2025/1/S2B_T29UNB_20250110T120355_L2A/metadata.xml", + "type": "application/xml", + "file:checksum": "1220c3421f3b8734d926d99797954bdbb5043e5e2a8f985f84c3a1f8a66cbc0db831", + "file:size": 103367, + "roles": [ + "metadata" + ] + }, + "tileinfo_metadata": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/NB/2025/1/S2B_T29UNB_20250110T120355_L2A/tileInfo.json", + "type": "application/json", + "file:checksum": "12208512e1fe6787eaaad288dae43ad3c0c5cc17ce96eca14a5a7038c7b592215753", + "file:size": 1483, + "roles": [ + "metadata" + ] + }, + "product_metadata": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/NB/2025/1/S2B_T29UNB_20250110T120355_L2A/product_metadata.xml", + "type": "application/xml", + "file:checksum": "1220e04a251e328f357ce7c5c94372db4eb36ade7254870d997547795ed3087b7c3d", + "file:size": 54565, + "roles": [ + "metadata" + ] + }, + "thumbnail": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/U/NB/2025/1/S2B_T29UNB_20250110T120355_L2A/L2A_PVI.jpg", + "type": "image/jpeg", + "title": "Thumbnail of preview image", + "file:checksum": "1220df8bda6af78d169612f7fb7dff03dea50a780a19c124b033df1ed036db158db4", + "file:size": 4219, + "roles": [ + "thumbnail" + ] + } + }, + "bbox": [-9.000304, 55.41585, -8.669644, 55.945725], + "stac_extensions": [ + "https://stac-extensions.github.io/eo/v1.1.0/schema.json", + "https://stac-extensions.github.io/file/v2.1.0/schema.json", + "https://stac-extensions.github.io/grid/v1.1.0/schema.json", + "https://stac-extensions.github.io/mgrs/v1.0.0/schema.json", + "https://stac-extensions.github.io/processing/v1.1.0/schema.json", + "https://stac-extensions.github.io/projection/v1.1.0/schema.json", + "https://stac-extensions.github.io/raster/v1.1.0/schema.json", + "https://stac-extensions.github.io/sentinel-2/v1.0.0/schema.json", + "https://stac-extensions.github.io/storage/v1.0.0/schema.json", + "https://stac-extensions.github.io/view/v1.0.0/schema.json" + ], + "collection": "sentinel-2-c1-l2a" + }, + { + "type": "Feature", + "stac_version": "1.0.0", + "id": "S2B_T29VPE_20250110T120355_L2A", + "properties": { + "created": "2025-01-10T16:23:15.779Z", + "platform": "sentinel-2b", + "constellation": "sentinel-2", + "instruments": [ + "msi" + ], + "eo:cloud_cover": 11.435952, + "proj:epsg": 32629, + "proj:centroid": { + "lat": 58.41245, + "lon": -7.14589 + }, + "mgrs:utm_zone": 29, + "mgrs:latitude_band": "V", + "mgrs:grid_square": "PE", + "grid:code": "MGRS-29VPE", + "view:azimuth": 297.497570529651, + "view:incidence_angle": 11.1788806985056, + "view:sun_azimuth": 173.432089910917, + "view:sun_elevation": 9.7935341997534, + "s2:tile_id": "S2B_OPER_MSI_L2A_TL_2BPS_20250110T154043_A040990_T29VPE_N05.11", + "s2:degraded_msi_data_percentage": 0, + "s2:nodata_pixel_percentage": 92.532736, + "s2:saturated_defective_pixel_percentage": 0, + "s2:cloud_shadow_percentage": 0, + "s2:vegetation_percentage": 0, + "s2:not_vegetated_percentage": 0.000044, + "s2:water_percentage": 88.564003, + "s2:unclassified_percentage": 0, + "s2:medium_proba_clouds_percentage": 1.015978, + "s2:high_proba_clouds_percentage": 0.477642, + "s2:thin_cirrus_percentage": 9.942332, + "s2:snow_ice_percentage": 0, + "s2:product_type": "S2MSI2A", + "s2:processing_baseline": "05.11", + "s2:product_uri": 
"S2B_MSIL2A_20250110T120359_N0511_R066_T29VPE_20250110T154043.SAFE", + "s2:generation_time": "2025-01-10T15:40:43.000000Z", + "s2:datatake_id": "GS2B_20250110T120359_040990_N05.11", + "s2:datatake_type": "INS-NOBS", + "s2:datastrip_id": "S2B_OPER_MSI_L2A_DS_2BPS_20250110T154043_S20250110T120355_N05.11", + "s2:reflectance_conversion_factor": 1.03425891111326, + "datetime": "2025-01-10T12:05:06.060000Z", + "earthsearch:payload_id": "roda-sentinel-2-c1-l2a/workflow-sentinel-2-c1-l2a-to-stac/e2bcd39fde0f241cb4b72e32d0fab57c", + "storage:platform": "AWS", + "storage:region": "us-west-2", + "storage:requester_pays": false, + "processing:software": { + "sentinel-2-c1-l2a-to-stac": "v2024.02.01" + }, + "updated": "2025-01-10T16:23:15.779Z" + }, + "geometry": { + "type": "Polygon", + "coordinates": [ + [ + [-7.27777230636626, 58.629120925364], + [-7.30877351638107, 57.985540520195], + [-6.85111741280027, 58.6227000712756], + [-7.27777230636626, 58.629120925364] + ] + ] + }, + "links": [ + { + "rel": "self", + "type": "application/geo+json", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a/items/S2B_T29VPE_20250110T120355_L2A" + }, + { + "rel": "canonical", + "href": "s3://e84-earth-search-sentinel-data/sentinel-2-c1-l2a/29/V/PE/2025/1/S2B_T29VPE_20250110T120355_L2A/S2B_T29VPE_20250110T120355_L2A.json", + "type": "application/json" + }, + { + "rel": "via", + "href": "s3://sentinel-s2-l2a/tiles/29/V/PE/2025/1/10/0/metadata.xml", + "type": "application/xml", + "title": "Granule Metadata in Sinergize RODA Archive" + }, + { + "rel": "parent", + "type": "application/json", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a" + }, + { + "rel": "collection", + "type": "application/json", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a" + }, + { + "rel": "root", + "type": "application/json", + "href": "https://earth-search.aws.element84.com/v1" + }, + { + "rel": "thumbnail", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a/items/S2B_T29VPE_20250110T120355_L2A/thumbnail" + } + ], + "assets": { + "red": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/V/PE/2025/1/S2B_T29VPE_20250110T120355_L2A/B04.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red - 10m", + "eo:bands": [ + { + "name": "B04", + "common_name": "red", + "center_wavelength": 0.665, + "full_width_half_max": 0.038 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 600000, 0, -10, 6500040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220afcf1c0cb6d8ba2a05279e4a67c5c3e15915f1840af95899154c282e389af636", + "file:size": 13978420, + "roles": [ + "data", + "reflectance" + ] + }, + "green": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/V/PE/2025/1/S2B_T29VPE_20250110T120355_L2A/B03.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Green - 10m", + "eo:bands": [ + { + "name": "B03", + "common_name": "green", + "center_wavelength": 0.56, + "full_width_half_max": 0.045 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 600000, 0, -10, 6500040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } 
+ ], + "file:checksum": "122089e64eb2d9546aff1732cd91675186f1d02cc709b24f1541e2be3c1890aca965", + "file:size": 14234068, + "roles": [ + "data", + "reflectance" + ] + }, + "blue": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/V/PE/2025/1/S2B_T29VPE_20250110T120355_L2A/B02.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Blue - 10m", + "eo:bands": [ + { + "name": "B02", + "common_name": "blue", + "center_wavelength": 0.49, + "full_width_half_max": 0.098 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 600000, 0, -10, 6500040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220cf37736e3df661528f6bbe9ca0a835cf8d2781c2549e65057fcfaa47b6c07c8b", + "file:size": 13792914, + "roles": [ + "data", + "reflectance" + ] + }, + "visual": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/V/PE/2025/1/S2B_T29VPE_20250110T120355_L2A/TCI.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "True color image", + "eo:bands": [ + { + "name": "B04", + "common_name": "red", + "center_wavelength": 0.665, + "full_width_half_max": 0.038 + }, + { + "name": "B03", + "common_name": "green", + "center_wavelength": 0.56, + "full_width_half_max": 0.045 + }, + { + "name": "B02", + "common_name": "blue", + "center_wavelength": 0.49, + "full_width_half_max": 0.098 + } + ], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 10 + }, + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 10 + }, + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 10 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 600000, 0, -10, 6500040], + "file:checksum": "1220e1a8602049d4fb2d7e3682a0510a08b0b15e43f0ce758de8d594011941a7f255", + "file:size": 20039038, + "roles": [ + "visual" + ] + }, + "nir": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/V/PE/2025/1/S2B_T29VPE_20250110T120355_L2A/B08.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "NIR 1 - 10m", + "eo:bands": [ + { + "name": "B08", + "common_name": "nir", + "center_wavelength": 0.842, + "full_width_half_max": 0.145 + } + ], + "gsd": 10, + "proj:shape": [10980, 10980], + "proj:transform": [10, 0, 600000, 0, -10, 6500040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 10, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220e00f1a4bc6109c2c0d887962fd3f7cd4b0a83739f1ebba147e40de73ba6e541c", + "file:size": 14086891, + "roles": [ + "data", + "reflectance" + ] + }, + "swir22": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/V/PE/2025/1/S2B_T29VPE_20250110T120355_L2A/B12.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "SWIR 2.2μm - 20m", + "eo:bands": [ + { + "name": "B12", + "common_name": "swir22", + "center_wavelength": 2.19, + "full_width_half_max": 0.242 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 600000, 0, -20, 6500040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": 
"1220ad59be9557ef198922b6fb613b6ed28f469a57a2719868cd6b498cb7a9119a36", + "file:size": 3440114, + "roles": [ + "data", + "reflectance" + ] + }, + "rededge2": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/V/PE/2025/1/S2B_T29VPE_20250110T120355_L2A/B06.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red Edge 2 - 20m", + "eo:bands": [ + { + "name": "B06", + "common_name": "rededge", + "center_wavelength": 0.74, + "full_width_half_max": 0.018 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 600000, 0, -20, 6500040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "122015fe200e3e96f760b07dd5a085148560c4822f9a9907784f397e1a8f9c7840f9", + "file:size": 3555878, + "roles": [ + "data", + "reflectance" + ] + }, + "rededge3": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/V/PE/2025/1/S2B_T29VPE_20250110T120355_L2A/B07.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red Edge 3 - 20m", + "eo:bands": [ + { + "name": "B07", + "common_name": "rededge", + "center_wavelength": 0.783, + "full_width_half_max": 0.028 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 600000, 0, -20, 6500040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220e5beec733f2675a692cdb0a1c4029712965030cfac0afa9ef2a9cb23184210f8", + "file:size": 3620180, + "roles": [ + "data", + "reflectance" + ] + }, + "rededge1": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/V/PE/2025/1/S2B_T29VPE_20250110T120355_L2A/B05.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Red Edge 1 - 20m", + "eo:bands": [ + { + "name": "B05", + "common_name": "rededge", + "center_wavelength": 0.704, + "full_width_half_max": 0.019 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 600000, 0, -20, 6500040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12201da5a52ba45a38f0e61f58490bfa7a9e650d5afa08b7f8350850715edb74a35b", + "file:size": 3561439, + "roles": [ + "data", + "reflectance" + ] + }, + "swir16": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/V/PE/2025/1/S2B_T29VPE_20250110T120355_L2A/B11.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "SWIR 1.6μm - 20m", + "eo:bands": [ + { + "name": "B11", + "common_name": "swir16", + "center_wavelength": 1.61, + "full_width_half_max": 0.143 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 600000, 0, -20, 6500040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12203f8f3d0c9672452094c2de1bc2d045b16216074ef10c8c50f2de4cf85fd62f20", + "file:size": 3339112, + "roles": [ + "data", + "reflectance" + ] + }, + "wvp": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/V/PE/2025/1/S2B_T29VPE_20250110T120355_L2A/WVP.tif", + "type": "image/tiff; application=geotiff; 
profile=cloud-optimized", + "title": "Water Vapour (WVP)", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 600000, 0, -20, 6500040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "unit": "cm", + "scale": 0.001, + "offset": 0 + } + ], + "file:checksum": "1220cfc4488c1c1f04f077f62aab5a1f1aed5c8246e32e31c15bffbcdf8a575aaefe", + "file:size": 113489, + "roles": [ + "data" + ] + }, + "nir08": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/V/PE/2025/1/S2B_T29VPE_20250110T120355_L2A/B8A.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "NIR 2 - 20m", + "eo:bands": [ + { + "name": "B8A", + "common_name": "nir08", + "center_wavelength": 0.865, + "full_width_half_max": 0.033 + } + ], + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 600000, 0, -20, 6500040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12207fa89e2f2e5509cb3bb46a5798bf3d10bfc77e0db99b4099705e7ba414a673d4", + "file:size": 3571061, + "roles": [ + "data", + "reflectance" + ] + }, + "scl": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/V/PE/2025/1/S2B_T29VPE_20250110T120355_L2A/SCL.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Scene classification map (SCL)", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 600000, 0, -20, 6500040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 20 + } + ], + "file:checksum": "12207e6a0165b890c165ff479833371ffd58e090f76ab592ad6eee4315671ae49481", + "file:size": 134774, + "roles": [ + "data" + ] + }, + "aot": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/V/PE/2025/1/S2B_T29VPE_20250110T120355_L2A/AOT.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Aerosol optical thickness (AOT)", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 600000, 0, -20, 6500040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 20, + "scale": 0.001, + "offset": 0 + } + ], + "file:checksum": "12203c6ac06eb7a3d9d51ec7681002b16eae54e57856d7b28f14770cbf31fb882782", + "file:size": 118913, + "roles": [ + "data" + ] + }, + "coastal": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/V/PE/2025/1/S2B_T29VPE_20250110T120355_L2A/B01.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Coastal - 60m", + "eo:bands": [ + { + "name": "B01", + "common_name": "coastal", + "center_wavelength": 0.443, + "full_width_half_max": 0.027 + } + ], + "gsd": 60, + "proj:shape": [1830, 1830], + "proj:transform": [60, 0, 600000, 0, -60, 6500040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 60, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "12204dcef553ab0c337169780220b31ec9b1a24d778438cc90c9eae23a08dc7a1fb3", + "file:size": 371951, + "roles": [ + "data", + "reflectance" + ] + }, + "nir09": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/V/PE/2025/1/S2B_T29VPE_20250110T120355_L2A/B09.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": 
"NIR 3 - 60m", + "eo:bands": [ + { + "name": "B09", + "common_name": "nir09", + "center_wavelength": 0.945, + "full_width_half_max": 0.026 + } + ], + "gsd": 60, + "proj:shape": [1830, 1830], + "proj:transform": [60, 0, 600000, 0, -60, 6500040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint16", + "spatial_resolution": 60, + "scale": 0.0001, + "offset": -0.1 + } + ], + "file:checksum": "1220fd8f1fb2de99745bc4fbda52099ee12f21cc147f02b6ff946c6422a88bf0fedd", + "file:size": 360501, + "roles": [ + "data", + "reflectance" + ] + }, + "cloud": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/V/PE/2025/1/S2B_T29VPE_20250110T120355_L2A/CLD_20m.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Cloud Probabilities", + "gsd": 20, + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 600000, 0, -20, 6500040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 20 + } + ], + "file:checksum": "122054a47012552753b4e55a873394377da8c2e1264c613cdb9fcf92e8a4dfc7fbc7", + "file:size": 89971, + "roles": [ + "data", + "cloud" + ] + }, + "snow": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/V/PE/2025/1/S2B_T29VPE_20250110T120355_L2A/SNW_20m.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "Snow Probabilities", + "proj:shape": [5490, 5490], + "proj:transform": [20, 0, 600000, 0, -20, 6500040], + "raster:bands": [ + { + "nodata": 0, + "data_type": "uint8", + "spatial_resolution": 20 + } + ], + "file:checksum": "1220bd2dd2207c60d18109116b1a1725ac526fd6101b1ce16e8c352b0c58b0d89b22", + "file:size": 53931, + "roles": [ + "data", + "snow-ice" + ] + }, + "preview": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/V/PE/2025/1/S2B_T29VPE_20250110T120355_L2A/L2A_PVI.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "True color preview", + "eo:bands": [ + { + "name": "B04", + "common_name": "red", + "center_wavelength": 0.665, + "full_width_half_max": 0.038 + }, + { + "name": "B03", + "common_name": "green", + "center_wavelength": 0.56, + "full_width_half_max": 0.045 + }, + { + "name": "B02", + "common_name": "blue", + "center_wavelength": 0.49, + "full_width_half_max": 0.098 + } + ], + "file:checksum": "122065d3085676567c18174d812abb82f2823a98b2442d3013b3a2ab05af3ae5d892", + "file:size": 23139, + "roles": [ + "overview" + ] + }, + "granule_metadata": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/V/PE/2025/1/S2B_T29VPE_20250110T120355_L2A/metadata.xml", + "type": "application/xml", + "file:checksum": "1220c057f7cfd0855e31ead555fe7f3adc705757a2ed7ba69b6ec90f392c6d9afb77", + "file:size": 104646, + "roles": [ + "metadata" + ] + }, + "tileinfo_metadata": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/V/PE/2025/1/S2B_T29VPE_20250110T120355_L2A/tileInfo.json", + "type": "application/json", + "file:checksum": "122085c2343946f211d76226554f92b456be62783d29a427f4e9f81fd47d9baa492e", + "file:size": 1481, + "roles": [ + "metadata" + ] + }, + "product_metadata": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/V/PE/2025/1/S2B_T29VPE_20250110T120355_L2A/product_metadata.xml", + "type": "application/xml", + "file:checksum": 
"1220ec3afeb78300267b27f5a3707163883d208fc20b23bb0feacc85f1ad69b9a5d3", + "file:size": 54561, + "roles": [ + "metadata" + ] + }, + "thumbnail": { + "href": "https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/29/V/PE/2025/1/S2B_T29VPE_20250110T120355_L2A/L2A_PVI.jpg", + "type": "image/jpeg", + "title": "Thumbnail of preview image", + "file:checksum": "1220e2bff46670175ca4680698e6f65659767c331ed35c51865fdc46267bd4096960", + "file:size": 3710, + "roles": [ + "thumbnail" + ] + } + }, + "bbox": [-7.308774, 57.985541, -6.851117, 58.629121], + "stac_extensions": [ + "https://stac-extensions.github.io/eo/v1.1.0/schema.json", + "https://stac-extensions.github.io/file/v2.1.0/schema.json", + "https://stac-extensions.github.io/grid/v1.1.0/schema.json", + "https://stac-extensions.github.io/mgrs/v1.0.0/schema.json", + "https://stac-extensions.github.io/processing/v1.1.0/schema.json", + "https://stac-extensions.github.io/projection/v1.1.0/schema.json", + "https://stac-extensions.github.io/raster/v1.1.0/schema.json", + "https://stac-extensions.github.io/sentinel-2/v1.0.0/schema.json", + "https://stac-extensions.github.io/storage/v1.0.0/schema.json", + "https://stac-extensions.github.io/view/v1.0.0/schema.json" + ], + "collection": "sentinel-2-c1-l2a" + } + ], + "links": [ + { + "rel": "next", + "title": "Next page of Items", + "method": "GET", + "type": "application/geo+json", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a/items?collections=sentinel-2-c1-l2a&next=2025-01-10T12%3A05%3A06.060000Z%2CS2B_T29VPE_20250110T120355_L2A%2Csentinel-2-c1-l2a" + }, + { + "rel": "root", + "type": "application/json", + "href": "https://earth-search.aws.element84.com/v1" + }, + { + "rel": "self", + "type": "application/geo+json", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a/items" + }, + { + "rel": "collection", + "type": "application/json", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-c1-l2a" + } + ] +} diff --git a/spark/common/src/test/resources/datasource_stac/collection.json b/spark/common/src/test/resources/datasource_stac/collection.json new file mode 100644 index 0000000000..4fbe98d46b --- /dev/null +++ b/spark/common/src/test/resources/datasource_stac/collection.json @@ -0,0 +1,142 @@ +{ + "id": "simple-collection", + "type": "Collection", + "stac_extensions": [ + "https://stac-extensions.github.io/eo/v2.0.0/schema.json", + "https://stac-extensions.github.io/projection/v2.0.0/schema.json", + "https://stac-extensions.github.io/view/v1.0.0/schema.json" + ], + "stac_version": "1.1.0", + "description": "A simple collection demonstrating core catalog fields with links to a couple of items", + "title": "Simple Example Collection", + "keywords": [ + "simple", + "example", + "collection" + ], + "providers": [ + { + "name": "Remote Data, Inc", + "description": "Producers of awesome spatiotemporal assets", + "roles": [ + "producer", + "processor" + ], + "url": "http://remotedata.io" + } + ], + "extent": { + "spatial": { + "bbox": [ + [ + 172.91173669923782, + 1.3438851951615003, + 172.95469614953714, + 1.3690476620161975 + ] + ] + }, + "temporal": { + "interval": [ + [ + "2020-12-11T22:38:32.125Z", + "2020-12-14T18:02:31.437Z" + ] + ] + } + }, + "license": "CC-BY-4.0", + "summaries": { + "platform": [ + "cool_sat1", + "cool_sat2" + ], + "constellation": [ + "ion" + ], + "instruments": [ + "cool_sensor_v1", + "cool_sensor_v2" + ], + "gsd": { + "minimum": 0.512, + "maximum": 0.66 + }, + 
"eo:cloud_cover": { + "minimum": 1.2, + "maximum": 1.2 + }, + "proj:cpde": [ + "EPSG:32659" + ], + "view:sun_elevation": { + "minimum": 54.9, + "maximum": 54.9 + }, + "view:off_nadir": { + "minimum": 3.8, + "maximum": 3.8 + }, + "view:sun_azimuth": { + "minimum": 135.7, + "maximum": 135.7 + }, + "statistics": { + "type": "object", + "properties": { + "vegetation": { + "description": "Percentage of pixels that are detected as vegetation, e.g. forests, grasslands, etc.", + "minimum": 0, + "maximum": 100 + }, + "water": { + "description": "Percentage of pixels that are detected as water, e.g. rivers, oceans and ponds.", + "minimum": 0, + "maximum": 100 + }, + "urban": { + "description": "Percentage of pixels that detected as urban, e.g. roads and buildings.", + "minimum": 0, + "maximum": 100 + } + } + } + }, + "links": [ + { + "rel": "root", + "href": "./collection.json", + "type": "application/json", + "title": "Simple Example Collection" + }, + { + "rel": "item", + "href": "./simple-item.json", + "type": "application/geo+json", + "title": "Simple Item" + }, + { + "rel": "item", + "href": "./core-item.json", + "type": "application/geo+json", + "title": "Core Item" + }, + { + "rel": "item", + "href": "./extended-item.json", + "type": "application/geo+json", + "title": "Extended Item" + }, + { + "rel": "self", + "href": "https://raw.githubusercontent.com/radiantearth/stac-spec/v1.1.0/examples/collection.json", + "type": "application/json" + }, + { + "rel": "child", + "href": "./nested/nested-collection.json", + "type": "application/json", + "title": "Nested Collection" + } + ] +} diff --git a/spark/common/src/test/resources/datasource_stac/core-item.json b/spark/common/src/test/resources/datasource_stac/core-item.json new file mode 100644 index 0000000000..e151b1353c --- /dev/null +++ b/spark/common/src/test/resources/datasource_stac/core-item.json @@ -0,0 +1,125 @@ +{ + "stac_version": "1.1.0", + "stac_extensions": [], + "type": "Feature", + "id": "20201211_223832_CS2", + "bbox": [ + 172.91173669923782, + 1.3438851951615003, + 172.95469614953714, + 1.3690476620161975 + ], + "geometry": { + "type": "Polygon", + "coordinates": [ + [ + [ + 172.91173669923782, + 1.3438851951615003 + ], + [ + 172.95469614953714, + 1.3438851951615003 + ], + [ + 172.95469614953714, + 1.3690476620161975 + ], + [ + 172.91173669923782, + 1.3690476620161975 + ], + [ + 172.91173669923782, + 1.3438851951615003 + ] + ] + ] + }, + "properties": { + "title": "Core Item", + "description": "A sample STAC Item that includes examples of all common metadata", + "datetime": null, + "start_datetime": "2020-12-11T22:38:32.125Z", + "end_datetime": "2020-12-11T22:38:32.327Z", + "created": "2020-12-12T01:48:13.725Z", + "updated": "2020-12-12T01:48:13.725Z", + "platform": "cool_sat1", + "instruments": [ + "cool_sensor_v1" + ], + "constellation": "ion", + "mission": "collection 5624", + "gsd": 0.512 + }, + "collection": "simple-collection", + "links": [ + { + "rel": "collection", + "href": "./collection.json", + "type": "application/json", + "title": "Simple Example Collection" + }, + { + "rel": "root", + "href": "./collection.json", + "type": "application/json", + "title": "Simple Example Collection" + }, + { + "rel": "parent", + "href": "./collection.json", + "type": "application/json", + "title": "Simple Example Collection" + }, + { + "rel": "alternate", + "type": "text/html", + "href": "http://remotedata.io/catalog/20201211_223832_CS2/index.html", + "title": "HTML version of this STAC Item" + } + ], + "assets": { + "analytic": { + 
"href": "https://storage.googleapis.com/open-cogs/stac-examples/20201211_223832_CS2_analytic.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "4-Band Analytic", + "roles": [ + "data" + ] + }, + "thumbnail": { + "href": "https://storage.googleapis.com/open-cogs/stac-examples/20201211_223832_CS2.jpg", + "title": "Thumbnail", + "type": "image/png", + "roles": [ + "thumbnail" + ] + }, + "visual": { + "href": "https://storage.googleapis.com/open-cogs/stac-examples/20201211_223832_CS2.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "3-Band Visual", + "roles": [ + "visual" + ] + }, + "udm": { + "href": "https://storage.googleapis.com/open-cogs/stac-examples/20201211_223832_CS2_analytic_udm.tif", + "title": "Unusable Data Mask", + "type": "image/tiff; application=geotiff" + }, + "json-metadata": { + "href": "http://remotedata.io/catalog/20201211_223832_CS2/extended-metadata.json", + "title": "Extended Metadata", + "type": "application/json", + "roles": [ + "metadata" + ] + }, + "ephemeris": { + "href": "http://cool-sat.com/catalog/20201211_223832_CS2/20201211_223832_CS2.EPH", + "title": "Satellite Ephemeris Metadata" + } + } +} diff --git a/spark/common/src/test/resources/datasource_stac/extended-item.json b/spark/common/src/test/resources/datasource_stac/extended-item.json new file mode 100644 index 0000000000..b5f3a0a9df --- /dev/null +++ b/spark/common/src/test/resources/datasource_stac/extended-item.json @@ -0,0 +1,210 @@ +{ + "stac_version": "1.1.0", + "stac_extensions": [ + "https://stac-extensions.github.io/eo/v2.0.0/schema.json", + "https://stac-extensions.github.io/projection/v2.0.0/schema.json", + "https://stac-extensions.github.io/scientific/v1.0.0/schema.json", + "https://stac-extensions.github.io/view/v1.0.0/schema.json", + "https://stac-extensions.github.io/remote-data/v1.0.0/schema.json" + ], + "type": "Feature", + "id": "20201211_223832_CS2", + "bbox": [ + 172.91173669923782, + 1.3438851951615003, + 172.95469614953714, + 1.3690476620161975 + ], + "geometry": { + "type": "Polygon", + "coordinates": [ + [ + [ + 172.91173669923782, + 1.3438851951615003 + ], + [ + 172.95469614953714, + 1.3438851951615003 + ], + [ + 172.95469614953714, + 1.3690476620161975 + ], + [ + 172.91173669923782, + 1.3690476620161975 + ], + [ + 172.91173669923782, + 1.3438851951615003 + ] + ] + ] + }, + "properties": { + "title": "Extended Item", + "description": "A sample STAC Item that includes a variety of examples from the stable extensions", + "keywords": [ + "extended", + "example", + "item" + ], + "datetime": "2020-12-14T18:02:31.437000Z", + "created": "2020-12-15T01:48:13.725Z", + "updated": "2020-12-15T01:48:13.725Z", + "platform": "cool_sat2", + "instruments": [ + "cool_sensor_v2" + ], + "gsd": 0.66, + "eo:cloud_cover": 1.2, + "eo:snow_cover": 0, + "statistics": { + "vegetation": 12.57, + "water": 1.23, + "urban": 26.2 + }, + "proj:code": "EPSG:32659", + "proj:shape": [ + 5558, + 9559 + ], + "proj:transform": [ + 0.5, + 0, + 712710, + 0, + -0.5, + 151406, + 0, + 0, + 1 + ], + "view:sun_elevation": 54.9, + "view:off_nadir": 3.8, + "view:sun_azimuth": 135.7, + "rd:type": "scene", + "rd:anomalous_pixels": 0.14, + "rd:earth_sun_distance": 1.014156, + "rd:sat_id": "cool_sat2", + "rd:product_level": "LV3A", + "sci:doi": "10.5061/dryad.s2v81.2/27.2" + }, + "collection": "simple-collection", + "links": [ + { + "rel": "collection", + "href": "./collection.json", + "type": "application/json", + "title": "Simple Example Collection" + }, + 
{ + "rel": "root", + "href": "./collection.json", + "type": "application/json", + "title": "Simple Example Collection" + }, + { + "rel": "parent", + "href": "./collection.json", + "type": "application/json", + "title": "Simple Example Collection" + }, + { + "rel": "alternate", + "type": "text/html", + "href": "http://remotedata.io/catalog/20201211_223832_CS2/index.html", + "title": "HTML version of this STAC Item" + } + ], + "assets": { + "analytic": { + "href": "https://storage.googleapis.com/open-cogs/stac-examples/20201211_223832_CS2_analytic.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "4-Band Analytic", + "roles": [ + "data" + ], + "bands": [ + { + "name": "band1", + "eo:common_name": "blue", + "eo:center_wavelength": 0.47, + "eo:full_width_half_max": 70 + }, + { + "name": "band2", + "eo:common_name": "green", + "eo:center_wavelength": 0.56, + "eo:full_width_half_max": 80 + }, + { + "name": "band3", + "eo:common_name": "red", + "eo:center_wavelength": 0.645, + "eo:full_width_half_max": 90 + }, + { + "name": "band4", + "eo:common_name": "nir", + "eo:center_wavelength": 0.8, + "eo:full_width_half_max": 152 + } + ] + }, + "thumbnail": { + "href": "https://storage.googleapis.com/open-cogs/stac-examples/20201211_223832_CS2.jpg", + "title": "Thumbnail", + "type": "image/png", + "roles": [ + "thumbnail" + ] + }, + "visual": { + "href": "https://storage.googleapis.com/open-cogs/stac-examples/20201211_223832_CS2.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "3-Band Visual", + "roles": [ + "visual" + ], + "bands": [ + { + "name": "band3", + "eo:common_name": "red", + "eo:center_wavelength": 0.645, + "eo:full_width_half_max": 90 + }, + { + "name": "band2", + "eo:common_name": "green", + "eo:center_wavelength": 0.56, + "eo:full_width_half_max": 80 + }, + { + "name": "band1", + "eo:common_name": "blue", + "eo:center_wavelength": 0.47, + "eo:full_width_half_max": 70 + } + ] + }, + "udm": { + "href": "https://storage.googleapis.com/open-cogs/stac-examples/20201211_223832_CS2_analytic_udm.tif", + "title": "Unusable Data Mask", + "type": "image/tiff; application=geotiff" + }, + "json-metadata": { + "href": "http://remotedata.io/catalog/20201211_223832_CS2/extended-metadata.json", + "title": "Extended Metadata", + "type": "application/json", + "roles": [ + "metadata" + ] + }, + "ephemeris": { + "href": "http://cool-sat.com/catalog/20201211_223832_CS2/20201211_223832_CS2.EPH", + "title": "Satellite Ephemeris Metadata" + } + } +} \ No newline at end of file diff --git a/spark/common/src/test/resources/datasource_stac/nested/nested-collection.json b/spark/common/src/test/resources/datasource_stac/nested/nested-collection.json new file mode 100644 index 0000000000..eac6a0ea84 --- /dev/null +++ b/spark/common/src/test/resources/datasource_stac/nested/nested-collection.json @@ -0,0 +1,130 @@ +{ + "id": "nested-collection", + "type": "Collection", + "stac_extensions": [ + "https://stac-extensions.github.io/eo/v2.0.0/schema.json", + "https://stac-extensions.github.io/projection/v2.0.0/schema.json", + "https://stac-extensions.github.io/view/v1.0.0/schema.json" + ], + "stac_version": "1.1.0", + "description": "A nested collection demonstrating core catalog fields with links to an item and items", + "title": "Nested Example Collection", + "keywords": [ + "nested", + "example", + "collection" + ], + "providers": [ + { + "name": "Remote Data, Inc", + "description": "Producers of awesome spatiotemporal assets", + "roles": [ + 
"producer", + "processor" + ], + "url": "http://remotedata.io" + } + ], + "extent": { + "spatial": { + "bbox": [ + [ + 17.91173669923782, + 10.3438851951615003, + 17.95469614953714, + 10.3690476620161975 + ] + ] + }, + "temporal": { + "interval": [ + [ + "2020-12-11T22:38:32.125Z", + "2020-12-14T18:02:31.437Z" + ] + ] + } + }, + "license": "CC-BY-4.0", + "summaries": { + "platform": [ + "cool_sat1", + "cool_sat2" + ], + "constellation": [ + "ion" + ], + "instruments": [ + "cool_sensor_v1", + "cool_sensor_v2" + ], + "gsd": { + "minimum": 0.512, + "maximum": 0.66 + }, + "eo:cloud_cover": { + "minimum": 1.2, + "maximum": 1.2 + }, + "proj:cpde": [ + "EPSG:32659" + ], + "view:sun_elevation": { + "minimum": 54.9, + "maximum": 54.9 + }, + "view:off_nadir": { + "minimum": 3.8, + "maximum": 3.8 + }, + "view:sun_azimuth": { + "minimum": 135.7, + "maximum": 135.7 + }, + "statistics": { + "type": "object", + "properties": { + "vegetation": { + "description": "Percentage of pixels that are detected as vegetation, e.g. forests, grasslands, etc.", + "minimum": 0, + "maximum": 100 + }, + "water": { + "description": "Percentage of pixels that are detected as water, e.g. rivers, oceans and ponds.", + "minimum": 0, + "maximum": 100 + }, + "urban": { + "description": "Percentage of pixels that detected as urban, e.g. roads and buildings.", + "minimum": 0, + "maximum": 100 + } + } + } + }, + "links": [ + { + "rel": "root", + "href": "./nested-collection.json", + "type": "application/json", + "title": "Nested Example Collection" + }, + { + "rel": "item", + "href": "./nested-item.json", + "type": "application/geo+json", + "title": "Nested Item" + }, + { + "rel": "items", + "href": "./nested-items.json", + "type": "application/geo+json", + "title": "Nested Items" + }, + { + "rel": "self", + "href": "https://raw.githubusercontent.com/radiantearth/stac-spec/v1.1.0/examples/nested-collection.json", + "type": "application/json" + } + ] +} diff --git a/spark/common/src/test/resources/datasource_stac/nested/nested-item.json b/spark/common/src/test/resources/datasource_stac/nested/nested-item.json new file mode 100644 index 0000000000..1a26b4f400 --- /dev/null +++ b/spark/common/src/test/resources/datasource_stac/nested/nested-item.json @@ -0,0 +1,55 @@ +{ + "id": "nested-item", + "type": "Feature", + "stac_version": "1.1.0", + "stac_extensions": [ + "https://stac-extensions.github.io/eo/v2.0.0/schema.json", + "https://stac-extensions.github.io/projection/v2.0.0/schema.json", + "https://stac-extensions.github.io/view/v1.0.0/schema.json" + ], + "bbox": [ + 17.91173669923782, + 10.3438851951615003, + 17.95469614953714, + 10.3690476620161975 + ], + "geometry": { + "type": "Polygon", + "coordinates": [ + [ + [17.91173669923782, 10.3438851951615003], + [17.95469614953714, 10.3438851951615003], + [17.95469614953714, 10.3690476620161975], + [17.91173669923782, 10.3690476620161975], + [17.91173669923782, 10.3438851951615003] + ] + ] + }, + "properties": { + "title": "Nested Item", + "description": "A sample STAC nested Item that includes examples of all common metadata", + "datetime": "2020-12-12T00:00:00Z", + "eo:cloud_cover": 1.2, + "proj:epsg": 32659, + "view:sun_elevation": 54.9, + "view:off_nadir": 3.8, + "view:sun_azimuth": 135.7 + }, + "assets": { + "visual": { + "href": "https://e84-earth-search-sentinel-data.s3/example/visual.tif", + "title": "Visual asset", + "type": "image/tiff; application=geotiff", + "roles": ["visual"] + } + }, + "links": [ + { + "rel": "collection", + "href": "./nested-collection.json", + "type": 
"application/json", + "title": "Nested Example Collection" + } + ] +} + diff --git a/spark/common/src/test/resources/datasource_stac/nested/nested-items.json b/spark/common/src/test/resources/datasource_stac/nested/nested-items.json new file mode 100644 index 0000000000..1c05033775 --- /dev/null +++ b/spark/common/src/test/resources/datasource_stac/nested/nested-items.json @@ -0,0 +1,110 @@ +{ + "type": "FeatureCollection", + "stac_version": "1.1.0", + "stac_extensions": [ + "https://stac-extensions.github.io/eo/v2.0.0/schema.json", + "https://stac-extensions.github.io/projection/v2.0.0/schema.json", + "https://stac-extensions.github.io/view/v1.0.0/schema.json" + ], + "type": "FeatureCollection", + "features": [ + { + "stac_version": "1.1.0", + "id": "nested-item-1", + "type": "Feature", + "bbox": [ + 17.91173669923782, + 10.3438851951615003, + 17.95469614953714, + 10.3690476620161975 + ], + "geometry": { + "type": "Polygon", + "coordinates": [ + [ + [17.91173669923782, 10.3438851951615003], + [17.95469614953714, 10.3438851951615003], + [17.95469614953714, 10.3690476620161975], + [17.91173669923782, 10.3690476620161975], + [17.91173669923782, 10.3438851951615003] + ] + ] + }, + "properties": { + "title": "Nested Item 1", + "description": "A sample STAC nested Item that includes examples of all common metadata", + "datetime": "2020-12-12T00:00:00Z", + "eo:cloud_cover": 1.2, + "proj:epsg": 32659, + "view:sun_elevation": 54.9, + "view:off_nadir": 3.8, + "view:sun_azimuth": 135.7 + }, + "assets": { + "visual": { + "href": "http://e84-earth-search-sentinel-data.s3/example/visual1.tif", + "title": "Visual asset 1", + "type": "image/tiff; application=geotiff", + "roles": ["visual"] + } + }, + "links": [ + { + "rel": "collection", + "href": "./nested-collection.json", + "type": "application/json", + "title": "Nested Example Collection" + } + ] + }, + { + "stac_version": "1.1.0", + "id": "nested-item-2", + "type": "Feature", + "bbox": [ + 17.91173669923782, + 10.3438851951615003, + 17.95469614953714, + 10.3690476620161975 + ], + "geometry": { + "type": "Polygon", + "coordinates": [ + [ + [17.91173669923782, 10.3438851951615003], + [17.95469614953714, 10.3438851951615003], + [17.95469614953714, 10.3690476620161975], + [17.91173669923782, 10.3690476620161975], + [17.91173669923782, 10.3438851951615003] + ] + ] + }, + "properties": { + "title": "Nested Item 2", + "description": "A sample STAC nested Item that includes examples of all common metadata", + "datetime": "2020-12-13T00:00:00Z", + "eo:cloud_cover": 1.2, + "proj:epsg": 32659, + "view:sun_elevation": 54.9, + "view:off_nadir": 3.8, + "view:sun_azimuth": 135.7 + }, + "assets": { + "visual": { + "href": "http://e84-earth-search-sentinel-data.s3/example/visual2.tif", + "title": "Visual asset 2", + "type": "image/tiff; application=geotiff", + "roles": ["visual"] + } + }, + "links": [ + { + "rel": "collection", + "href": "./nested-collection.json", + "type": "application/json", + "title": "Nested Example Collection" + } + ] + } + ] +} diff --git a/spark/common/src/test/resources/datasource_stac/simple-item.json b/spark/common/src/test/resources/datasource_stac/simple-item.json new file mode 100644 index 0000000000..277b973462 --- /dev/null +++ b/spark/common/src/test/resources/datasource_stac/simple-item.json @@ -0,0 +1,83 @@ +{ + "stac_version": "1.1.0", + "stac_extensions": [], + "type": "Feature", + "id": "20201211_223832_CS2", + "bbox": [ + 172.91173669923782, + 1.3438851951615003, + 172.95469614953714, + 1.3690476620161975 + ], + "geometry": { 
+ "type": "Polygon", + "coordinates": [ + [ + [ + 172.91173669923782, + 1.3438851951615003 + ], + [ + 172.95469614953714, + 1.3438851951615003 + ], + [ + 172.95469614953714, + 1.3690476620161975 + ], + [ + 172.91173669923782, + 1.3690476620161975 + ], + [ + 172.91173669923782, + 1.3438851951615003 + ] + ] + ] + }, + "properties": { + "title": "Simple Item ", + "description": "A sample STAC nested Item that includes examples of some metadata", + "datetime": "2020-12-11T22:38:32.125000Z" + }, + "collection": "simple-collection", + "links": [ + { + "rel": "collection", + "href": "./collection.json", + "type": "application/json", + "title": "Simple Example Collection" + }, + { + "rel": "root", + "href": "./collection.json", + "type": "application/json", + "title": "Simple Example Collection" + }, + { + "rel": "parent", + "href": "./collection.json", + "type": "application/json", + "title": "Simple Example Collection" + } + ], + "assets": { + "visual": { + "href": "https://storage.googleapis.com/open-cogs/stac-examples/20201211_223832_CS2.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "3-Band Visual", + "roles": [ + "visual" + ] + }, + "thumbnail": { + "href": "https://storage.googleapis.com/open-cogs/stac-examples/20201211_223832_CS2.jpg", + "title": "Thumbnail", + "type": "image/jpeg", + "roles": [ + "thumbnail" + ] + } + } +} diff --git a/spark/common/src/test/scala/org/apache/sedona/sql/GeoStatsSuite.scala b/spark/common/src/test/scala/org/apache/sedona/sql/GeoStatsSuite.scala new file mode 100644 index 0000000000..9567dcc95f --- /dev/null +++ b/spark/common/src/test/scala/org/apache/sedona/sql/GeoStatsSuite.scala @@ -0,0 +1,215 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.sedona.sql + +import org.apache.sedona.stats.Weighting.{addBinaryDistanceBandColumn, addWeightedDistanceBandColumn} +import org.apache.sedona.stats.clustering.DBSCAN.dbscan +import org.apache.sedona.stats.hotspotDetection.GetisOrd.gLocal +import org.apache.sedona.stats.outlierDetection.LocalOutlierFactor.localOutlierFactor +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions.{col, expr, lit} +import org.apache.spark.sql.sedona_sql.expressions.st_functions.{ST_DBSCAN, ST_LocalOutlierFactor} + +class GeoStatsSuite extends TestBaseScala { + private val spark = sparkSession + + case class Record(id: Int, x: Double, y: Double) + + def getData: DataFrame = { + spark + .createDataFrame( + Seq( + Record(10, 1.0, 1.8), + Record(11, 1.0, 1.9), + Record(12, 1.0, 2.0), + Record(13, 1.0, 2.1), + Record(14, 2.0, 2.0), + Record(15, 3.0, 1.9), + Record(16, 3.0, 2.0), + Record(17, 3.0, 2.1), + Record(18, 3.0, 2.2))) + .withColumn("geometry", expr("ST_Point(x, y)")) + } + + it("test dbscan function") { + assert(dbscan(getData.withColumn("sql_results", expr("ST_DBSCAN(geometry, 1.0, 4, false)")), 1.0, 4) + .where("sql_results.cluster = cluster and sql_results.isCore = isCore") + .count() == getData.count()) + } + + it("test dbscan function df method") { + assert(dbscan( + getData.withColumn("sql_results", ST_DBSCAN(col("geometry"), lit(1.0), lit(4), lit(false))), + 1.0, + 4) + .where("sql_results.cluster = cluster and sql_results.isCore = isCore") + .count() == getData.count()) + } + + it("test dbscan function with distance column") { + assert(dbscan( + getData.withColumn("sql_results", expr("ST_DBSCAN(geometry, 1.0, 4, true)")), + 1.0, + 4, + useSpheroid = true) + .where("sql_results.cluster = cluster and sql_results.isCore = isCore") + .count() == getData.count()) + } + + it("test dbscan function with scalar subquery") { + assert(dbscan( + getData.withColumn( + "sql_results", + expr("ST_DBSCAN(geometry, (SELECT ARRAY(1.0, 2.0)[0]), 4, false)")), + 1.0, + 4) + .where("sql_results.cluster = cluster and sql_results.isCore = isCore") + .count() == getData.count()) + } + + it("test dbscan with geom literal") { + val error = intercept[IllegalArgumentException] { + spark.sql("SELECT ST_DBSCAN(ST_GeomFromWKT('POINT(0.0 1.1)'), 1.0, 4, false)").collect() + } + assert( + error + .asInstanceOf[IllegalArgumentException] + .getMessage == "geometry argument must be a named reference to an existing column") + } + + it("test dbscan with minPts variable") { + val error = intercept[IllegalArgumentException] { + getData + .withColumn("result", ST_DBSCAN(col("geometry"), lit(1.0), col("id"), lit(false))) + .collect() + } + + assert( + error + .asInstanceOf[IllegalArgumentException] + .getMessage + .contains("minPts must be a scalar value")) + } + + it("test lof") { + assert(localOutlierFactor( + getData.withColumn("sql_result", expr("ST_LocalOutlierFactor(geometry, 4, false)")), + 4) + .where("sql_result = lof") + .count() == getData.count()) + } + + it("test lof with dataframe method") { + assert(localOutlierFactor( + getData.withColumn( + "sql_result", + ST_LocalOutlierFactor(col("geometry"), lit(4), lit(false))), + 4) + .where("sql_result = lof") + .count() == getData.count()) + } + + it("test geostats function in another function") { + getData + .withColumn("sql_result", expr("SQRT(ST_LocalOutlierFactor(geometry, 4, false))")) + .collect() + } + + it("test DBSCAN with a column named __isCore in input df") { + val exception = intercept[IllegalArgumentException] { + getData + .withColumn("__isCore", lit(1)) +
.withColumn("sql_result", expr("ST_DBSCAN(geometry, 0.1, 4, false)")) + .collect() + } + assert( + exception.getMessage == "requirement failed: __isCore is a reserved name by the dbscan algorithm. Please rename the columns before calling the ST_DBSCAN function.") + } + + it("test ST_BinaryDistanceBandColumn") { + val weightedDf = getData + .withColumn( + "someWeights", + expr( + "array_sort(ST_BinaryDistanceBandColumn(geometry, 1.0, true, true, false, struct(id, geometry)))")) + + val resultsDf = addBinaryDistanceBandColumn( + weightedDf, + 1.0, + true, + true, + savedAttributes = Seq("id", "geometry")) + .withColumn("weights", expr("array_sort(weights)")) + .where("someWeights = weights") + + assert(resultsDf.count == weightedDf.count()) + } + + it("test ST_WeightedDistanceBandColumn") { + val weightedDf = getData + .withColumn( + "someWeights", + expr( + "array_sort(ST_WeightedDistanceBandColumn(geometry, 1.0, -1.0, true, true, 1.0, false, struct(id, geometry)))")) + + val resultsDf = addWeightedDistanceBandColumn( + weightedDf, + 1.0, + -1.0, + true, + true, + savedAttributes = Seq("id", "geometry"), + selfWeight = 1.0) + .withColumn("weights", expr("array_sort(weights)")) + .where("someWeights = weights") + + assert(resultsDf.count == weightedDf.count()) + } + + it("test GI with ST_BinaryDistanceBandColumn") { + val weightedDf = getData + .withColumn( + "someWeights", + expr( + "ST_BinaryDistanceBandColumn(geometry, 1.0, true, true, false, struct(id, geometry))")) + + val giDf = weightedDf + .withColumn("gi", expr("ST_GLocal(id, someWeights, true)")) + assert( + gLocal(giDf, "id", weights = "someWeights", star = true) + .where("G = gi.G") + .count() == weightedDf.count()) + } + + it("test nested ST_Geostats calls with getis ord") { + getData + .withColumn( + "GI", + expr( + "ST_GLocal(id, ST_BinaryDistanceBandColumn(geometry, 1.0, true, true, false, struct(id, geometry)), true)")) + .collect() + } + + it("test ST_Geostats with string column") { + getData + .withColumn("someString", lit("test")) + .withColumn("sql_results", expr("ST_DBSCAN(geometry, 1.0, 4, false)")) + .collect() + } +} diff --git a/spark/common/src/test/scala/org/apache/sedona/sql/KnnJoinSuite.scala b/spark/common/src/test/scala/org/apache/sedona/sql/KnnJoinSuite.scala index 1d6119d02d..ab2c64898a 100644 --- a/spark/common/src/test/scala/org/apache/sedona/sql/KnnJoinSuite.scala +++ b/spark/common/src/test/scala/org/apache/sedona/sql/KnnJoinSuite.scala @@ -209,6 +209,22 @@ class KnnJoinSuite extends TestBaseScala with TableDrivenPropertyChecks { "[1,3][1,6][1,13][1,16][2,1][2,5][2,11][2,15][3,3][3,9][3,13][3,19]") } + it("KNN Join should verify the correct parameter k is passed to the join function") { + val df = sparkSession + .range(0, 1) + .toDF("id") + .withColumn("geom", expr("ST_Point(id, id)")) + .repartition(1) + df.createOrReplaceTempView("df1") + val exception = intercept[IllegalArgumentException] { + sparkSession + .sql(s"SELECT A.ID, B.ID FROM df1 A JOIN df1 B ON ST_KNN(A.GEOM, B.GEOM, 0, false)") + .collect() + } + exception.getMessage should include( + "The number of neighbors (k) must be equal or greater than 1.") + } + it("KNN Join with exact algorithms with additional join conditions on id") { val df = sparkSession.sql( s"SELECT QUERIES.ID, OBJECTS.ID FROM QUERIES JOIN OBJECTS ON ST_KNN(QUERIES.GEOM, OBJECTS.GEOM, 4, false) AND QUERIES.ID > 1") @@ -425,6 +441,23 @@ class KnnJoinSuite extends TestBaseScala with TableDrivenPropertyChecks { resultAll.mkString should be("[0,6][0,7]") } } + + it("KNN Join 
with exact algorithms should not fail with null geometries") { + val df1 = sparkSession.sql( + "SELECT ST_GeomFromText(col1) as geom1 from values ('POINT (0.0 0.0)'), (null)") + val df2 = sparkSession.sql("SELECT ST_Point(0.0, 0.0) as geom2") + df1.cache() + df2.cache() + df1.join(df2, expr("ST_KNN(geom1, geom2, 1)")).count() should be(1) + } + + it("KNN Join with exact algorithms should not fail with empty geometries") { + val df1 = sparkSession.sql("SELECT ST_GeomFromText('POINT EMPTY') as geom1") + val df2 = sparkSession.sql("SELECT ST_Point(0.0, 0.0) as geom2") + df1.cache() + df2.cache() + df1.join(df2, expr("ST_KNN(geom1, geom2, 1)")).count() should be(0) + } } private def withOptimizationMode(mode: String)(body: => Unit): Unit = { diff --git a/spark/common/src/test/scala/org/apache/sedona/sql/PreserveSRIDSuite.scala b/spark/common/src/test/scala/org/apache/sedona/sql/PreserveSRIDSuite.scala index ea6092629f..02fdb3149b 100644 --- a/spark/common/src/test/scala/org/apache/sedona/sql/PreserveSRIDSuite.scala +++ b/spark/common/src/test/scala/org/apache/sedona/sql/PreserveSRIDSuite.scala @@ -54,10 +54,12 @@ class PreserveSRIDSuite extends TestBaseScala with TableDrivenPropertyChecks { ("ST_SimplifyVW(geom1, 0.1)", 1000), ("ST_SimplifyPolygonHull(geom1, 0.5)", 1000), ("ST_SetSRID(geom1, 2000)", 2000), + ("ST_LabelPoint(geom1)", 1000), ("ST_LineMerge(geom2)", 1000), ("ST_StartPoint(geom3)", 1000), ("ST_Snap(geom3, geom3, 0.1)", 1000), ("ST_Boundary(geom1)", 1000), + ("ST_LineSegments(geom3)[0]", 1000), ("ST_LineSubstring(geom3, 0.1, 0.9)", 1000), ("ST_LineInterpolatePoint(geom3, 0.1)", 1000), ("ST_EndPoint(geom3)", 1000), diff --git a/spark/common/src/test/scala/org/apache/sedona/sql/dataFrameAPITestScala.scala b/spark/common/src/test/scala/org/apache/sedona/sql/dataFrameAPITestScala.scala index cf9b8a0f7a..a89af2355d 100644 --- a/spark/common/src/test/scala/org/apache/sedona/sql/dataFrameAPITestScala.scala +++ b/spark/common/src/test/scala/org/apache/sedona/sql/dataFrameAPITestScala.scala @@ -411,6 +411,38 @@ class dataFrameAPITestScala extends TestBaseScala { } // functions + it("Passed ST_LabelPoint") { + var geomDf = sparkSession.sql( + "SELECT ST_GeomFromWKT('POLYGON ((-112.637484 33.440546, -112.546852 33.477209, -112.489177 33.550488, -112.41777 33.751684, -111.956371 33.719707, -111.766868 33.616843, -111.775107 33.527595, -111.640533 33.504695, -111.440044 33.463462, -111.415326 33.374055, -111.514197 33.309809, -111.643279 33.222542, -111.893203 33.174278, -111.96461 33.250109, -112.123903 33.261593, -112.252985 33.35341, -112.406784 33.346527, -112.667694 33.316695, -112.637484 33.440546))') AS geom, 2.0 AS gridResolution, 0.2 AS goodnessThreshold") + var result = geomDf.select( + ST_LabelPoint(col("geom"), col("gridResolution"), col("goodnessThreshold")).as("geom")) + var actualResult = result.take(1)(0).get(0).asInstanceOf[Geometry].toText() + var expected = "POINT (-112.04278737349767 33.46420809489905)" + assertEquals(expected, actualResult) + + geomDf = sparkSession.sql( + "SELECT ST_GeomFromWKT('GEOMETRYCOLLECTION(POLYGON ((-112.840785 33.435962, -112.840785 33.708284, -112.409597 33.708284, -112.409597 33.435962, -112.840785 33.435962)), POLYGON ((-112.309264 33.398167, -112.309264 33.746007, -111.787444 33.746007, -111.787444 33.398167, -112.309264 33.398167)))') AS geom") + geomDf.createOrReplaceTempView("geomDf") + result = geomDf.select(ST_LabelPoint(col("geom"), lit(1)).as("geom")) + actualResult = result.take(1)(0).get(0).asInstanceOf[Geometry].toText() + expected = 
"POINT (-112.04835399999999 33.57208699999999)" + assertEquals(expected, actualResult) + + geomDf = sparkSession.sql( + "SELECT ST_GeomFromWKT('POLYGON ((-112.654072 33.114485, -112.313516 33.653431, -111.63515 33.314399, -111.497829 33.874913, -111.692825 33.431378, -112.376684 33.788215, -112.654072 33.114485))') AS geom, 0.01 AS goodnessThreshold") + geomDf.createOrReplaceTempView("geomDf") + result = + geomDf.select(ST_LabelPoint(col("geom"), lit(2), col("goodnessThreshold")).as("geom")) + actualResult = result.take(1)(0).get(0).asInstanceOf[Geometry].toText() + expected = "POINT (-112.0722602222832 33.53914975012836)" + assertEquals(expected, actualResult) + + result = geomDf.select(ST_LabelPoint(col("geom")).as("geom")) + actualResult = result.take(1)(0).get(0).asInstanceOf[Geometry].toText() + expected = "POINT (-112.0722602222832 33.53914975012836)" + assertEquals(expected, actualResult) + } + it("Passed ST_ConcaveHull") { val baseDF = sparkSession.sql( "SELECT ST_GeomFromWKT('Polygon ((0 0, 1 2, 2 2, 3 2, 5 0, 4 0, 3 1, 2 1, 1 0, 0 0))') as mline") @@ -779,6 +811,25 @@ class dataFrameAPITestScala extends TestBaseScala { assertEquals(expected, actual) } + it("Passed ST_Perimeter2D") { + var baseDf = sparkSession.sql( + "SELECT ST_GeomFromWKT('POLYGON((743238 2967416,743238 2967450,743265 2967450,743265.625 2967416,743238 2967416))') AS geom") + var actual = baseDf.select(ST_Perimeter2D("geom")).first().get(0) + var expected = 122.63074400009504 + assertEquals(expected, actual) + + baseDf = sparkSession.sql( + "SELECT ST_GeomFromWKT('POLYGON ((0 0, 0 1, 1 1, 1 0, 0 0))', 4326) AS geom") + actual = baseDf.select(ST_Perimeter2D("geom", use_spheroid = true)).first().get(0) + expected = 443770.91724830196 + assertEquals(expected, actual) + + actual = + baseDf.select(ST_Perimeter2D("geom", use_spheroid = true, lenient = false)).first().get(0) + expected = 443770.91724830196 + assertEquals(expected, actual) + } + it("Passed ST_Project") { val baseDf = sparkSession.sql( "SELECT ST_GeomFromWKT('POINT(0 0)') as point, ST_MakeEnvelope(0, 1, 2, 0) as poly") @@ -1399,6 +1450,22 @@ class dataFrameAPITestScala extends TestBaseScala { assert(actualRadius == expectedRadius) } + it("Passed ST_LineSegments") { + val baseDf = sparkSession.sql( + "SELECT ST_GeomFromWKT('LINESTRING(120 140, 60 120, 30 20)') AS line, ST_GeomFromWKT('POLYGON ((0 0, 0 1, 1 0, 0 0))') AS poly") + var resultSize = baseDf + .select(ST_LineSegments("line", false)) + .first() + .getAs[WrappedArray[Geometry]](0) + .length + val expected = 2 + assertEquals(expected, resultSize) + + resultSize = + baseDf.select(ST_LineSegments("poly")).first().getAs[WrappedArray[Geometry]](0).length + assertEquals(0, resultSize) + } + it("Passed ST_LineSubstring") { val baseDf = sparkSession.sql("SELECT ST_GeomFromWKT('LINESTRING (0 0, 2 0)') AS line") val df = baseDf.select(ST_LineSubstring("line", 0.5, 1.0)) diff --git a/spark/common/src/test/scala/org/apache/sedona/sql/functionTestScala.scala b/spark/common/src/test/scala/org/apache/sedona/sql/functionTestScala.scala index 82dbff82c1..84770157c7 100644 --- a/spark/common/src/test/scala/org/apache/sedona/sql/functionTestScala.scala +++ b/spark/common/src/test/scala/org/apache/sedona/sql/functionTestScala.scala @@ -49,6 +49,36 @@ class functionTestScala describe("Sedona-SQL Function Test") { + it("Passed ST_LabelPoint") { + var geomDf = sparkSession.sql( + "SELECT ST_GeomFromWKT('POLYGON ((-112.637484 33.440546, -112.546852 33.477209, -112.489177 33.550488, -112.41777 33.751684, -111.956371 
33.719707, -111.766868 33.616843, -111.775107 33.527595, -111.640533 33.504695, -111.440044 33.463462, -111.415326 33.374055, -111.514197 33.309809, -111.643279 33.222542, -111.893203 33.174278, -111.96461 33.250109, -112.123903 33.261593, -112.252985 33.35341, -112.406784 33.346527, -112.667694 33.316695, -112.637484 33.440546))') AS geom, 2 AS gridResolution, 0.2 AS GoodnessThreshold") + geomDf.createOrReplaceTempView("geomDf") + var result = + sparkSession.sql( + "SELECT ST_AsEWKT(ST_LabelPoint(geom, gridResolution, goodnessThreshold)) FROM geomDf") + var expected = "POINT (-112.04278737349767 33.46420809489905)" + assertEquals(expected, result.take(1)(0).get(0).asInstanceOf[String]) + + geomDf = sparkSession.sql( + "SELECT ST_GeomFromWKT('GEOMETRYCOLLECTION(POLYGON ((-112.840785 33.435962, -112.840785 33.708284, -112.409597 33.708284, -112.409597 33.435962, -112.840785 33.435962)), POLYGON ((-112.309264 33.398167, -112.309264 33.746007, -111.787444 33.746007, -111.787444 33.398167, -112.309264 33.398167)))') AS geom") + geomDf.createOrReplaceTempView("geomDf") + result = sparkSession.sql("SELECT ST_AsEWKT(ST_LabelPoint(geom, 1)) FROM geomDf") + expected = "POINT (-112.04835399999999 33.57208699999999)" + assertEquals(expected, result.take(1)(0).get(0).asInstanceOf[String]) + + geomDf = sparkSession.sql( + "SELECT ST_GeomFromWKT('POLYGON ((-112.654072 33.114485, -112.313516 33.653431, -111.63515 33.314399, -111.497829 33.874913, -111.692825 33.431378, -112.376684 33.788215, -112.654072 33.114485))') AS geom, 0.01 AS goodnessThreshold") + geomDf.createOrReplaceTempView("geomDf") + result = sparkSession.sql( + "SELECT ST_AsEWKT(ST_LabelPoint(geom, 2, goodnessThreshold)) FROM geomDf") + expected = "POINT (-112.0722602222832 33.53914975012836)" + assertEquals(expected, result.take(1)(0).get(0).asInstanceOf[String]) + + result = sparkSession.sql("SELECT ST_AsEWKT(ST_LabelPoint(geom)) FROM geomDf") + expected = "POINT (-112.0722602222832 33.53914975012836)" + assertEquals(expected, result.take(1)(0).get(0).asInstanceOf[String]) + } + it("Passed ST_ConcaveHull") { var polygonWktDf = sparkSession.read .format("csv") @@ -613,6 +643,24 @@ class functionTestScala assertEquals(expected, actual) } + it("Passed ST_Perimeter2D") { + var baseDf = sparkSession.sql( + "SELECT ST_GeomFromWKT('POLYGON((743238 2967416,743238 2967450,743265 2967450,743265.625 2967416,743238 2967416))') AS geom") + var actual = baseDf.selectExpr("ST_Perimeter2D(geom)").first().get(0) + var expected = 122.63074400009504 + assertEquals(expected, actual) + + baseDf = sparkSession.sql( + "SELECT ST_GeomFromWKT('POLYGON ((0 0, 0 1, 1 1, 1 0, 0 0))', 4326) AS geom") + actual = baseDf.selectExpr("ST_Perimeter2D(geom, true)").first().get(0) + expected = 443770.91724830196 + assertEquals(expected, actual) + + actual = baseDf.selectExpr("ST_Perimeter2D(geom, true, false)").first().get(0) + expected = 443770.91724830196 + assertEquals(expected, actual) + } + it("Passed ST_Points") { val testtable = sparkSession.sql( @@ -1900,6 +1948,13 @@ class functionTestScala .get(0) expected = "LINESTRING (0 0, 5 5, 2 2)" assertEquals(expected, actual) + + actual = sparkSession + .sql("SELECT ST_AsText(ST_RemoveRepeatedPoints(ST_GeomFromWKT('POLYGON ((40 40, 70 70, 70 70, 40 40))')))") + .first() + .get(0) + expected = "POLYGON ((40 40, 70 70, 70 70, 40 40))" + assertEquals(expected, actual) } it("Should correctly set using ST_SetPoint") { @@ -2293,6 +2348,17 @@ class functionTestScala .toList should contain theSameElementsAs List(0, 1, 1) } + 
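/* Illustrative aside, not part of the patch: the ST_LineSegments and
   ST_Perimeter2D functions exercised in these suites can also be invoked
   through plain Sedona SQL. A minimal sketch, assuming a SparkSession with
   the Sedona extensions already registered (names here are hypothetical):

   val exampleDf = sparkSession.sql(
     "SELECT ST_LineSegments(ST_GeomFromWKT('LINESTRING (0 0, 1 1, 2 2)')) AS segments, " +
       "ST_Perimeter2D(ST_GeomFromWKT('POLYGON ((0 0, 0 1, 1 1, 1 0, 0 0))')) AS perimeter")
   // segments: an array of two 2-point LINESTRINGs; perimeter: 4.0 (planar)
   exampleDf.show(truncate = false)
*/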
it("Should pass ST_LineSegments") { + val baseDf = sparkSession.sql( + "SELECT ST_GeomFromWKT('LINESTRING(120 140, 60 120, 30 20)') AS line, ST_GeomFromWKT('POLYGON ((0 0, 0 1, 1 0, 0 0))') AS poly") + var resultSize = baseDf.selectExpr("array_size(ST_LineSegments(line, false))").first().get(0) + val expected = 2 + assertEquals(expected, resultSize) + + resultSize = baseDf.selectExpr("array_size(ST_LineSegments(poly))").first().get(0) + assertEquals(0, resultSize) + } + it("Should pass ST_LineSubstring") { Given("Sample geometry dataframe") val geometryTable = Seq("LINESTRING(25 50, 100 125, 150 190)") diff --git a/spark/common/src/test/scala/org/apache/sedona/sql/structuredAdapterTestScala.scala b/spark/common/src/test/scala/org/apache/sedona/sql/structuredAdapterTestScala.scala new file mode 100644 index 0000000000..d258ce3b40 --- /dev/null +++ b/spark/common/src/test/scala/org/apache/sedona/sql/structuredAdapterTestScala.scala @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.sedona.sql + +import org.apache.sedona.core.enums.{GridType, IndexType} +import org.apache.sedona.core.spatialOperator.{JoinQuery, SpatialPredicate} +import org.apache.sedona.core.spatialRDD.CircleRDD +import org.apache.spark.sql.functions.spark_partition_id +import org.apache.spark.sql.Row +import org.apache.spark.sql.sedona_sql.adapters.StructuredAdapter +import org.junit.Assert.assertEquals +import org.scalatest.GivenWhenThen + +class structuredAdapterTestScala extends TestBaseScala with GivenWhenThen { + + describe("Structured Adapter") { + it("Should convert DataFrame to SpatialRDD and back") { + val seq = generateTestData() + val geom1 = seq.head._3 + val dfOrigin = sparkSession.createDataFrame(seq) + val rdd = StructuredAdapter.toSpatialRdd(dfOrigin, "_3") + assertGeometryEquals(geom1, rdd.rawSpatialRDD.take(1).get(0)) + val dfConverted = StructuredAdapter.toDf(rdd, sparkSession) + intercept[RuntimeException] { + StructuredAdapter.toSpatialPartitionedDf(rdd, sparkSession) + } + assertEquals(seq.size, dfConverted.count()) + } + + it("Should convert DataFrame to SpatialRDD and back, without specifying geometry column") { + val seq = generateTestData() + val geom1 = seq.head._3 + val dfOrigin = sparkSession.createDataFrame(seq) + val rdd = StructuredAdapter.toSpatialRdd(dfOrigin) + assertGeometryEquals(geom1, rdd.rawSpatialRDD.take(1).get(0)) + val dfConverted = StructuredAdapter.toDf(rdd, sparkSession) + intercept[RuntimeException] { + StructuredAdapter.toSpatialPartitionedDf(rdd, sparkSession) + } + assertEquals(seq.size, dfConverted.count()) + } + + it("Should convert to Rdd and do spatial partitioning") { + val seq = generateTestData() + val dfOrigin = sparkSession.createDataFrame(seq) + val rdd = 
StructuredAdapter.toSpatialRdd(dfOrigin, "_3") + rdd.analyze() + rdd.spatialPartitioning(GridType.KDBTREE, 10) + val dfConverted = StructuredAdapter.toSpatialPartitionedDf(rdd, sparkSession) + assertEquals(seq.size, dfConverted.count()) + } + + it("Should convert a spatial join result back to DataFrame") { + val pointRdd = + StructuredAdapter.toSpatialRdd(sparkSession.createDataFrame(generateTestData())) + val circleRDD = new CircleRDD(pointRdd, 0.0001) + circleRDD.analyze() + pointRdd.analyze() + circleRDD.spatialPartitioning(GridType.KDBTREE) + pointRdd.spatialPartitioning(circleRDD.getPartitioner) + circleRDD.buildIndex(IndexType.QUADTREE, true) + val pairRdd = + JoinQuery.DistanceJoinQueryFlat(pointRdd, circleRDD, true, SpatialPredicate.INTERSECTS) + var resultDf = + StructuredAdapter.toDf(pairRdd, pointRdd.schema, pointRdd.schema, sparkSession) + assertEquals(pointRdd.rawSpatialRDD.count(), resultDf.count()) + resultDf = + StructuredAdapter.toDf(pairRdd, pointRdd.schema.json, pointRdd.schema.json, sparkSession) + assertEquals(pointRdd.rawSpatialRDD.count(), resultDf.count()) + } + + it("Should convert a SpatialRdd to RowRdd and back") { + val seq = generateTestData() + val dfOrigin = sparkSession.createDataFrame(seq) + val spatialRdd = StructuredAdapter.toSpatialRdd(dfOrigin.rdd) + val rowRdd = StructuredAdapter.toRowRdd(spatialRdd) + assertEquals(seq.size, StructuredAdapter.toSpatialRdd(rowRdd).rawSpatialRDD.count()) + } + + it("Should not be able to convert an empty Row RDD to SpatialRDD if schema is not provided") { + val rdd = sparkSession.sparkContext.parallelize(Seq.empty[Row]) + intercept[IllegalArgumentException] { + StructuredAdapter.toSpatialRdd(rdd) + } + } + + it("Should convert an empty Row RDD to SpatialRDD if schema is provided") { + val rdd = sparkSession.sparkContext.parallelize(Seq.empty[Row]) + val spatialRdd = StructuredAdapter.toSpatialRdd(rdd, null) + assertEquals(0, spatialRdd.rawSpatialRDD.count()) + assertEquals(0, spatialRdd.schema.size) + } + + it("can convert spatial RDD to Dataframe preserving spatial partitioning") { + var pointCsvDF = sparkSession.read + .format("csv") + .option("delimiter", ",") + .option("header", "false") + .load(csvPointInputLocation) + pointCsvDF.createOrReplaceTempView("pointtable") + var pointDf = sparkSession.sql( + "select ST_Point(cast(pointtable._c0 as Decimal(24,20)), cast(pointtable._c1 as Decimal(24,20))) as arealandmark from pointtable") + var srcRdd = StructuredAdapter.toSpatialRdd(pointDf, "arealandmark") + srcRdd.analyze() + srcRdd.spatialPartitioning(GridType.KDBTREE, 16) + var numSpatialPartitions = srcRdd.spatialPartitionedRDD.getNumPartitions + assert(numSpatialPartitions >= 16) + + var partitionedDF = StructuredAdapter.toSpatialPartitionedDf(srcRdd, sparkSession) + val dfPartitions: Long = partitionedDF.select(spark_partition_id).distinct().count() + assert(dfPartitions == numSpatialPartitions) + } + } +} diff --git a/spark/common/src/test/scala/org/apache/spark/sql/sedona_sql/io/stac/StacBatchTest.scala b/spark/common/src/test/scala/org/apache/spark/sql/sedona_sql/io/stac/StacBatchTest.scala new file mode 100644 index 0000000000..0765b3950f --- /dev/null +++ b/spark/common/src/test/scala/org/apache/spark/sql/sedona_sql/io/stac/StacBatchTest.scala @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.spark.sql.sedona_sql.io.stac + +import org.apache.sedona.sql.TestBaseScala +import org.apache.spark.sql.connector.read.InputPartition +import org.apache.spark.sql.types.StructType + +import scala.io.Source +import scala.collection.mutable + +class StacBatchTest extends TestBaseScala { + + def loadJsonFromResource(resourceFilePath: String): String = { + Source.fromResource(resourceFilePath).getLines().mkString("\n") + } + + def getAbsolutePathOfResource(resourceFilePath: String): String = { + val resourceUrl = getClass.getClassLoader.getResource(resourceFilePath) + if (resourceUrl != null) { + resourceUrl.getPath + } else { + throw new IllegalArgumentException(s"Resource not found: $resourceFilePath") + } + } + + it("planInputPartitions should create correct number of partitions") { + val stacCollectionJson = + """ + |{ + | "stac_version": "1.0.0", + | "id": "sample-collection", + | "description": "A sample STAC collection", + | "links": [ + | {"rel": "item", "href": "https://path/to/item1.json"}, + | {"rel": "item", "href": "https://path/to/item2.json"}, + | {"rel": "item", "href": "https://path/to/item3.json"} + | ] + |} + """.stripMargin + + val opts = mutable.Map("numPartitions" -> "2").toMap + val collectionUrl = "https://path/to/collection.json" + + val stacBatch = + StacBatch(collectionUrl, stacCollectionJson, StructType(Seq()), opts, None, None) + val partitions: Array[InputPartition] = stacBatch.planInputPartitions() + + assert(partitions.length == 2) + assert(partitions(0).asInstanceOf[StacPartition].items.length == 2) + assert(partitions(1).asInstanceOf[StacPartition].items.length == 1) + } + + it("planInputPartitions should handle empty links array") { + val stacCollectionJson = + """ + |{ + | "links": [] + |} + """.stripMargin + + val opts = mutable.Map("numPartitions" -> "2").toMap + val collectionUrl = "https://path/to/collection.json" + + val stacBatch = + StacBatch(collectionUrl, stacCollectionJson, StructType(Seq()), opts, None, None) + val partitions: Array[InputPartition] = stacBatch.planInputPartitions() + + assert(partitions.isEmpty) + } + + it("planInputPartitions should create correct number of partitions with real collection.json") { + val rootJsonFile = "datasource_stac/collection.json" + val stacCollectionJson = loadJsonFromResource(rootJsonFile) + val opts = mutable.Map("numPartitions" -> "3").toMap + val collectionUrl = getAbsolutePathOfResource(rootJsonFile) + + val stacBatch = + StacBatch(collectionUrl, stacCollectionJson, StructType(Seq()), opts, None, None) + val partitions: Array[InputPartition] = stacBatch.planInputPartitions() + + assert(partitions.length == 3) + assert(partitions(0).asInstanceOf[StacPartition].items.length == 2) + assert(partitions(1).asInstanceOf[StacPartition].items.length == 2) + assert(partitions(2).asInstanceOf[StacPartition].items.length == 1) + } +} diff --git 
a/spark/common/src/test/scala/org/apache/spark/sql/sedona_sql/io/stac/StacDataSourceTest.scala b/spark/common/src/test/scala/org/apache/spark/sql/sedona_sql/io/stac/StacDataSourceTest.scala new file mode 100644 index 0000000000..a1234ffa11 --- /dev/null +++ b/spark/common/src/test/scala/org/apache/spark/sql/sedona_sql/io/stac/StacDataSourceTest.scala @@ -0,0 +1,209 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.spark.sql.sedona_sql.io.stac + +import org.apache.sedona.sql.TestBaseScala +import org.apache.spark.sql.sedona_sql.UDT.{GeometryUDT, RasterUDT} +import org.apache.spark.sql.types.{ArrayType, DoubleType, MapType, StringType, StructField, StructType, TimestampType} +import org.scalatest.BeforeAndAfterAll + +import java.util.TimeZone + +class StacDataSourceTest extends TestBaseScala { + + val STAC_COLLECTION_LOCAL: String = resourceFolder + "datasource_stac/collection.json" + val STAC_ITEM_LOCAL: String = resourceFolder + "geojson/core-item.json" + + val STAC_COLLECTION_REMOTE: List[String] = List( + "https://earth-search.aws.element84.com/v1/collections/sentinel-2-pre-c1-l2a", + "https://storage.googleapis.com/cfo-public/vegetation/collection.json", + "https://storage.googleapis.com/cfo-public/wildfire/collection.json", + "https://earthdatahub.destine.eu/api/stac/v1/collections/copernicus-dem", + "https://planetarycomputer.microsoft.com/api/stac/v1/collections/naip") + + it("basic df load from local file should work") { + val dfStac = sparkSession.read.format("stac").load(STAC_COLLECTION_LOCAL) + val rowCount = dfStac.count() + assert(rowCount > 0) + } + + it("basic df load from remote service endpoints should work") { + STAC_COLLECTION_REMOTE.foreach { endpoint => + val dfStac = sparkSession.read.format("stac").load(endpoint) + assertSchema(dfStac.schema) + } + } + + it("normal select SQL without any filter") { + val dfStac = sparkSession.read.format("stac").load(STAC_COLLECTION_LOCAL) + dfStac.createOrReplaceTempView("STACTBL") + + val dfSelect = + sparkSession.sql("SELECT id, datetime as dt, geometry, bbox FROM STACTBL") + + assert(dfSelect.schema.fieldNames.contains("id")) + assert(dfSelect.schema.fieldNames.contains("dt")) + assert(dfSelect.schema.fieldNames.contains("geometry")) + assert(dfSelect.schema.fieldNames.contains("bbox")) + + val rowCount = dfSelect.count() + assert(rowCount == 6) + } + + it("select SQL with filter on datetime") { + val dfStac = sparkSession.read.format("stac").load(STAC_COLLECTION_LOCAL) + dfStac.createOrReplaceTempView("STACTBL") + + val dfSelect = sparkSession.sql( + "SELECT id, datetime as dt, geometry, bbox " + + "FROM STACTBL " + + "WHERE datetime BETWEEN '2020-01-01T00:00:00Z' AND '2020-12-13T00:00:00Z'") + + val physicalPlan = 
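+ // Render the executed plan to text so the assertion below can verify that the BETWEEN predicate + // was pushed down to the STAC scan as a temporal filter rather than evaluated after the scan. +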
dfSelect.queryExecution.executedPlan.toString() + assert(physicalPlan.contains( + "PushedTemporalFilters -> AndFilter(GreaterThanFilter(datetime,2020-01-01T00:00),LessThanFilter(datetime,2020-12-13T00:00))")) + + val rowCount = dfSelect.count() + assert(rowCount == 4) + } + + it("select SQL with spatial filter") { + val dfStac = sparkSession.read.format("stac").load(STAC_COLLECTION_LOCAL) + dfStac.createOrReplaceTempView("STACTBL") + + val dfSelect = sparkSession.sql( + "SELECT id, geometry " + + "FROM STACTBL " + + "WHERE st_contains(ST_GeomFromText('POLYGON((17 10, 18 10, 18 11, 17 11, 17 10))'), geometry)") + + val physicalPlan = dfSelect.queryExecution.executedPlan.toString() + assert(physicalPlan.contains( + "PushedSpatialFilters -> LeafFilter(geometry,INTERSECTS,POLYGON ((17 10, 18 10, 18 11, 17 11, 17 10)))")) + + val rowCount = dfSelect.count() + assert(rowCount == 3) + } + + it("select SQL with both spatial and temporal filters") { + val dfStac = sparkSession.read.format("stac").load(STAC_COLLECTION_LOCAL) + dfStac.createOrReplaceTempView("STACTBL") + + val dfSelect = sparkSession.sql("SELECT id, datetime as dt, geometry, bbox " + + "FROM STACTBL " + + "WHERE datetime BETWEEN '2020-01-01T00:00:00Z' AND '2020-12-13T00:00:00Z' " + + "AND st_contains(ST_GeomFromText('POLYGON((17 10, 18 10, 18 11, 17 11, 17 10))'), geometry)") + + val physicalPlan = dfSelect.queryExecution.executedPlan.toString() + assert(physicalPlan.contains( + "PushedSpatialFilters -> LeafFilter(geometry,INTERSECTS,POLYGON ((17 10, 18 10, 18 11, 17 11, 17 10)))")) + assert(physicalPlan.contains( + "PushedTemporalFilters -> AndFilter(GreaterThanFilter(datetime,2020-01-01T00:00),LessThanFilter(datetime,2020-12-13T00:00))")) + + val rowCount = dfSelect.count() + assert(rowCount == 3) + } + + it("select SQL with regular filter on id") { + val dfStac = sparkSession.read.format("stac").load(STAC_COLLECTION_LOCAL) + dfStac.createOrReplaceTempView("STACTBL") + + val dfSelect = sparkSession.sql( + "SELECT id, datetime as dt, geometry, bbox " + + "FROM STACTBL " + + "WHERE id = 'some-id'") + + val physicalPlan = dfSelect.queryExecution.executedPlan.toString() + assert(physicalPlan.contains("PushedSpatialFilters -> None, PushedTemporalFilters -> None")) + + val rowCount = dfSelect.count() + assert(rowCount == 0) + } + + it("select SQL with regular, spatial, and temporal filters") { + val dfStac = sparkSession.read.format("stac").load(STAC_COLLECTION_LOCAL) + dfStac.createOrReplaceTempView("STACTBL") + + val dfSelect = sparkSession.sql("SELECT id, datetime as dt, geometry, bbox " + + "FROM STACTBL " + + "WHERE id = 'some-id' " + + "AND datetime BETWEEN '2020-01-01T00:00:00Z' AND '2020-12-13T00:00:00Z' " + + "AND st_contains(ST_GeomFromText('POLYGON((17 10, 18 10, 18 11, 17 11, 17 10))'), geometry)") + + val physicalPlan = dfSelect.queryExecution.executedPlan.toString() + assert(physicalPlan.contains( + "PushedSpatialFilters -> LeafFilter(geometry,INTERSECTS,POLYGON ((17 10, 18 10, 18 11, 17 11, 17 10)))")) + assert(physicalPlan.contains( + "PushedTemporalFilters -> AndFilter(GreaterThanFilter(datetime,2020-01-01T00:00),LessThanFilter(datetime,2020-12-13T00:00))")) + + val rowCount = dfSelect.count() + assert(rowCount == 0) + } + + def assertSchema(actualSchema: StructType): Unit = { + val expectedSchema = StructType( + Seq( + StructField("stac_version", StringType, nullable = false), + StructField( + "stac_extensions", + ArrayType(StringType, containsNull = true), + nullable = true), + StructField("type", StringType, nullable 
= false), + StructField("id", StringType, nullable = false), + StructField("bbox", ArrayType(DoubleType, containsNull = true), nullable = true), + StructField("geometry", new GeometryUDT(), nullable = true), + StructField("title", StringType, nullable = true), + StructField("description", StringType, nullable = true), + StructField("datetime", TimestampType, nullable = true), + StructField("start_datetime", TimestampType, nullable = true), + StructField("end_datetime", TimestampType, nullable = true), + StructField("created", TimestampType, nullable = true), + StructField("updated", TimestampType, nullable = true), + StructField("platform", StringType, nullable = true), + StructField("instruments", ArrayType(StringType, containsNull = true), nullable = true), + StructField("constellation", StringType, nullable = true), + StructField("mission", StringType, nullable = true), + StructField("gsd", DoubleType, nullable = true), + StructField("collection", StringType, nullable = true), + StructField( + "links", + ArrayType( + StructType(Seq( + StructField("rel", StringType, nullable = true), + StructField("href", StringType, nullable = true), + StructField("type", StringType, nullable = true), + StructField("title", StringType, nullable = true))), + containsNull = true), + nullable = true), + StructField( + "assets", + MapType( + StringType, + StructType(Seq( + StructField("href", StringType, nullable = true), + StructField("type", StringType, nullable = true), + StructField("title", StringType, nullable = true), + StructField("roles", ArrayType(StringType, containsNull = true), nullable = true))), + valueContainsNull = true), + nullable = true))) + + assert( + actualSchema == expectedSchema, + s"Schema does not match. Expected: $expectedSchema, Actual: $actualSchema") + } +} diff --git a/spark/common/src/test/scala/org/apache/spark/sql/sedona_sql/io/stac/StacPartitionReaderTest.scala b/spark/common/src/test/scala/org/apache/spark/sql/sedona_sql/io/stac/StacPartitionReaderTest.scala new file mode 100644 index 0000000000..fc6f8dcfdc --- /dev/null +++ b/spark/common/src/test/scala/org/apache/spark/sql/sedona_sql/io/stac/StacPartitionReaderTest.scala @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.spark.sql.sedona_sql.io.stac + +import org.apache.sedona.sql.TestBaseScala +import org.apache.spark.sql.catalyst.InternalRow + +import scala.jdk.CollectionConverters._ + +class StacPartitionReaderTest extends TestBaseScala { + + val TEST_DATA_FOLDER: String = + System.getProperty("user.dir") + "/src/test/resources/datasource_stac" + val JSON_STAC_ITEM_SIMPLE: String = s"file://$TEST_DATA_FOLDER/simple-item.json" + val JSON_STAC_ITEM_CORE: String = s"file://$TEST_DATA_FOLDER/core-item.json" + val JSON_STAC_ITEM_EXTENDED: String = s"file://$TEST_DATA_FOLDER/extended-item.json" + val JSON_STAC_ITEM_FEATURES: String = s"file://$TEST_DATA_FOLDER/collection-items.json" + val HTTPS_STAC_ITEM_FEATURES: String = + "https://earth-search.aws.element84.com/v1/collections/sentinel-2-pre-c1-l2a/items" + + it("StacPartitionReader should read feature files from local files") { + val jsonFiles = + Seq(JSON_STAC_ITEM_SIMPLE, JSON_STAC_ITEM_CORE, JSON_STAC_ITEM_EXTENDED).toArray + val partition = StacPartition(0, jsonFiles, Map.empty[String, String].asJava) + val reader = + new StacPartitionReader( + partition, + StacTable.SCHEMA_V1_1_0, + Map.empty[String, String], + None, + None) + + assert(reader.next()) + (1 to 3).foreach { i => + val row: InternalRow = reader.get() + assert(row != null) + assert(reader.next() == (i < 3)) + } + + reader.close() + } + + it("StacPartitionReader should read features collection file from local files") { + val jsonFiles = Seq(JSON_STAC_ITEM_FEATURES).toArray + val partition = StacPartition(0, jsonFiles, Map.empty[String, String].asJava) + val reader = + new StacPartitionReader( + partition, + StacTable.SCHEMA_V1_1_0, + Map.empty[String, String], + None, + None) + + assert(reader.next()) + (1 to 10).foreach { i => + val row: InternalRow = reader.get() + assert(row != null) + assert(reader.next() == (i < 10)) + } + + reader.close() + } + + it("StacPartitionReader should read features collection file from https endpoint") { + val jsonFiles = Seq(HTTPS_STAC_ITEM_FEATURES).toArray + val partition = StacPartition(0, jsonFiles, Map.empty[String, String].asJava) + val reader = + new StacPartitionReader( + partition, + StacTable.SCHEMA_V1_1_0, + Map.empty[String, String], + None, + None) + + assert(reader.next()) + (1 to 10).foreach { i => + val row: InternalRow = reader.get() + assert(row != null) + assert(reader.next() == (i < 10)) + } + + reader.close() + } +} diff --git a/spark/common/src/test/scala/org/apache/spark/sql/sedona_sql/io/stac/StacTableTest.scala b/spark/common/src/test/scala/org/apache/spark/sql/sedona_sql/io/stac/StacTableTest.scala new file mode 100644 index 0000000000..eca7768733 --- /dev/null +++ b/spark/common/src/test/scala/org/apache/spark/sql/sedona_sql/io/stac/StacTableTest.scala @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.spark.sql.sedona_sql.io.stac + +import org.apache.spark.sql.sedona_sql.io.stac.StacTable.{SCHEMA_GEOPARQUET, addAssetStruct, addAssetsStruct} +import org.apache.spark.sql.types.{ArrayType, MapType, StringType, StructField, StructType} +import org.scalatest.funsuite.AnyFunSuite + +class StacTableTest extends AnyFunSuite { + + test("addAssetStruct should add a new asset to an existing assets struct") { + val initialSchema = StructType( + Seq( + StructField("id", StringType, nullable = false), + StructField( + "assets", + StructType(Seq(StructField( + "image", + StructType(Seq( + StructField("href", StringType, nullable = true), + StructField("roles", ArrayType(StringType), nullable = true), + StructField("title", StringType, nullable = true), + StructField("type", StringType, nullable = true))), + nullable = true))), + nullable = true))) + + val updatedSchema = addAssetStruct(initialSchema, "thumbnail") + + assert(updatedSchema.fieldNames.contains("assets")) + val assetsField = updatedSchema("assets").dataType.asInstanceOf[StructType] + assert(assetsField.fieldNames.contains("thumbnail")) + } + + test("addAssetStruct should create assets struct if it doesn't exist") { + val initialSchema = StructType(Seq(StructField("id", StringType, nullable = false))) + + val updatedSchema1 = addAssetStruct(initialSchema, "image") + val updatedSchema2 = addAssetStruct(updatedSchema1, "rast") + + assert(updatedSchema2.fieldNames.contains("assets")) + val assetsField = updatedSchema2("assets").dataType.asInstanceOf[StructType] + assert(assetsField.fieldNames.contains("image")) + assert(assetsField.fieldNames.contains("rast")) + } + + test("addAssetsStruct should not modify other fields") { + val initialSchema = SCHEMA_GEOPARQUET + val updatedSchema = addAssetsStruct(initialSchema, Array("thumbnail", "preview")) + + assert(updatedSchema.fieldNames.contains("id")) + assert(updatedSchema.fieldNames.contains("stac_version")) + assert(updatedSchema.fieldNames.contains("stac_extensions")) + assert(updatedSchema.fieldNames.contains("bbox")) + assert(updatedSchema.fieldNames.contains("geometry")) + assert(updatedSchema.fieldNames.contains("assets")) + } +} diff --git a/spark/common/src/test/scala/org/apache/spark/sql/sedona_sql/io/stac/StacUtilsTest.scala b/spark/common/src/test/scala/org/apache/spark/sql/sedona_sql/io/stac/StacUtilsTest.scala new file mode 100644 index 0000000000..75542c760d --- /dev/null +++ b/spark/common/src/test/scala/org/apache/spark/sql/sedona_sql/io/stac/StacUtilsTest.scala @@ -0,0 +1,594 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.spark.sql.sedona_sql.io.stac + +import com.fasterxml.jackson.databind.{JsonNode, ObjectMapper} +import com.fasterxml.jackson.module.scala.DefaultScalaModule +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.sedona_sql.io.stac.StacUtils.getNumPartitions +import org.scalatest.funsuite.AnyFunSuite + +import java.io.{File, PrintWriter} +import scala.io.Source +import scala.jdk.CollectionConverters._ + +class StacUtilsTest extends AnyFunSuite { + + test("getStacCollectionBasePath should return base URL for HTTP URL") { + val opts = Map("path" -> "http://service_url/collections/collection.json") + val result = StacUtils.getStacCollectionBasePath(opts) + assert(result == "http://service_url/") + } + + test("getStacCollectionBasePath should return base URL for HTTPS URL") { + val opts = Map("path" -> "https://service_url/collections/collection.json") + val result = StacUtils.getStacCollectionBasePath(opts) + assert(result == "https://service_url/") + } + + test("getStacCollectionBasePath should return base path for file URL") { + val opts = Map("path" -> "file:///usr/opt/collection.json") + val result = StacUtils.getStacCollectionBasePath(opts) + assert(result == "file:///usr/opt/") + } + + test("getStacCollectionBasePath should return base path for local file path") { + val opts = Map("path" -> "/usr/opt/collection.json") + val result = StacUtils.getStacCollectionBasePath(opts) + assert(result == "file:///usr/opt/") + } + + test( + "getStacCollectionBasePath should throw IllegalArgumentException if no path is provided") { + val opts = Map.empty[String, String] + assertThrows[IllegalArgumentException] { + StacUtils.getStacCollectionBasePath(opts) + } + } + + test( + "getStacCollectionBasePath should throw IllegalArgumentException for invalid URL or file path") { + val opts = Map("path" -> "invalid_path") + assertThrows[IllegalArgumentException] { + StacUtils.getStacCollectionBasePath(opts) + } + } + + test("getNumPartitions should return numPartitions if it is greater than 0") { + assert( + getNumPartitions( + itemCount = 100, + numPartitions = 5, + maxPartitionItemFiles = 10, + defaultParallelism = 4) == 5) + } + + test( + "getNumPartitions should calculate partitions based on maxPartitionItemFiles and defaultParallelism") { + assert( + getNumPartitions( + itemCount = 100, + numPartitions = 0, + maxPartitionItemFiles = 10, + defaultParallelism = 4) == 10) + } + + test( + "getNumPartitions should handle case when maxPartitionItemFiles is less than itemCount / defaultParallelism") { + assert( + getNumPartitions( + itemCount = 100, + numPartitions = 0, + maxPartitionItemFiles = 5, + defaultParallelism = 4) == 20) + } + + test("getNumPartitions should handle case when maxPartitionItemFiles is 0") { + assert( + getNumPartitions( + itemCount = 100, + numPartitions = 0, + maxPartitionItemFiles = 0, + defaultParallelism = 4) == 4) + } + + test("getNumPartitions should handle case when defaultParallelism is 1") { + assert( + getNumPartitions( + itemCount = 100, + numPartitions = 0, + maxPartitionItemFiles = 10, + defaultParallelism = 1) == 10) + } + + test("getNumPartitions should return at least 1 partition") { + assert( + getNumPartitions( + itemCount = 0, + numPartitions = 0, + maxPartitionItemFiles = 10, + defaultParallelism = 4) == 1) + } + + test( + "processStacCollection should process STAC collection from JSON string and save features to output file") { + val spark = 
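+ // End-to-end flow: write a small collection plus two item files under /tmp, load the collection + // JSON via StacUtils.loadStacCollectionToJson, then flatten the linked items into newline-delimited + // JSON using the saveStacCollection helper defined at the bottom of this suite. +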
SparkSession.builder().master("local").appName("StacUtilsTest").getOrCreate() + val hadoopConf = spark.sparkContext.hadoopConfiguration + val fs = FileSystem.get(hadoopConf) + + // Create a temporary STAC collection JSON file + val stacCollectionJson = + """ + |{ + | "stac_version": "1.0.0", + | "id": "sample-collection", + | "description": "A sample STAC collection", + | "links": [ + | {"rel": "item", "href": "file:///tmp/item1.json"}, + | {"rel": "item", "href": "file:///tmp/item2.json"} + | ] + |} + """.stripMargin + val stacCollectionPath = new Path("/tmp/collection.json") + val stacCollectionWriter = new PrintWriter(new File(stacCollectionPath.toString)) + stacCollectionWriter.write(stacCollectionJson) + stacCollectionWriter.close() + + // Create temporary item JSON files + val item1Json = + """ + |{ + | "stac_version": "1.1.0", + | "stac_extensions": [], + | "type": "Feature", + | "id": "20201211_223832_CS2_item1", + | "bbox": [ + | 172.91173669923782, + | 1.3438851951615003, + | 172.95469614953714, + | 1.3690476620161975 + | ], + | "geometry": { + | "type": "Polygon", + | "coordinates": [ + | [ + | [ + | 172.91173669923782, + | 1.3438851951615003 + | ], + | [ + | 172.95469614953714, + | 1.3438851951615003 + | ], + | [ + | 172.95469614953714, + | 1.3690476620161975 + | ], + | [ + | 172.91173669923782, + | 1.3690476620161975 + | ], + | [ + | 172.91173669923782, + | 1.3438851951615003 + | ] + | ] + | ] + | }, + | "properties": { + | "title": "Item 1", + | "description": "A sample STAC Item 1 that includes examples of all common metadata", + | "datetime": null, + | "start_datetime": "2020-12-11T22:38:32.125Z", + | "end_datetime": "2020-12-11T22:38:32.327Z", + | "created": "2020-12-12T01:48:13.725Z", + | "updated": "2020-12-12T01:48:13.725Z", + | "platform": "cool_sat1", + | "instruments": [ + | "cool_sensor_v1" + | ], + | "constellation": "ion", + | "mission": "collection 5624", + | "gsd": 0.512 + | }, + | "collection": "simple-collection", + | "links": [ + | { + | "rel": "collection", + | "href": "./collection.json", + | "type": "application/json", + | "title": "Simple Example Collection" + | }, + | { + | "rel": "root", + | "href": "./collection.json", + | "type": "application/json", + | "title": "Simple Example Collection" + | }, + | { + | "rel": "parent", + | "href": "./collection.json", + | "type": "application/json", + | "title": "Simple Example Collection" + | }, + | { + | "rel": "alternate", + | "type": "text/html", + | "href": "https://remotedata.io/catalog/20201211_223832_CS2_item1/index.html", + | "title": "HTML version of this STAC Item" + | } + | ], + | "assets": { + | "analytic": { + | "href": "https://storage.googleapis.com/open-cogs/stac-examples/20201211_223832_CS2_item1_analytic.tif", + | "type": "image/tiff; application=geotiff; profile=cloud-optimized", + | "title": "4-Band Analytic", + | "roles": [ + | "data" + | ] + | }, + | "thumbnail": { + | "href": "https://storage.googleapis.com/open-cogs/stac-examples/20201211_223832_CS2_item1.jpg", + | "title": "Thumbnail", + | "type": "image/png", + | "roles": [ + | "thumbnail" + | ] + | }, + | "visual": { + | "href": "https://storage.googleapis.com/open-cogs/stac-examples/20201211_223832_CS2_item1.tif", + | "type": "image/tiff; application=geotiff; profile=cloud-optimized", + | "title": "3-Band Visual", + | "roles": [ + | "visual" + | ] + | }, + | "udm": { + | "href": "https://storage.googleapis.com/open-cogs/stac-examples/20201211_223832_CS2_item1_analytic_udm.tif", + | "title": "Unusable Data Mask", + | "type": 
"image/tiff; application=geotiff" + | }, + | "json-metadata": { + | "href": "https://remotedata.io/catalog/20201211_223832_CS2_item1/extended-metadata.json", + | "title": "Extended Metadata", + | "type": "application/json", + | "roles": [ + | "metadata" + | ] + | }, + | "ephemeris": { + | "href": "https://cool-sat.com/catalog/20201211_223832_CS2_item1/20201211_223832_CS2_item1.EPH", + | "title": "Satellite Ephemeris Metadata" + | } + | } + |} + """.stripMargin + val item1Path = new Path("/tmp/item1.json") + val item1Writer = new PrintWriter(new File(item1Path.toString)) + item1Writer.write(item1Json) + item1Writer.close() + + val item2Json = + """ + |{ + | "stac_version": "1.1.0", + | "stac_extensions": [], + | "type": "Feature", + | "id": "20201211_223832_CS2_item2", + | "bbox": [ + | 173.91173669923782, + | 2.3438851951615003, + | 173.95469614953714, + | 2.3690476620161975 + | ], + | "geometry": { + | "type": "Polygon", + | "coordinates": [ + | [ + | [ + | 173.91173669923782, + | 2.3438851951615003 + | ], + | [ + | 173.95469614953714, + | 2.3438851951615003 + | ], + | [ + | 173.95469614953714, + | 2.3690476620161975 + | ], + | [ + | 173.91173669923782, + | 2.3690476620161975 + | ], + | [ + | 173.91173669923782, + | 2.3438851951615003 + | ] + | ] + | ] + | }, + | "properties": { + | "title": "Item 2", + | "description": "A different sample STAC Item 2 that includes examples of all common metadata", + | "datetime": null, + | "start_datetime": "2020-12-12T22:38:32.125Z", + | "end_datetime": "2020-12-12T22:38:32.327Z", + | "created": "2020-12-13T01:48:13.725Z", + | "updated": "2020-12-13T01:48:13.725Z", + | "platform": "cool_sat2", + | "instruments": [ + | "cool_sensor_v2" + | ], + | "constellation": "ion", + | "mission": "collection 5625", + | "gsd": 0.512 + | }, + | "collection": "simple-collection", + | "links": [ + | { + | "rel": "collection", + | "href": "./collection.json", + | "type": "application/json", + | "title": "Simple Example Collection" + | }, + | { + | "rel": "root", + | "href": "./collection.json", + | "type": "application/json", + | "title": "Simple Example Collection" + | }, + | { + | "rel": "parent", + | "href": "./collection.json", + | "type": "application/json", + | "title": "Simple Example Collection" + | }, + | { + | "rel": "alternate", + | "type": "text/html", + | "href": "https://remotedata.io/catalog/20201211_223832_CS2_item2/index.html", + | "title": "HTML version of this STAC Item" + | } + | ], + | "assets": { + | "analytic": { + | "href": "https://storage.googleapis.com/open-cogs/stac-examples/20201211_223832_CS2_item2_analytic.tif", + | "type": "image/tiff; application=geotiff; profile=cloud-optimized", + | "title": "4-Band Analytic", + | "roles": [ + | "data" + | ] + | }, + | "thumbnail": { + | "href": "https://storage.googleapis.com/open-cogs/stac-examples/20201211_223832_CS2_item2.jpg", + | "title": "Thumbnail", + | "type": "image/png", + | "roles": [ + | "thumbnail" + | ] + | }, + | "visual": { + | "href": "https://storage.googleapis.com/open-cogs/stac-examples/20201211_223832_CS2_item2.tif", + | "type": "image/tiff; application=geotiff; profile=cloud-optimized", + | "title": "3-Band Visual", + | "roles": [ + | "visual" + | ] + | }, + | "udm": { + | "href": "https://storage.googleapis.com/open-cogs/stac-examples/20201211_223832_CS2_item2_analytic_udm.tif", + | "title": "Unusable Data Mask", + | "type": "image/tiff; application=geotiff" + | }, + | "json-metadata": { + | "href": "https://remotedata.io/catalog/20201211_223832_CS2_item2/extended-metadata.json", 
+ | "title": "Extended Metadata", + | "type": "application/json", + | "roles": [ + | "metadata" + | ] + | }, + | "ephemeris": { + | "href": "https://cool-sat.com/catalog/20201211_223832_CS2_item2/20201211_223832_CS2_item2.EPH", + | "title": "Satellite Ephemeris Metadata" + | } + | } + |} + """.stripMargin + val item2Path = new Path("/tmp/item2.json") + val item2Writer = new PrintWriter(new File(item2Path.toString)) + item2Writer.write(item2Json) + item2Writer.close() + + // Load the STAC collection JSON + val opts = Map("path" -> "file:///tmp/collection.json") + val stacCollectionJsonString = StacUtils.loadStacCollectionToJson(opts) + val outputPath = "/tmp/output.json" + + // Call the function to process the STAC collection + saveStacCollection(stacCollectionJsonString, outputPath) + + // Verify the output file + val outputFile = new File(outputPath) + assert(outputFile.exists()) + + val outputContent = Source.fromFile(outputFile).getLines().mkString("\n") + assert(outputContent.contains("item1")) + assert(outputContent.contains("item2")) + + // Clean up temporary files + fs.delete(stacCollectionPath, false) + fs.delete(item1Path, false) + fs.delete(item2Path, false) + outputFile.delete() + } + + test( + "processStacCollection should process STAC collection with mixed 'item' and 'items' rels and save features to output file") { + val spark = SparkSession.builder().master("local").appName("StacUtilsTest").getOrCreate() + val hadoopConf = spark.sparkContext.hadoopConfiguration + val fs = FileSystem.get(hadoopConf) + + // Create a temporary STAC collection JSON file + val stacCollectionJson = + """ + |{ + | "stac_version": "1.0.0", + | "id": "sample-collection", + | "description": "A sample STAC collection", + | "links": [ + | {"rel": "item", "href": "file:///tmp/item1.json"}, + | {"rel": "items", "href": "file:///tmp/items.json"} + | ] + |} + """.stripMargin + val stacCollectionPath = new Path("/tmp/collection.json") + val stacCollectionWriter = new PrintWriter(new File(stacCollectionPath.toString)) + stacCollectionWriter.write(stacCollectionJson) + stacCollectionWriter.close() + + // Create temporary item JSON files + val item1Json = + """ + |{ + | "type": "Feature", + | "id": "item1", + | "geometry": { + | "type": "Point", + | "coordinates": [100.0, 0.0] + | }, + | "properties": { + | "title": "Item 1" + | } + |} + """.stripMargin + val item1Path = new Path("/tmp/item1.json") + val item1Writer = new PrintWriter(new File(item1Path.toString)) + item1Writer.write(item1Json) + item1Writer.close() + + val itemsJson = + """ + |{ + | "type": "FeatureCollection", + | "features": [ + | { + | "type": "Feature", + | "id": "item2", + | "geometry": { + | "type": "Point", + | "coordinates": [101.0, 1.0] + | }, + | "properties": { + | "title": "Item 2" + | } + | }, + | { + | "type": "Feature", + | "id": "item3", + | "geometry": { + | "type": "Point", + | "coordinates": [102.0, 2.0] + | }, + | "properties": { + | "title": "Item 3" + | } + | } + | ] + |} + """.stripMargin + val itemsPath = new Path("/tmp/items.json") + val itemsWriter = new PrintWriter(new File(itemsPath.toString)) + itemsWriter.write(itemsJson) + itemsWriter.close() + + // Load the STAC collection JSON + val opts = Map("path" -> "file:///tmp/collection.json") + val stacCollectionJsonString = StacUtils.loadStacCollectionToJson(opts) + val outputPath = "/tmp/output.json" + + // Call the function to process the STAC collection + saveStacCollection(stacCollectionJsonString, outputPath) + + // Verify the output file + val outputFile = new 
File(outputPath) + assert(outputFile.exists()) + + val outputContent = Source.fromFile(outputFile).getLines().mkString("\n") + assert(outputContent.contains("item1")) + assert(outputContent.contains("item2")) + assert(outputContent.contains("item3")) + + // Clean up temporary files + fs.delete(stacCollectionPath, false) + fs.delete(item1Path, false) + fs.delete(itemsPath, false) + outputFile.delete() + } + + // Function to process STAC collection + def saveStacCollection(stacCollectionJson: String, outputPath: String): Unit = { + // Create the ObjectMapper + val mapper = new ObjectMapper() + mapper.registerModule(DefaultScalaModule) + + // Parse the STAC collection JSON + val collection: JsonNode = mapper.readTree(stacCollectionJson) + + // Extract item and items links + val itemLinks = collection.get("links").elements().asScala.filter { link => + val rel = link.get("rel").asText() + rel == "item" || rel == "items" + } + + // Open a writer for the output multiline JSON file + val writer = new PrintWriter(new File(outputPath)) + + try { + // Iterate over each item link + itemLinks.foreach { link => + val itemUrl = link.get("href").asText() + + // Fetch the item JSON + val itemJson = Source.fromURL(itemUrl).mkString + + // Parse the item JSON + val itemCollection: JsonNode = mapper.readTree(itemJson) + + // Check if the link is of type "items" + if (link.get("rel").asText() == "items") { + // Iterate over each feature in the item collection + val features = itemCollection.get("features").elements().asScala + features.foreach { feature => + // Write each feature JSON as a single line in the output file + writer.println(mapper.writeValueAsString(feature)) + } + } else { + // Write the item JSON as a single line in the output file + writer.println(mapper.writeValueAsString(itemCollection)) + } + } + } finally { + // Close the writer + writer.close() + } + } +} diff --git a/spark/spark-3.3/src/main/scala/org/apache/sedona/sql/parser/SedonaSqlParser.scala b/spark/spark-3.3/src/main/scala/org/apache/sedona/sql/parser/SedonaSqlParser.scala index 6c70419122..56c27ba76b 100644 --- a/spark/spark-3.3/src/main/scala/org/apache/sedona/sql/parser/SedonaSqlParser.scala +++ b/spark/spark-3.3/src/main/scala/org/apache/sedona/sql/parser/SedonaSqlParser.scala @@ -35,12 +35,8 @@ class SedonaSqlParser(delegate: ParserInterface) extends SparkSqlParser { override def parsePlan(sqlText: String): LogicalPlan = try { parse(sqlText) { parser => - parserBuilder.visit(parser.singleStatement()) match { - case plan: LogicalPlan => plan - case _ => - delegate.parsePlan(sqlText) - } - } + parserBuilder.visit(parser.singleStatement()) + }.asInstanceOf[LogicalPlan] } catch { case _: Exception => delegate.parsePlan(sqlText) diff --git a/spark/spark-3.3/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala b/spark/spark-3.3/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala index 1fe2faa2e0..a101206fbf 100644 --- a/spark/spark-3.3/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala +++ b/spark/spark-3.3/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala @@ -20,6 +20,8 @@ package org.apache.spark.sql.execution.datasources.v2.geoparquet.metadata import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path +import 
org.apache.parquet.ParquetReadOptions +import org.apache.parquet.format.converter.ParquetMetadataConverter import org.apache.parquet.hadoop.ParquetFileReader import org.apache.parquet.hadoop.util.HadoopInputFile import org.apache.spark.broadcast.Broadcast @@ -67,11 +69,11 @@ object GeoParquetMetadataPartitionReaderFactory { partitionedFile: PartitionedFile, readDataSchema: StructType): Iterator[InternalRow] = { val filePath = partitionedFile.filePath - val metadata = ParquetFileReader - .open(HadoopInputFile.fromPath(new Path(filePath), configuration)) - .getFooter - .getFileMetaData - .getKeyValueMetaData + + val footer = ParquetFileReader + .readFooter(configuration, new Path(filePath), ParquetMetadataConverter.NO_FILTER) + + val metadata = footer.getFileMetaData.getKeyValueMetaData val row = GeoParquetMetaData.parseKeyValueMetaData(metadata) match { case Some(geo) => val geoColumnsMap = geo.columns.map { case (columnName, columnMetadata) => diff --git a/spark/spark-3.3/src/test/scala/org/apache/sedona/sql/SQLSyntaxTestScala.scala b/spark/spark-3.3/src/test/scala/org/apache/sedona/sql/SQLSyntaxTestScala.scala index 72680aacd4..6f873d0a08 100644 --- a/spark/spark-3.3/src/test/scala/org/apache/sedona/sql/SQLSyntaxTestScala.scala +++ b/spark/spark-3.3/src/test/scala/org/apache/sedona/sql/SQLSyntaxTestScala.scala @@ -44,14 +44,29 @@ class SQLSyntaxTestScala extends TestBaseScala with TableDrivenPropertyChecks { it( "should be able to create a regular table with geometry column should work without a workaround") { - sparkSession.sql("CREATE TABLE T_TEST_EXPLICIT_GEOMETRY (GEO_COL GEOMETRY)") - sparkSession.catalog.tableExists("T_TEST_EXPLICIT_GEOMETRY") should be(true) + try { + sparkSession.sql("CREATE TABLE T_TEST_EXPLICIT_GEOMETRY (GEO_COL GEOMETRY)") + sparkSession.catalog.tableExists("T_TEST_EXPLICIT_GEOMETRY") should be(true) + sparkSession.sparkContext.getConf.get(keyParserExtension) should be("true") + } catch { + case ex: Exception => + ex.getClass.getName.endsWith("ParseException") should be(true) + sparkSession.sparkContext.getConf.get(keyParserExtension) should be("false") + } } it( "should be able to create a regular table with regular and geometry column should work without a workaround") { - sparkSession.sql("CREATE TABLE T_TEST_EXPLICIT_GEOMETRY_2 (INT_COL INT, GEO_COL GEOMETRY)") - sparkSession.catalog.tableExists("T_TEST_EXPLICIT_GEOMETRY_2") should be(true) + try { + sparkSession.sql( + "CREATE TABLE T_TEST_EXPLICIT_GEOMETRY_2 (INT_COL INT, GEO_COL GEOMETRY)") + sparkSession.catalog.tableExists("T_TEST_EXPLICIT_GEOMETRY_2") should be(true) + sparkSession.sparkContext.getConf.get(keyParserExtension) should be("true") + } catch { + case ex: Exception => + ex.getClass.getName.endsWith("ParseException") should be(true) + sparkSession.sparkContext.getConf.get(keyParserExtension) should be("false") + } } } } diff --git a/spark/spark-3.3/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala b/spark/spark-3.3/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala index f629648b29..8d13f6138d 100644 --- a/spark/spark-3.3/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala +++ b/spark/spark-3.3/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala @@ -23,6 +23,8 @@ import org.apache.sedona.spark.SedonaContext import org.apache.spark.sql.DataFrame import org.scalatest.{BeforeAndAfterAll, FunSpec} +import java.util.concurrent.ThreadLocalRandom + trait TestBaseScala extends FunSpec with BeforeAndAfterAll { Logger.getRootLogger().setLevel(Level.WARN) 
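+ // The SedonaSqlExtensions registration and the randomized spark.sedona.enableParserExtension + // flag configured below make each run exercise one of the two parser paths; SQLSyntaxTestScala + // asserts success when the flag is "true" and a ParseException when it is "false". +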
Logger.getLogger("org.apache").setLevel(Level.WARN) @@ -30,6 +32,7 @@ trait TestBaseScala extends FunSpec with BeforeAndAfterAll { Logger.getLogger("akka").setLevel(Level.WARN) Logger.getLogger("org.apache.sedona.core").setLevel(Level.WARN) + val keyParserExtension = "spark.sedona.enableParserExtension" val warehouseLocation = System.getProperty("user.dir") + "/target/" val sparkSession = SedonaContext .builder() @@ -38,6 +41,8 @@ trait TestBaseScala extends FunSpec with BeforeAndAfterAll { .config("spark.sql.warehouse.dir", warehouseLocation) .config("sedona.join.autoBroadcastJoinThreshold", "-1") .config("spark.sql.session.timeZone", "UTC") + .config("spark.sql.extensions", "org.apache.sedona.sql.SedonaSqlExtensions") + .config(keyParserExtension, ThreadLocalRandom.current().nextBoolean()) .getOrCreate() val sparkSessionMinio = SedonaContext diff --git a/spark/spark-3.4/src/main/scala/org/apache/sedona/sql/parser/SedonaSqlParser.scala b/spark/spark-3.4/src/main/scala/org/apache/sedona/sql/parser/SedonaSqlParser.scala index 6c70419122..56c27ba76b 100644 --- a/spark/spark-3.4/src/main/scala/org/apache/sedona/sql/parser/SedonaSqlParser.scala +++ b/spark/spark-3.4/src/main/scala/org/apache/sedona/sql/parser/SedonaSqlParser.scala @@ -35,12 +35,8 @@ class SedonaSqlParser(delegate: ParserInterface) extends SparkSqlParser { override def parsePlan(sqlText: String): LogicalPlan = try { parse(sqlText) { parser => - parserBuilder.visit(parser.singleStatement()) match { - case plan: LogicalPlan => plan - case _ => - delegate.parsePlan(sqlText) - } - } + parserBuilder.visit(parser.singleStatement()) + }.asInstanceOf[LogicalPlan] } catch { case _: Exception => delegate.parsePlan(sqlText) diff --git a/spark/spark-3.4/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala b/spark/spark-3.4/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala index 2a5e70624c..e4ca35992b 100644 --- a/spark/spark-3.4/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala +++ b/spark/spark-3.4/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.datasources.v2.geoparquet.metadata import org.apache.hadoop.conf.Configuration +import org.apache.parquet.ParquetReadOptions import org.apache.parquet.hadoop.ParquetFileReader import org.apache.parquet.hadoop.util.HadoopInputFile import org.apache.spark.broadcast.Broadcast @@ -66,12 +67,14 @@ object GeoParquetMetadataPartitionReaderFactory { configuration: Configuration, partitionedFile: PartitionedFile, readDataSchema: StructType): Iterator[InternalRow] = { + val inputFile = HadoopInputFile.fromPath(partitionedFile.toPath, configuration) + val inputStream = inputFile.newStream() + + val footer = ParquetFileReader + .readFooter(inputFile, ParquetReadOptions.builder().build(), inputStream) + val filePath = partitionedFile.toPath.toString - val metadata = ParquetFileReader - .open(HadoopInputFile.fromPath(partitionedFile.toPath, configuration)) - .getFooter - .getFileMetaData - .getKeyValueMetaData + val metadata = footer.getFileMetaData.getKeyValueMetaData val row = GeoParquetMetaData.parseKeyValueMetaData(metadata) match { case Some(geo) => val geoColumnsMap = geo.columns.map { case (columnName, columnMetadata) => diff --git 
a/spark/spark-3.4/src/test/scala/org/apache/sedona/sql/SQLSyntaxTestScala.scala b/spark/spark-3.4/src/test/scala/org/apache/sedona/sql/SQLSyntaxTestScala.scala index 72680aacd4..6f873d0a08 100644 --- a/spark/spark-3.4/src/test/scala/org/apache/sedona/sql/SQLSyntaxTestScala.scala +++ b/spark/spark-3.4/src/test/scala/org/apache/sedona/sql/SQLSyntaxTestScala.scala @@ -44,14 +44,29 @@ class SQLSyntaxTestScala extends TestBaseScala with TableDrivenPropertyChecks { it( "should be able to create a regular table with geometry column should work without a workaround") { - sparkSession.sql("CREATE TABLE T_TEST_EXPLICIT_GEOMETRY (GEO_COL GEOMETRY)") - sparkSession.catalog.tableExists("T_TEST_EXPLICIT_GEOMETRY") should be(true) + try { + sparkSession.sql("CREATE TABLE T_TEST_EXPLICIT_GEOMETRY (GEO_COL GEOMETRY)") + sparkSession.catalog.tableExists("T_TEST_EXPLICIT_GEOMETRY") should be(true) + sparkSession.sparkContext.getConf.get(keyParserExtension) should be("true") + } catch { + case ex: Exception => + ex.getClass.getName.endsWith("ParseException") should be(true) + sparkSession.sparkContext.getConf.get(keyParserExtension) should be("false") + } } it( "should be able to create a regular table with regular and geometry column should work without a workaround") { - sparkSession.sql("CREATE TABLE T_TEST_EXPLICIT_GEOMETRY_2 (INT_COL INT, GEO_COL GEOMETRY)") - sparkSession.catalog.tableExists("T_TEST_EXPLICIT_GEOMETRY_2") should be(true) + try { + sparkSession.sql( + "CREATE TABLE T_TEST_EXPLICIT_GEOMETRY_2 (INT_COL INT, GEO_COL GEOMETRY)") + sparkSession.catalog.tableExists("T_TEST_EXPLICIT_GEOMETRY_2") should be(true) + sparkSession.sparkContext.getConf.get(keyParserExtension) should be("true") + } catch { + case ex: Exception => + ex.getClass.getName.endsWith("ParseException") should be(true) + sparkSession.sparkContext.getConf.get(keyParserExtension) should be("false") + } } } } diff --git a/spark/spark-3.4/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala b/spark/spark-3.4/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala index 34746d0b28..ae1ed5d091 100644 --- a/spark/spark-3.4/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala +++ b/spark/spark-3.4/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala @@ -23,6 +23,8 @@ import org.apache.sedona.spark.SedonaContext import org.apache.spark.sql.DataFrame import org.scalatest.{BeforeAndAfterAll, FunSpec} +import java.util.concurrent.ThreadLocalRandom + trait TestBaseScala extends FunSpec with BeforeAndAfterAll { Logger.getRootLogger().setLevel(Level.WARN) Logger.getLogger("org.apache").setLevel(Level.WARN) @@ -30,6 +32,7 @@ trait TestBaseScala extends FunSpec with BeforeAndAfterAll { Logger.getLogger("akka").setLevel(Level.WARN) Logger.getLogger("org.apache.sedona.core").setLevel(Level.WARN) + val keyParserExtension = "spark.sedona.enableParserExtension" val warehouseLocation = System.getProperty("user.dir") + "/target/" val sparkSession = SedonaContext .builder() @@ -38,6 +41,8 @@ trait TestBaseScala extends FunSpec with BeforeAndAfterAll { .config("spark.sql.warehouse.dir", warehouseLocation) // We need to be explicit about broadcasting in tests. 
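+ // A threshold of -1 disables automatic broadcast joins, so each test must opt into + // broadcasting explicitly. +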
.config("sedona.join.autoBroadcastJoinThreshold", "-1") + .config("spark.sql.extensions", "org.apache.sedona.sql.SedonaSqlExtensions") + .config(keyParserExtension, ThreadLocalRandom.current().nextBoolean()) .getOrCreate() val sparkSessionMinio = SedonaContext diff --git a/spark/spark-3.5/src/main/scala/org/apache/sedona/sql/datasources/geopackage/GeoPackageScanBuilder.scala b/spark/spark-3.5/src/main/scala/org/apache/sedona/sql/datasources/geopackage/GeoPackageScanBuilder.scala index a9674395b4..829bd9c220 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/sedona/sql/datasources/geopackage/GeoPackageScanBuilder.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/sedona/sql/datasources/geopackage/GeoPackageScanBuilder.scala @@ -37,13 +37,11 @@ class GeoPackageScanBuilder( extends FileScanBuilder(sparkSession, fileIndex, dataSchema) { override def build(): Scan = { - val paths = fileIndex.allFiles().map(_.getPath.toString) - val fileIndexAdjusted = if (loadOptions.showMetadata) new InMemoryFileIndex( sparkSession, - paths.slice(0, 1).map(new org.apache.hadoop.fs.Path(_)), + fileIndex.inputFiles.slice(0, 1).map(new org.apache.hadoop.fs.Path(_)), options.asCaseSensitiveMap.asScala.toMap, userDefinedSchema) else fileIndex diff --git a/spark/spark-3.5/src/main/scala/org/apache/sedona/sql/parser/SedonaSqlParser.scala b/spark/spark-3.5/src/main/scala/org/apache/sedona/sql/parser/SedonaSqlParser.scala index 6c70419122..56c27ba76b 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/sedona/sql/parser/SedonaSqlParser.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/sedona/sql/parser/SedonaSqlParser.scala @@ -35,12 +35,8 @@ class SedonaSqlParser(delegate: ParserInterface) extends SparkSqlParser { override def parsePlan(sqlText: String): LogicalPlan = try { parse(sqlText) { parser => - parserBuilder.visit(parser.singleStatement()) match { - case plan: LogicalPlan => plan - case _ => - delegate.parsePlan(sqlText) - } - } + parserBuilder.visit(parser.singleStatement()) + }.asInstanceOf[LogicalPlan] } catch { case _: Exception => delegate.parsePlan(sqlText) diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala index 2a5e70624c..e1234e79d8 100644 --- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.datasources.v2.geoparquet.metadata import org.apache.hadoop.conf.Configuration +import org.apache.parquet.ParquetReadOptions import org.apache.parquet.hadoop.ParquetFileReader import org.apache.parquet.hadoop.util.HadoopInputFile import org.apache.spark.broadcast.Broadcast @@ -66,12 +67,15 @@ object GeoParquetMetadataPartitionReaderFactory { configuration: Configuration, partitionedFile: PartitionedFile, readDataSchema: StructType): Iterator[InternalRow] = { + + val inputFile = HadoopInputFile.fromPath(partitionedFile.toPath, configuration) + val inputStream = inputFile.newStream() + + val footer = ParquetFileReader + .readFooter(inputFile, ParquetReadOptions.builder().build(), inputStream) + val filePath = partitionedFile.toPath.toString - val metadata = 
ParquetFileReader - .open(HadoopInputFile.fromPath(partitionedFile.toPath, configuration)) - .getFooter - .getFileMetaData - .getKeyValueMetaData + val metadata = footer.getFileMetaData.getKeyValueMetaData val row = GeoParquetMetaData.parseKeyValueMetaData(metadata) match { case Some(geo) => val geoColumnsMap = geo.columns.map { case (columnName, columnMetadata) => diff --git a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/SQLSyntaxTestScala.scala b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/SQLSyntaxTestScala.scala index 72680aacd4..6f873d0a08 100644 --- a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/SQLSyntaxTestScala.scala +++ b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/SQLSyntaxTestScala.scala @@ -44,14 +44,29 @@ class SQLSyntaxTestScala extends TestBaseScala with TableDrivenPropertyChecks { it( "should be able to create a regular table with geometry column should work without a workaround") { - sparkSession.sql("CREATE TABLE T_TEST_EXPLICIT_GEOMETRY (GEO_COL GEOMETRY)") - sparkSession.catalog.tableExists("T_TEST_EXPLICIT_GEOMETRY") should be(true) + try { + sparkSession.sql("CREATE TABLE T_TEST_EXPLICIT_GEOMETRY (GEO_COL GEOMETRY)") + sparkSession.catalog.tableExists("T_TEST_EXPLICIT_GEOMETRY") should be(true) + sparkSession.sparkContext.getConf.get(keyParserExtension) should be("true") + } catch { + case ex: Exception => + ex.getClass.getName.endsWith("ParseException") should be(true) + sparkSession.sparkContext.getConf.get(keyParserExtension) should be("false") + } } it( "should be able to create a regular table with regular and geometry column should work without a workaround") { - sparkSession.sql("CREATE TABLE T_TEST_EXPLICIT_GEOMETRY_2 (INT_COL INT, GEO_COL GEOMETRY)") - sparkSession.catalog.tableExists("T_TEST_EXPLICIT_GEOMETRY_2") should be(true) + try { + sparkSession.sql( + "CREATE TABLE T_TEST_EXPLICIT_GEOMETRY_2 (INT_COL INT, GEO_COL GEOMETRY)") + sparkSession.catalog.tableExists("T_TEST_EXPLICIT_GEOMETRY_2") should be(true) + sparkSession.sparkContext.getConf.get(keyParserExtension) should be("true") + } catch { + case ex: Exception => + ex.getClass.getName.endsWith("ParseException") should be(true) + sparkSession.sparkContext.getConf.get(keyParserExtension) should be("false") + } } } } diff --git a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala index 34746d0b28..ae1ed5d091 100644 --- a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala +++ b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/TestBaseScala.scala @@ -23,6 +23,8 @@ import org.apache.sedona.spark.SedonaContext import org.apache.spark.sql.DataFrame import org.scalatest.{BeforeAndAfterAll, FunSpec} +import java.util.concurrent.ThreadLocalRandom + trait TestBaseScala extends FunSpec with BeforeAndAfterAll { Logger.getRootLogger().setLevel(Level.WARN) Logger.getLogger("org.apache").setLevel(Level.WARN) @@ -30,6 +32,7 @@ trait TestBaseScala extends FunSpec with BeforeAndAfterAll { Logger.getLogger("akka").setLevel(Level.WARN) Logger.getLogger("org.apache.sedona.core").setLevel(Level.WARN) + val keyParserExtension = "spark.sedona.enableParserExtension" val warehouseLocation = System.getProperty("user.dir") + "/target/" val sparkSession = SedonaContext .builder() @@ -38,6 +41,8 @@ trait TestBaseScala extends FunSpec with BeforeAndAfterAll { .config("spark.sql.warehouse.dir", warehouseLocation) // We need to be explicit about broadcasting in tests. 
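+ // Same setup as the spark-3.3 and spark-3.4 test bases: -1 turns off automatic broadcast joins, + // and the lines added below mirror the extension registration and randomized parser flag. +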
.config("sedona.join.autoBroadcastJoinThreshold", "-1") + .config("spark.sql.extensions", "org.apache.sedona.sql.SedonaSqlExtensions") + .config(keyParserExtension, ThreadLocalRandom.current().nextBoolean()) .getOrCreate() val sparkSessionMinio = SedonaContext