Source code for niche_vlaanderen.validation

import logging
import warnings
from collections import defaultdict
from pathlib import Path

with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    warnings.filterwarnings("ignore", category=UserWarning)
    import geopandas as gpd

import numpy as np
import pandas as pd

from niche_vlaanderen.niche import Niche
from niche_vlaanderen.codetables import package_resource

logger = logging.getLogger(__name__)


class NicheValidationException(Exception):
    msg = "Error using Niche Overlay"



[docs]
class NicheValidation(object):
    """Creates a new NicheValidation object

    Overlays a vegetation map with niche in order to
    obtain the accuracy of the model.

    Parameters:
        niche: niche_vlaanderen.Niche
            Niche object containing predicted variables types according to niche.
            The model must have run prior to the overlay
        map: Path
            Path to a file containing the vegetation map in one of the formats supported
            by fiona, eg shape.
            must contain these attributes:
            * HABx: vegetation type
            * pHABx: proportion of the vegetation type

            different vegetation types can be supplied, eg HAB1, HAB2, ...
            for every HABx variable a proportion must be given.

        mapping_file: Path | None
            optional file containing the mapping between habitat (HAB) code on the vegetation map
            and Niche vegetation types. The default mapping is a mapping of the
            BWK map (biologische waarderingskaart), and is
            part of the package at niche_vlaanderen/system_tables/hab_nich_join.csv

            If a mapping is provided by the user, it must contain a HAB and a NICHE column.
            Other columns are ignored.

        upscale: int
            optional upscale value that is used to increase the resolution of
            the niche rasters while doing the overlay.
            instead of using only the cell center, the comparison between the
            vegetation map and the niche model is done at upscale*upscale places
            in the cell. Defaults to 5.
            More details in
            https://inbo.github.io/niche_vlaanderen/validation.html

        id: str
            (optional) field to use as id for the provided map file. This id will
            be used in the overlay. If no id is supplied, the shape_index (row number)
            of the vector file will be used, starting from 0.


    """

    def __init__(self, niche, map, mapping_file=None, upscale=5, id=None):
        if type(niche) is Niche:
            self.niche = niche
        else:
            raise ValueError(
                f"pass a valid Niche object - type of niche object is {type(niche)}"
            )

        self.filename_map = map
        self.map = gpd.read_file(map)

        # Prevent wrong mapping of HAB strings to floats
        self.map[[col for col in self.map.columns if col.startswith("HAB")]] = self.map[
            [col for col in self.map.columns if col.startswith("HAB")]].astype(str)

        # the mapping columns contain are the field in the shapefile that contain a field starting with HAB{1-9}
        # (case insensitive)
        columns = self.map.columns
        vegetation_columns = columns[columns.str.upper().str.match("HAB[0-9]")]
        proportion_index = {
            veg: list(columns.str.upper()).index(f"PHAB{veg}")
            for veg in vegetation_columns.str[-1]
        }
        self.proportion_columns = {
            i: columns[proportion_index[i]] for i in proportion_index
        }

        if mapping_file is None:
            mapping_file = package_resource(["system_tables"],
                                            "hab_niche_join.csv")

        mapping = pd.read_csv(mapping_file)

        # drop any row containing niche vegetation 0
        mapping = mapping[mapping["NICHE"] != 0]

        # convert to wide format
        mapping["col"] = mapping.groupby("HAB").cumcount()
        mapping["col"] = "NICHE_C" + (mapping["col"] + 1).astype("str")

        self.mapping = mapping.pivot(
            index="HAB", columns="col", values="NICHE"
        ).reset_index()

        niche_columns = self.map.columns[self.map.columns.str.startswith("NICH_")]
        self.map = self.map.drop(columns=niche_columns)
        self.map["area_shape"] = self.map.area / 10000

        for hab_column in vegetation_columns:
            for nich_column in self.mapping.columns[
                self.mapping.columns.str.startswith("NICHE_C")
            ]:
                logger.debug(f"habi: {hab_column}; nichj: {nich_column}")
                source = self.mapping[["HAB", nich_column]].rename(
                    columns={
                        "HAB": hab_column,
                        nich_column: f"NICH_{hab_column[-1]}_{nich_column[-1]}",
                    }
                )

                self.map = pd.merge(self.map, source, on=hab_column, how="left")

        self._niche_columns = self.map.columns[self.map.columns.str.startswith("NICH")]

        self.id = id
        self.overlay(upscale=upscale)

    def __repr__(self):

        o = "# Niche Validation object\n"
        o += f"map: {self.filename_map}\n"
        o += f"niche object: {self.niche.name}"
        return o


[docs]
    def overlay(self, upscale=5):
        """Overlays the map and the niche object"""

        # Remove any existing "NICH" columns

        present_vegetation_types = np.unique(self.map[self._niche_columns])

        present_vegetation_types = present_vegetation_types[
            ~np.isnan(present_vegetation_types)
        ]
        logger.debug(f"present niche types: {present_vegetation_types}")

        if len(present_vegetation_types) == 0:
            raise NicheValidationException(
                "No Niche vegetation present in validation map"
            )
        # get potential presence
        # upscale niche rasters to that before calculating statistics
        self.potential_presence = self.niche.zonal_stats(
            self.map,
            outside=False,
            vegetation_types=present_vegetation_types,
            upscale=upscale,
        )

        self.potential_presence = self.potential_presence.pivot(
            columns=["vegetation"], index=["presence", "shape_id"]
        )

        self._tables = [
            "area_pot",
            "area_nonpot",
            "area_nonpot_phab",
            "area_pot_perc_phab",
            "area_effective",
            "area_pot_perc",
            "veg_present",
        ]

        self.area_pot = self.potential_presence.loc["no data"]["area_ha"] * np.nan
        self.area_nonpot = self.potential_presence.loc["no data"]["area_ha"] * np.nan
        self.area_effective = self.potential_presence.loc["no data"]["area_ha"] * np.nan
        self.area_pot_perc = self.potential_presence.loc["no data"]["area_ha"] * np.nan
        self.area_pot_perc_phab = self.area_pot_perc * np.nan
        self.area_nonpot_phab = self.area_pot_perc * np.nan
        self.veg_present = self.area_pot_perc * 0

        # Only if actual present: (pHAB * present) / (present + not present)
        for i, row in self.map.iterrows():
            logger.debug(f"row: {row.area_shape}")
            # different mappings can exist which lead to the same vegetation
            # type. First we aggregate them in shape_veg which contains
            # the niche vegetation and the sum of its pHAB
            shape_veg = defaultdict(lambda: 0)

            for veg in self._niche_columns:
                if np.isfinite(row[veg]) and row[veg] != 0:
                    pHab = row[self.proportion_columns[veg[5]]]
                    shape_veg[int(row[veg])] += pHab

            for veg in shape_veg:
                pHab = shape_veg[veg]

                area_pot = (
                    self.potential_presence.loc["present"].loc[i].loc["area_ha"][veg]
                )

                self.area_pot.loc[i, veg] = area_pot

                area_nonpot = (
                    self.potential_presence.loc["not present"]
                    .loc[i]
                    .loc["area_ha"][veg]
                )

                self.area_nonpot.loc[i, veg] = area_nonpot
                area_effective = pHab * (area_pot + area_nonpot) / 100
                self.area_effective.loc[i, veg] = area_effective

                if (area_pot + area_nonpot) == 0:
                    warnings.warn(
                        f"No overlap between potential vegetation map and "
                        f"shape_id {i}"
                    )
                else:
                    # vegetation type is present (actual presence)
                    # used in polygon count
                    self.veg_present.loc[i, veg] = 1

        # aggregate statistics
        self.area_pot_perc = 100 * self.area_pot / (self.area_pot + self.area_nonpot)
        self.area_pot_perc_phab = np.minimum(
            100 * self.area_pot / self.area_effective, 100
        )
        self.area_nonpot_phab = np.maximum(0, self.area_effective - self.area_pot)

        # Set id column for every polygon based table
        if self.id is not None:
            for table in self._tables:
                setattr(self, table, getattr(self, table).set_index(self.map[self.id]))

        # create summary table per vegetation type

        summary = {}
        summary["area_effective"] = self.area_effective.sum()
        summary["nonpot"] = self.area_nonpot.sum()
        summary["nonpot_phab"] = self.area_nonpot_phab.sum()
        summary["pot"] = self.area_pot.sum()
        summary["polygon_count"] = self.veg_present.sum()

        summary["score"] = 100 * summary["pot"] / (summary["pot"] + summary["nonpot"])
        summary["score_phab"] = (
            100 * summary["pot"] / (summary["pot"] + summary["nonpot_phab"])
        )

        self.summary = pd.DataFrame(summary)



[docs]
    def write(self, path, overwrite_files=False):
        """Write the result of the validation to a folder

        path: string | pathlib.Path
            Output folder to which files will be written. Parent directory must
            already exist.

        overwrite_files: bool
            Overwrite files when saving.
            Note writing will fail if any of the files to be written already
            exists.
        """

        path = Path(path)
        if path.exists() and not overwrite_files:
            if not path.is_dir():
                raise NicheValidationException(f"path {path} is not an empty folder")
            if any(path.iterdir()):
                raise NicheValidationException(f"path {path} exists and is not empty")
        path.mkdir(parents=True, exist_ok=True)

        # save individual tables

        for t in self._tables:
            getattr(self, t).dropna(axis=0, how="all").to_csv(path / (t + ".csv"))

        self.potential_presence.to_csv(path / "potential_presence.csv")
        self.summary.to_csv(path / "summary.csv")

        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=DeprecationWarning)
            warnings.filterwarnings("ignore", category=FutureWarning)
            self.joined_map().to_file(path / "validation.gpkg")



[docs]
    def joined_map(self):
        """Create a geopandas dataframe with all tables joined"""
        merged = self.map
        if self.id:
            merged = merged.set_index(self.id)
        for table in self._tables:
            merged = merged.join(getattr(self, table).add_prefix(f"{table}_"))

        return merged