Module autostreamtree.cli

Expand source code
import sys
import pandas as pd
import numpy as np
import geopandas as gpd
import pyogrio
import random
import matplotlib.pyplot as plt
import time

import autostreamtree.functions as ast
from autostreamtree.params import parseArgs
import autostreamtree.cluster_pops as clust
import autostreamtree.report_refs as ref
import autostreamtree.aggregators as agg


def main():

    params = parseArgs()

    # Set the seed for random number generation
    if params.seed is not None:
        np.random.seed(params.seed)
        random.seed(params.seed)
    else:
        # Use clock time to seed RNG if params.seed is None
        clock_seed = int(time.time() * 1000) % (2**32)
        np.random.seed(clock_seed)
        random.seed(clock_seed)

    # Step 1: Reading in datasets
    print("Running autostream tree", params.run, "workflow\n")
    G = ast.read_network(params.network, params.shapefile)
    # prune graph if list of edges passed
    if params.edge_list:
        df = pd.read_csv(params.edge_list, sep='\t')
        edge_list = df[params.reachid_col].tolist()
        G = ast.prune_graph(G, edge_list, params.reachid_col)

    # read point coordinate data
    points = pd.read_csv(params.geodb, sep="\t", header=0)
    (point_coords, pop_coords, popmap) = ast.process_samples(params, points, G)

    # read genetic data from VCF
    if params.run != "STREAMDIST":
        seqs = ast.read_vcf(params.vcf, concat=params.concat, popmap=popmap)

    # Step 2: Calculating genetic distance matrix(-ces)
    # Optional: Calculate separately for each locus

    gen = None
    pop_gen = None
    # calculate genetic distance matrix
    if params.run != "STREAMDIST" and params.run != "RUNLOCI":
        if not params.genmat:
            print("\nCalculating genetic distances...")
            (gen, pop_gen) = ast.get_gendist_mats(params, point_coords, popmap,
                                                  seqs)
        else:
            print("\nReading genetic distances from provided matrix:",
                  params.genmat)
            inmat = pd.read_csv(params.genmat, header=0, index_col=0, sep="\t")
            (gen, pop_gen) = ast.parse_input_genmat(params, inmat,
                                                    point_coords, popmap, seqs)

        ast.report_genmats(params, gen, pop_gen, point_coords, pop_coords)

        # exit if this was the only step requested
        if params.run == "GENDIST":
            sys.exit(0)

    # IMPORTANT NOTE: This function will silently skip any loci for which
    # calculations aren't possible (e.g., population having no data)
    if params.run == "RUNLOCI":
        genlist = list()
        popgenlist = list()
        for loc in ast.get_loc_data(seqs):
            try:
                (gen, pop_gen) = ast.get_gendist_mats(params, point_coords,
                                                      popmap, loc)
                genlist.append(gen)
                popgenlist.append(pop_gen)
            except Exception:
                pass
        print(
            "\nCalculating average of locus-wise genetic distance matrices..."
        )
        if genlist[0] is not None:
            gen = np.mean(genlist, axis=0)
        else:
            gen = genlist[0]
        if popgenlist[0] is not None:
            pop_gen = np.mean(popgenlist, axis=0)
        else:
            pop_gen = popgenlist[0]

        ast.report_genmats_list(params, genlist, popgenlist, point_coords,
                                pop_coords)

    # Step 4: Constructing a minimal subgraph

    # Extract subgraph if needed
    if params.run != "GENDIST":
        K = ast.parse_subgraph_from_points(params, point_coords, pop_coords, G)

    # Step 5: Construct stream distance matrix
    # Optional: Mantel test

    # calculate pairwise observed stream distances and indence matrix
    # calculate incidence matrix X, which takes the form:
    # nrows = rows in column vector form of D
    # ncols = number of collapsed branches in stream network K
    if params.run in ["STREAMDIST", "DISTANCES", "STREAMTREE", "IBD", "ALL",
                      "RUNLOCI"]:
        if params.pop or params.geopop or params.clusterpop:
            points = pop_coords
            gen = pop_gen
        else:
            points = point_coords

        # calculate stream distances and incidence matrix
        (sdist, inc) = ast.get_stream_mats(points, K, params.length_col)
        print("\nStream distances:")
        print(sdist)
        sDF = pd.DataFrame(sdist, columns=list(points.keys()),
                           index=list(points.keys()))
        sDF.to_csv((str(params.out) + ".streamDistMat.txt"), sep="\t",
                   index=True)
        del sDF
        if params.run == "STREAMDIST":
            sys.exit(0)

        # if user requested testing isolation-by-distance:
        if params.run in ['ALL', 'IBD']:
            # IBD calculations and plots
            print("\nTesting for isolation-by-distance using Mantel test with",
                  params.permutations, "permutations")
            ast.test_ibd(gen, sdist, params.out, params.permutations)

            # plot geographic x genetic DISTANCES
            ast.plot_gen_by_geo(gen, sdist, params.out)

            # if log request, re-do with logged geographic distances
            if params.and_log:
                print("\nTesting for isolation-by-distance with log",
                      "geographic distances using Mantel test with",
                      params.permutations, "permutations")
                out = params.out+".log"
                ast.test_ibd(gen, sdist, out, params.permutations, log=True)
                ast.plot_gen_by_geo(gen, sdist, out, log=True)

    #########################################################
    # Step 6: Least-squares fitting
    # Optional: Weighted least-squares
    # Optional: Fit for each locus
    #########################################################

    if params.run in ["STREAMTREE", "ALL", "RUNLOCI"]:
        if params.pop or params.geopop or params.clusterpop:
            gen = pop_gen
        print("\nIncidence matrix:")
        print(inc)
        ofh = params.out+".incidenceMatrix.txt"
        with np.printoptions(precision=0, suppress=True):
            np.savetxt(ofh, inc, delimiter="\t")
        print("Incidence matrix dimensions:")
        print(inc.shape)

        # fit least-squares branch lengths
        if params.run != "RUNLOCI":
            print()
            R = ast.fit_least_squares_distances(gen, inc.astype(int),
                                                params.iterative,
                                                params.out, params.weight)
            print("\nFitted least-squares distances:")
            print(R)
        else:
            print()
            if params.pop or params.geopop or params.clusterpop:
                genlist = popgenlist
            print("Fitting StreamTree distances on per-locus matrices...")
            Rlist = list()
            ast.block_print()
            for gen in genlist:
                r = ast.fit_least_squares_distances(gen, inc.astype(int),
                                                    params.iterative,
                                                    params.out, params.weight)
                Rlist.append(r)
            ast.enable_print()

            # aggregate locus distances (mean)
            print("\nCalculating average fitted distances across loci using:",
                  params.loc_agg)
            averageR = np.array(
                [agg.aggregate_dist(params.loc_agg, col) for
                 col in zip(*Rlist)])
            print(averageR)

            # get standard-deviation locus distances as well
            print("\nCalculating standard-deviation of fitted distances",
                  "across loci using:", params.loc_agg)
            sdR = np.array([agg.aggregate_dist("SD", col) for
                            col in zip(*Rlist)])
            print(sdR)

        # check observed versus fitted distances:
        if params.run == "RUNLOCI":
            R = averageR
        pred = ast.get_fitted_d(points, gen, inc, R)
        print("\nComparing observed versus predicted genetic distances:")
        print(pred)
        ast.output_fitted_d(pred, params.out)

        # Step 7: Outputs

        # Now, annotate originate geoDF with dissolved reach IDs
        # also, need to collect up the stream tree fitted D to each reach
        # finally, could add residuals of fitting D vs LENGTH_KM?
        # maybe include logDxlength, DxlogLength, logDxlogLength as well?

        # get list of all REACHIDs to extract from geoDF
        # edge_data = nx.get_edge_attributes(K,params.reachid_col)
        reach_to_edge = dict()
        i = 0
        edges = list()
        for e in K.edges():
            edge_data = K[e[0]][e[1]][params.reachid_col]
            for r in edge_data:
                reach_to_edge[r] = str(i)
            edges.append(i)
            i += 1
        del edge_data

        # save reach_to_edge table to file
        r2eDF = pd.DataFrame(list(reach_to_edge.items()),
                             columns=[params.reachid_col, 'EDGE_ID'])
        r2eDF.to_csv((str(params.out)+".reachToEdgeTable.txt"),
                     sep="\t", index=False)

        # read in original shapefile as geoDF and subset it
        print("\nExtracting attributes from original dataframe...")
        # geoDF = gpd.read_file(params.shapefile)
        geoDF = pyogrio.read_dataframe(params.shapefile)
        mask = geoDF[params.reachid_col].isin(list(reach_to_edge.keys()))
        temp = geoDF.loc[mask]
        del mask
        del reach_to_edge

        # join EDGE_ID to geoDF
        geoDF = temp.merge(r2eDF, on=params.reachid_col, how='left')
        for col in geoDF.columns:
            if col.endswith('_x'):
                geoDF.drop(col, axis=1, inplace=True)
            elif col.endswith('_y'):
                geoDF.rename(columns={col: col.rstrip('_y')}, inplace=True)
        print(geoDF)
        del temp
        del r2eDF

        # plot by edge ID
        base = geoDF.plot(column="EDGE_ID", cmap="prism")
        coords = clust.coords_to_dataframe(points)
        geo_coords = gpd.GeoDataFrame(coords,
                                      geometry=gpd.points_from_xy(coords.long,
                                                                  coords.lat))
        geo_coords.plot(ax=base, marker='o', color='black', markersize=10,
                        zorder=10)

        plt.title("Stream network colored by EDGE_ID")
        plt.savefig((str(params.out)+".streamsByEdgeID.pdf"))

        # add in fitted distances & plot
        print("\nPlotting StreamTree fitted distances and writing new",
              "shapefile...")

        if params.run == "RUNLOCI":
            fittedD = pd.DataFrame({'EDGE_ID': list(edges), 'fittedD': R})
            sdD = pd.DataFrame({'EDGE_ID': list(edges), 'stdevD': sdR})
            i = 1
            for locfit in Rlist:
                name = "locD_" + str(i)
                fittedD[name] = locfit
                i += 1
        else:
            fittedD = pd.DataFrame({'EDGE_ID': list(edges), 'fittedD': R})
        geoDF['EDGE_ID'] = geoDF['EDGE_ID'].astype(int)
        # geoDF.drop(columns=["EDGE_ID_x", "EDGE_ID_y"], inplace=True)
        geoDF = geoDF.merge(fittedD, on='EDGE_ID', how="left")
        for col in geoDF.columns:
            if col.endswith('_x'):
                geoDF.drop(col, axis=1, inplace=True)
            elif col.endswith('_y'):
                geoDF.rename(columns={col: col.rstrip('_y')}, inplace=True)
        if params.run == "RUNLOCI":
            geoDF = geoDF.merge(sdD, on='EDGE_ID')
            geoDF.plot(column="stdevD", cmap="RdYlGn_r", legend=True)
            plt.title("Stream network colored by standard deviation of",
                      "StreamTree fitted distances")
            plt.savefig((str(params.out)+".streamsBystdevD.pdf"))
        geoDF.plot(column="fittedD", cmap="RdYlGn_r", legend=True)
        plt.title("Stream network colored by StreamTree fitted distances")
        plt.savefig((str(params.out)+".streamsByFittedD.pdf"))

        # output a final annotated stream network layer
        geoDF.to_csv((str(params.out)+".streamTree.txt"), sep="\t",
                     index=False)
        ast.write_geodataframe(geoDF, (str(params.out)+".streamTree"),
                               params.output_driver)

    refs = ref.fetch_references(params)
    print(refs)

    print("\nDone!\n")


# Call main function
if __name__ == '__main__':
    main()

Functions

def main()
Expand source code
def main():

    params = parseArgs()

    # Set the seed for random number generation
    if params.seed is not None:
        np.random.seed(params.seed)
        random.seed(params.seed)
    else:
        # Use clock time to seed RNG if params.seed is None
        clock_seed = int(time.time() * 1000) % (2**32)
        np.random.seed(clock_seed)
        random.seed(clock_seed)

    # Step 1: Reading in datasets
    print("Running autostream tree", params.run, "workflow\n")
    G = ast.read_network(params.network, params.shapefile)
    # prune graph if list of edges passed
    if params.edge_list:
        df = pd.read_csv(params.edge_list, sep='\t')
        edge_list = df[params.reachid_col].tolist()
        G = ast.prune_graph(G, edge_list, params.reachid_col)

    # read point coordinate data
    points = pd.read_csv(params.geodb, sep="\t", header=0)
    (point_coords, pop_coords, popmap) = ast.process_samples(params, points, G)

    # read genetic data from VCF
    if params.run != "STREAMDIST":
        seqs = ast.read_vcf(params.vcf, concat=params.concat, popmap=popmap)

    # Step 2: Calculating genetic distance matrix(-ces)
    # Optional: Calculate separately for each locus

    gen = None
    pop_gen = None
    # calculate genetic distance matrix
    if params.run != "STREAMDIST" and params.run != "RUNLOCI":
        if not params.genmat:
            print("\nCalculating genetic distances...")
            (gen, pop_gen) = ast.get_gendist_mats(params, point_coords, popmap,
                                                  seqs)
        else:
            print("\nReading genetic distances from provided matrix:",
                  params.genmat)
            inmat = pd.read_csv(params.genmat, header=0, index_col=0, sep="\t")
            (gen, pop_gen) = ast.parse_input_genmat(params, inmat,
                                                    point_coords, popmap, seqs)

        ast.report_genmats(params, gen, pop_gen, point_coords, pop_coords)

        # exit if this was the only step requested
        if params.run == "GENDIST":
            sys.exit(0)

    # IMPORTANT NOTE: This function will silently skip any loci for which
    # calculations aren't possible (e.g., population having no data)
    if params.run == "RUNLOCI":
        genlist = list()
        popgenlist = list()
        for loc in ast.get_loc_data(seqs):
            try:
                (gen, pop_gen) = ast.get_gendist_mats(params, point_coords,
                                                      popmap, loc)
                genlist.append(gen)
                popgenlist.append(pop_gen)
            except Exception:
                pass
        print(
            "\nCalculating average of locus-wise genetic distance matrices..."
        )
        if genlist[0] is not None:
            gen = np.mean(genlist, axis=0)
        else:
            gen = genlist[0]
        if popgenlist[0] is not None:
            pop_gen = np.mean(popgenlist, axis=0)
        else:
            pop_gen = popgenlist[0]

        ast.report_genmats_list(params, genlist, popgenlist, point_coords,
                                pop_coords)

    # Step 4: Constructing a minimal subgraph

    # Extract subgraph if needed
    if params.run != "GENDIST":
        K = ast.parse_subgraph_from_points(params, point_coords, pop_coords, G)

    # Step 5: Construct stream distance matrix
    # Optional: Mantel test

    # calculate pairwise observed stream distances and indence matrix
    # calculate incidence matrix X, which takes the form:
    # nrows = rows in column vector form of D
    # ncols = number of collapsed branches in stream network K
    if params.run in ["STREAMDIST", "DISTANCES", "STREAMTREE", "IBD", "ALL",
                      "RUNLOCI"]:
        if params.pop or params.geopop or params.clusterpop:
            points = pop_coords
            gen = pop_gen
        else:
            points = point_coords

        # calculate stream distances and incidence matrix
        (sdist, inc) = ast.get_stream_mats(points, K, params.length_col)
        print("\nStream distances:")
        print(sdist)
        sDF = pd.DataFrame(sdist, columns=list(points.keys()),
                           index=list(points.keys()))
        sDF.to_csv((str(params.out) + ".streamDistMat.txt"), sep="\t",
                   index=True)
        del sDF
        if params.run == "STREAMDIST":
            sys.exit(0)

        # if user requested testing isolation-by-distance:
        if params.run in ['ALL', 'IBD']:
            # IBD calculations and plots
            print("\nTesting for isolation-by-distance using Mantel test with",
                  params.permutations, "permutations")
            ast.test_ibd(gen, sdist, params.out, params.permutations)

            # plot geographic x genetic DISTANCES
            ast.plot_gen_by_geo(gen, sdist, params.out)

            # if log request, re-do with logged geographic distances
            if params.and_log:
                print("\nTesting for isolation-by-distance with log",
                      "geographic distances using Mantel test with",
                      params.permutations, "permutations")
                out = params.out+".log"
                ast.test_ibd(gen, sdist, out, params.permutations, log=True)
                ast.plot_gen_by_geo(gen, sdist, out, log=True)

    #########################################################
    # Step 6: Least-squares fitting
    # Optional: Weighted least-squares
    # Optional: Fit for each locus
    #########################################################

    if params.run in ["STREAMTREE", "ALL", "RUNLOCI"]:
        if params.pop or params.geopop or params.clusterpop:
            gen = pop_gen
        print("\nIncidence matrix:")
        print(inc)
        ofh = params.out+".incidenceMatrix.txt"
        with np.printoptions(precision=0, suppress=True):
            np.savetxt(ofh, inc, delimiter="\t")
        print("Incidence matrix dimensions:")
        print(inc.shape)

        # fit least-squares branch lengths
        if params.run != "RUNLOCI":
            print()
            R = ast.fit_least_squares_distances(gen, inc.astype(int),
                                                params.iterative,
                                                params.out, params.weight)
            print("\nFitted least-squares distances:")
            print(R)
        else:
            print()
            if params.pop or params.geopop or params.clusterpop:
                genlist = popgenlist
            print("Fitting StreamTree distances on per-locus matrices...")
            Rlist = list()
            ast.block_print()
            for gen in genlist:
                r = ast.fit_least_squares_distances(gen, inc.astype(int),
                                                    params.iterative,
                                                    params.out, params.weight)
                Rlist.append(r)
            ast.enable_print()

            # aggregate locus distances (mean)
            print("\nCalculating average fitted distances across loci using:",
                  params.loc_agg)
            averageR = np.array(
                [agg.aggregate_dist(params.loc_agg, col) for
                 col in zip(*Rlist)])
            print(averageR)

            # get standard-deviation locus distances as well
            print("\nCalculating standard-deviation of fitted distances",
                  "across loci using:", params.loc_agg)
            sdR = np.array([agg.aggregate_dist("SD", col) for
                            col in zip(*Rlist)])
            print(sdR)

        # check observed versus fitted distances:
        if params.run == "RUNLOCI":
            R = averageR
        pred = ast.get_fitted_d(points, gen, inc, R)
        print("\nComparing observed versus predicted genetic distances:")
        print(pred)
        ast.output_fitted_d(pred, params.out)

        # Step 7: Outputs

        # Now, annotate originate geoDF with dissolved reach IDs
        # also, need to collect up the stream tree fitted D to each reach
        # finally, could add residuals of fitting D vs LENGTH_KM?
        # maybe include logDxlength, DxlogLength, logDxlogLength as well?

        # get list of all REACHIDs to extract from geoDF
        # edge_data = nx.get_edge_attributes(K,params.reachid_col)
        reach_to_edge = dict()
        i = 0
        edges = list()
        for e in K.edges():
            edge_data = K[e[0]][e[1]][params.reachid_col]
            for r in edge_data:
                reach_to_edge[r] = str(i)
            edges.append(i)
            i += 1
        del edge_data

        # save reach_to_edge table to file
        r2eDF = pd.DataFrame(list(reach_to_edge.items()),
                             columns=[params.reachid_col, 'EDGE_ID'])
        r2eDF.to_csv((str(params.out)+".reachToEdgeTable.txt"),
                     sep="\t", index=False)

        # read in original shapefile as geoDF and subset it
        print("\nExtracting attributes from original dataframe...")
        # geoDF = gpd.read_file(params.shapefile)
        geoDF = pyogrio.read_dataframe(params.shapefile)
        mask = geoDF[params.reachid_col].isin(list(reach_to_edge.keys()))
        temp = geoDF.loc[mask]
        del mask
        del reach_to_edge

        # join EDGE_ID to geoDF
        geoDF = temp.merge(r2eDF, on=params.reachid_col, how='left')
        for col in geoDF.columns:
            if col.endswith('_x'):
                geoDF.drop(col, axis=1, inplace=True)
            elif col.endswith('_y'):
                geoDF.rename(columns={col: col.rstrip('_y')}, inplace=True)
        print(geoDF)
        del temp
        del r2eDF

        # plot by edge ID
        base = geoDF.plot(column="EDGE_ID", cmap="prism")
        coords = clust.coords_to_dataframe(points)
        geo_coords = gpd.GeoDataFrame(coords,
                                      geometry=gpd.points_from_xy(coords.long,
                                                                  coords.lat))
        geo_coords.plot(ax=base, marker='o', color='black', markersize=10,
                        zorder=10)

        plt.title("Stream network colored by EDGE_ID")
        plt.savefig((str(params.out)+".streamsByEdgeID.pdf"))

        # add in fitted distances & plot
        print("\nPlotting StreamTree fitted distances and writing new",
              "shapefile...")

        if params.run == "RUNLOCI":
            fittedD = pd.DataFrame({'EDGE_ID': list(edges), 'fittedD': R})
            sdD = pd.DataFrame({'EDGE_ID': list(edges), 'stdevD': sdR})
            i = 1
            for locfit in Rlist:
                name = "locD_" + str(i)
                fittedD[name] = locfit
                i += 1
        else:
            fittedD = pd.DataFrame({'EDGE_ID': list(edges), 'fittedD': R})
        geoDF['EDGE_ID'] = geoDF['EDGE_ID'].astype(int)
        # geoDF.drop(columns=["EDGE_ID_x", "EDGE_ID_y"], inplace=True)
        geoDF = geoDF.merge(fittedD, on='EDGE_ID', how="left")
        for col in geoDF.columns:
            if col.endswith('_x'):
                geoDF.drop(col, axis=1, inplace=True)
            elif col.endswith('_y'):
                geoDF.rename(columns={col: col.rstrip('_y')}, inplace=True)
        if params.run == "RUNLOCI":
            geoDF = geoDF.merge(sdD, on='EDGE_ID')
            geoDF.plot(column="stdevD", cmap="RdYlGn_r", legend=True)
            plt.title("Stream network colored by standard deviation of",
                      "StreamTree fitted distances")
            plt.savefig((str(params.out)+".streamsBystdevD.pdf"))
        geoDF.plot(column="fittedD", cmap="RdYlGn_r", legend=True)
        plt.title("Stream network colored by StreamTree fitted distances")
        plt.savefig((str(params.out)+".streamsByFittedD.pdf"))

        # output a final annotated stream network layer
        geoDF.to_csv((str(params.out)+".streamTree.txt"), sep="\t",
                     index=False)
        ast.write_geodataframe(geoDF, (str(params.out)+".streamTree"),
                               params.output_driver)

    refs = ref.fetch_references(params)
    print(refs)

    print("\nDone!\n")