Justice Centric Code

Note

Needs to be documented better. This is just a copy and paste of the code. I will review it and document it accordingly as soon as possible. I will also make it executable by importing it into an ipynb at a later date.
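
In outline, supremeCourtPrediction.py loads the justice-level records from data/justice.csv, drops the columns listed in data/features/drop/drop.txt, builds TensorFlow feature columns (numeric columns plus pairwise crossed columns) from the features named in data/features/use/features.txt, and trains a small dense Keras network to predict the vote column.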

# supremeCourtPrediction.py
# A program that uses the Segal and Spaeth dataset to predict how justices vote in Supreme Court cases.


import pandas as pd

import tensorflow as tf

import math

import seaborn as sns
import matplotlib.pyplot as plt

import shap
shap.initjs()

import os 
import csv
import datetime

from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

from itertools import permutations
from itertools import combinations_with_replacement


def get_environmental_variables():
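    """Return the current working directory and a timestamped log directory (used for TensorBoard logging)."""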
    cwd = os.getcwd()
    log_dir = os.path.join(cwd, "logs", "fit", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
    return cwd, log_dir

def load_data(cwd):
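    """Read data/justice.csv, drop every column listed in data/features/drop/drop.txt, and fill NaNs with 0."""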
    data_path = os.path.join(cwd,'data','justice.csv') 
    drop_path = os.path.join(cwd,'data','features','drop','drop.txt')
    target_path = os.path.join(cwd,'data','features','targets.txt')
    print(data_path)
    dataframe = pd.read_csv(data_path,encoding= 'unicode_escape')
    #print(dataframe.head())
    
    with open(drop_path) as fp:
        for cnt, line in enumerate(fp):
            #print("Dropping {} from dataframe".format(line))
            line = line.strip('\n')
            line = line.strip('\t')
            line = line.strip("'")

            dataframe.drop(line,axis=1,inplace=True)
    
    dataframe.fillna(0, inplace=True)
    print(dataframe)
    #dataframe.issue = dataframe.astype({'issue': 'str'})
    #dataframe[issue] = dataframe[issue].astype(str)
    #print(dataframe)
    #print(dataframe.head())
    return dataframe

def split_dataframe(dataframe):
    """Split the dataframe into train (64%), validation (16%) and test (20%) subsets."""
    train, test = train_test_split(dataframe, test_size=0.2)
    train, val = train_test_split(train, test_size=0.2)
    print(len(train), 'train examples')
    print(len(val), 'validation examples')
    print(len(test), 'test examples')
    return train, test, val

def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    """A utility method to create a tf.data dataset from a Pandas DataFrame."""
    dataframe = dataframe.copy()
    labels = dataframe.pop('vote')
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    print(ds)
    return ds

def get_input_pipeline(train, test, val, batch_size=32, shuffle=True):
    """Wrap the train, validation and test dataframes as batched tf.data datasets."""
    train_ds = df_to_dataset(train, shuffle=shuffle, batch_size=batch_size)
    val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
    test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)
    return train_ds, val_ds, test_ds

def get_feature_layer(cwd):
    """Select the columns for analysis from the dataset and build a tf.keras DenseFeatures layer."""
    feature_path = os.path.join(cwd,'data','features','use','features.txt')
    dict_path = os.path.join(cwd,'data','features','use')
    feature_columns = []
    feature_list = []
    #embedded columns
    with open(feature_path) as fp:
        for cnt, line in enumerate(fp):
            #print("Adding {} to features".format(line))
            line = line.strip('\n')
            line = line.strip('\t')
            line = line.strip("'")

            print(cnt)   
            print(line)
            feature_list.append(line)

            # Treat each listed feature as a numeric column; the commented-out
            # alternative below reads a per-feature vocabulary file instead.
            indicator = feature_column.numeric_column(line)
            
            #feature_column.categorical_column_with_vocabulary_file(
            #            key = line, vocabulary_file = os.path.join(dict_path,"{}.txt".format(line)), default_value=0)

            print(indicator)
            feature_columns.append(indicator)


            #feature_columns.append(feature_column.embedding_column(indicator, dimension=8))

    # Build every pairwise combination (with replacement) of the selected features.
    feature_comb = combinations_with_replacement(feature_list, 2)
    lst = list(feature_comb)
    limit = len(feature_list)
    print(limit)
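    # Compute the indices of the self-pairs (a feature paired with itself) in
    # the combinations list so they can be skipped when building crossed columns.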
    j = 0
    k = limit - 1
    lookup = []
    for i in range(limit):
        lookup.append(i+j)
        j = j + k
        k = k - 1 


    for i in range(len(lst)):
        if i in lookup:
            continue  # skip self-pairs (a feature crossed with itself)
        else:
            one, two = lst[i]
            crossed_feature = feature_column.crossed_column([one, two], hash_bucket_size=1000)
            crossed_feature = feature_column.indicator_column(crossed_feature)
            feature_columns.append(crossed_feature)

    """
    crossed_feature = feature_column.crossed_column(['issue', 'naturalCourt'], hash_bucket_size=1000)
    crossed_feature = feature_column.indicator_column(crossed_feature)
    feature_columns.append(crossed_feature)

    crossed_feature = feature_column.crossed_column(['issueArea', 'naturalCourt'],hash_bucket_size=1000)
    crossed_feature = feature_column.indicator_column(crossed_feature)
    feature_columns.append(crossed_feature)
    """
   # court_buckets = feature_column.bucketized_column(naturalCourt, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

    #print(feature_columns)
    feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
    #for item in feature_columns:
    #    print(item)
    #    print("\n")

    return feature_layer

def understand_input_pipeline(train_ds):
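    """Print the feature names and a batch of labels from the first training batch."""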
    for feature_batch, label_batch in train_ds.take(1):
        print('Every feature:', list(feature_batch.keys()))
        #print('A batch of ages:', feature_batch['age'])
        print('A batch of targets:', label_batch )

def create_model(log_dir, feature_layer, train_ds, val_ds, test_ds, epochs=8):
    """Build, train and evaluate a small dense network on the prepared datasets."""
    
    model = tf.keras.Sequential([
        feature_layer,
        layers.Dense(128, activation='relu'),
        layers.Dense(128, activation='relu'),
        layers.Dense(1)
    ])

    model.compile(optimizer='adam',
            loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
            metrics=['accuracy', 'mae'])

    # Log training to TensorBoard under log_dir (the directory prepared in main).
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)

    history = model.fit(train_ds,
            validation_data=val_ds,
            epochs=epochs,
            callbacks=[tensorboard_callback])

    loss, accuracy, mae = model.evaluate(test_ds)
    print("Accuracy", accuracy)
    model.summary()
    return model,history

def plot_history(history):
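    """Plot training and validation mean absolute error against epoch."""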

    hist = pd.DataFrame(history.history)
    print(hist)
    hist['epoch'] = history.epoch

    plt.figure()
    plt.xlabel('Epoch')
    plt.ylabel('Mean Absolute Error')
    plt.plot(hist['epoch'], hist['mae'],
            label='Train Error')
    plt.plot(hist['epoch'], hist['val_mae'],
            label = 'Val Error')
    plt.legend()
    plt.show()

def explain_kernel(model, df_train):
    """Sketch of SHAP analysis for the trained model (experimental; not called from main yet)."""
    # KernelExplainer is a general approach that can work with any ML framework.
    # Its inputs are a prediction function and (a summary of) the training data.

    # Summarize the training set with k-means to accelerate the analysis.
    df_train_summary = shap.kmeans(df_train.values, 25)

    # Instantiate an explainer with the model predictions and the training-data summary.
    # Note: model.predict must accept the same feature format used during training.
    explainer = shap.KernelExplainer(model.predict, df_train_summary)

    # Extract Shapley values from the explainer.
    shap_values = explainer.shap_values(df_train.values)
    return shap_values

if __name__ == "__main__":
    cwd, log_dir = get_environmental_variables()
    print(cwd)
    df = load_data(cwd)
    train, test, val = split_dataframe(df)
    train_ds, val_ds, test_ds = get_input_pipeline(train, test, val, 32)
    understand_input_pipeline(train_ds)
    feature_layer = get_feature_layer(cwd)
    model, model_history = create_model(log_dir, feature_layer, train_ds, val_ds, test_ds)
    #plot_history(model_history)
    #print(model.predict(train_ds))
    #print(model.predict)
    #explain_kernel(model, train.drop(columns=['vote']))
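
    # Planned notebook usage (sketch): the note above mentions importing this
    # file from an ipynb. Assuming it is saved as supremeCourtPrediction.py
    # next to the data/ directory, a hypothetical notebook cell could run:
    #
    #   import supremeCourtPrediction as scp
    #   cwd, log_dir = scp.get_environmental_variables()
    #   df = scp.load_data(cwd)
    #   train, test, val = scp.split_dataframe(df)
    #   train_ds, val_ds, test_ds = scp.get_input_pipeline(train, test, val, 32)
    #   feature_layer = scp.get_feature_layer(cwd)
    #   model, history = scp.create_model(log_dir, feature_layer, train_ds, val_ds, test_ds)
    #   scp.plot_history(history)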