Vous êtes sur la page 1sur 18

{

"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "working Project.ipynb",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "wC3t9KM0NKt0",
"colab_type": "text"
},
"source": [
"## Analysis of Suicide Rate Data Informed by Human Rights Scores, Unemployment, and GDP\n",
"By Stephanie Ford and Emily Thomas\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "9KLr2I2XwyqZ",
"colab_type": "text"
},
"source": [
"### Reading, Cleaning and Merging Datasets"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "lenQ4wtjwtF-",
"colab_type": "text"
},
"source": [
"Importing Relevant Libraries:"
]
},
{
"cell_type": "code",
"metadata": {
"id": "kI60kZaLOxDo",
"colab_type": "code",
"outputId": "04234699-4c76-483d-9963-a9e10ce40d52",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 255
}
},
"source": [
"import pandas as pd\n",
"!pip install --upgrade geopandas\n",
"import geopandas as gpd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.tree import export_graphviz\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"from sklearn.model_selection import train_test_split\n",
"import random\n",
"from numpy.testing import assert_equal\n",
"\n",
"%matplotlib inline"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"Requirement already up-to-date: geopandas in
/usr/local/lib/python3.6/dist-packages (0.7.0)\n",
"Requirement already satisfied, skipping upgrade: fiona in
/usr/local/lib/python3.6/dist-packages (from geopandas) (1.8.13.post1)\n",
"Requirement already satisfied, skipping upgrade: pyproj>=2.2.0 in
/usr/local/lib/python3.6/dist-packages (from geopandas) (2.5.0)\n",
"Requirement already satisfied, skipping upgrade: pandas>=0.23.0 in
/usr/local/lib/python3.6/dist-packages (from geopandas) (0.25.3)\n",
"Requirement already satisfied, skipping upgrade: shapely in
/usr/local/lib/python3.6/dist-packages (from geopandas) (1.7.0)\n",
"Requirement already satisfied, skipping upgrade: attrs>=17 in
/usr/local/lib/python3.6/dist-packages (from fiona->geopandas) (19.3.0)\n",
"Requirement already satisfied, skipping upgrade: six>=1.7 in
/usr/local/lib/python3.6/dist-packages (from fiona->geopandas) (1.12.0)\n",
"Requirement already satisfied, skipping upgrade: click<8,>=4.0 in
/usr/local/lib/python3.6/dist-packages (from fiona->geopandas) (7.0)\n",
"Requirement already satisfied, skipping upgrade: cligj>=0.5 in
/usr/local/lib/python3.6/dist-packages (from fiona->geopandas) (0.5.0)\n",
"Requirement already satisfied, skipping upgrade: click-plugins>=1.0 in
/usr/local/lib/python3.6/dist-packages (from fiona->geopandas) (1.1.1)\n",
"Requirement already satisfied, skipping upgrade: munch in
/usr/local/lib/python3.6/dist-packages (from fiona->geopandas) (2.5.0)\n",
"Requirement already satisfied, skipping upgrade: python-
dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.23.0-
>geopandas) (2.6.1)\n",
"Requirement already satisfied, skipping upgrade: pytz>=2017.2 in
/usr/local/lib/python3.6/dist-packages (from pandas>=0.23.0->geopandas)
(2018.9)\n",
"Requirement already satisfied, skipping upgrade: numpy>=1.13.3 in
/usr/local/lib/python3.6/dist-packages (from pandas>=0.23.0->geopandas) (1.17.5)\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ayh8XkrsNWnP",
"colab_type": "text"
},
"source": [
"Reading in and Cleaning the Data:"
]
},
{
"cell_type": "code",
"metadata": {
"id": "PnJYI_gXNdZO",
"colab_type": "code",
"colab": {}
},
"source": [
"# Country Shape File:\n",
"def clean_country_shape(country_shape_file):\n",
"    '''\n",
"    Reads a country shape file with geopandas and reduces it to the\n",
"    SOVEREIGNT and geometry columns. 'United States of America' is\n",
"    renamed to 'United States' so the name matches the other datasets.\n",
"\n",
"    Returns the cleaned frame; asserts that it has shape (177, 2).\n",
"    '''\n",
"    geodata = gpd.read_file(country_shape_file)\n",
"    # .copy() gives an independent frame, so the rename below writes to\n",
"    # this frame and not to a temporary slice of the file that was read.\n",
"    geodata = geodata[['SOVEREIGNT', 'geometry']].copy()\n",
"    # Vectorised rename; no manual row counter is needed.\n",
"    geodata['SOVEREIGNT'] = geodata['SOVEREIGNT'].replace(\n",
"        {'United States of America': 'United States'})\n",
"    assert geodata.shape == (177, 2)\n",
"    return geodata\n",
"\n",
"\n",
"# Human Rights Scores Data:\n",
"def clean_human_rights(hr_file):\n",
"    '''\n",
"    Loads the human rights CSV into a pandas dataframe and shortens\n",
"    the name of the score column to 'HR_score'.\n",
"\n",
"    The result is asserted to have 11717 rows and the columns\n",
"    Entity, Code, Year and HR_score, and is then returned.\n",
"    '''\n",
"    # The original column name is very long, so it is kept in one place.\n",
"    long_name = ('Human Rights Protection Scores – by Christopher '\n",
"                 'Farris and Keith Schnakenberg')\n",
"    expected_columns = ['Entity', 'Code', 'Year', 'HR_score']\n",
"    scores = pd.read_csv(hr_file).rename(columns={long_name: 'HR_score'})\n",
"    assert scores.shape == (11717, 4)\n",
"    assert_equal(list(scores.columns), expected_columns)\n",
"    return scores\n",
"\n",
"\n",
"# Suicide Data:\n",
"def clean_suicide_data(suicide_file):\n",
"    '''\n",
"    Reads a suicide CSV into a pandas dataframe and reduces it to one\n",
"    row per country and year.\n",
"\n",
"    Only country, year, suicides/100k pop, gdp_per_capita ($) and\n",
"    HDI for year are kept. The rows of each (country, year) pair are\n",
"    averaged into one overall rate, which removes the rates specific\n",
"    to generations and to male vs female. 'Russian Federation' is\n",
"    renamed to 'Russia' to match the other datasets.\n",
"\n",
"    Returns the aggregated dataframe; asserts that it has shape\n",
"    (2321, 5) and the five expected columns.\n",
"    '''\n",
"    wanted = ['country', 'year', 'suicides/100k pop',\n",
"              'gdp_per_capita ($)', 'HDI for year']\n",
"    suicide = pd.read_csv(suicide_file)\n",
"    overall_suicide = (suicide[wanted]\n",
"                       .groupby(['country', 'year'])\n",
"                       .mean()\n",
"                       .reset_index())\n",
"\n",
"    # Rename countries to match other datasets (vectorised, so it does\n",
"    # not depend on a hand-maintained row counter).\n",
"    overall_suicide['country'] = overall_suicide['country'].replace(\n",
"        {'Russian Federation': 'Russia'})\n",
"\n",
"    assert overall_suicide.shape == (2321, 5)\n",
"    assert_equal(list(overall_suicide.columns), wanted)\n",
"    return overall_suicide\n",
"\n",
"\n",
"# Unemployment Data:\n",
"def clean_unemployment(unemp_file):\n",
"    '''\n",
"    Reads an unemployment CSV into a pandas dataframe and reshapes it\n",
"    from wide form (one row per country, one column per year) to long\n",
"    form (one row per country and year).\n",
"\n",
"    Only the years 1985 through 2015 are kept. The year labels are\n",
"    left as strings, exactly as they appear in the column headers.\n",
"    'Russian Federation' is renamed to 'Russia' to match the other\n",
"    datasets.\n",
"\n",
"    Returns a dataframe with the columns index, Country, Year and\n",
"    Unemployment, where index is the row label the country had in the\n",
"    wide table. Asserts that the result has shape (8184, 4).\n",
"    '''\n",
"    unemp = pd.read_csv(unemp_file, skiprows=2, header=1)\n",
"    unemp = unemp.drop(columns=['Country Code', 'Indicator Name',\n",
"                                'Indicator Code'])\n",
"\n",
"    year_cols = [str(year) for year in range(1985, 2016)]\n",
"    small_unemp = unemp.loc[:, year_cols + ['Country Name']]\n",
"\n",
"    # Each value is read from the row of the country it is stored under.\n",
"    # Looking rows up with a counter that restarts for every country\n",
"    # would pair a country name with another country's rates.\n",
"    # Collecting plain records and building the frame once also avoids\n",
"    # growing a dataframe row by row.\n",
"    records = []\n",
"    for row_label, row in small_unemp.iterrows():\n",
"        for year in year_cols:\n",
"            records.append({'index': row_label,\n",
"                            'Country': row['Country Name'],\n",
"                            'Year': year,\n",
"                            'Unemployment': row[year]})\n",
"    columns = ['index', 'Country', 'Year', 'Unemployment']\n",
"    unemployment = pd.DataFrame(records, columns=columns)\n",
"\n",
"    # Rename countries to match other datasets\n",
"    unemployment['Country'] = unemployment['Country'].replace(\n",
"        {'Russian Federation': 'Russia'})\n",
"\n",
"    assert unemployment.shape == (8184, 4)\n",
"    assert_equal(list(unemployment.columns), columns)\n",
"    return unemployment"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "zGc8O7fSrNzZ",
"colab_type": "text"
},
"source": [
"Merging the Dataframes: <br>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "xSG1l9lodjeS",
"colab_type": "code",
"colab": {}
},
"source": [
"def merge_df(suicide_df, geo_df, hr_df, unemp_df):\n",
"    '''\n",
"    Merges the suicide, geospatial, human rights and unemployment\n",
"    dataframes on matching country and year values and drops the\n",
"    duplicated key columns.\n",
"\n",
"    Every suicide row is kept: the geometry is attached with a right\n",
"    join onto the suicide data, and the human rights and unemployment\n",
"    values are attached with left joins. unemp_df is not modified; its\n",
"    Year values are converted to int on a copy.\n",
"\n",
"    Returns the combined dataframe; asserts that it has shape\n",
"    (2496, 9) and the expected column order.\n",
"    '''\n",
"    suicide_and_geo = geo_df.merge(suicide_df, right_on='country',\n",
"                                   left_on='SOVEREIGNT', how='right')\n",
"\n",
"    # Suicide, geo, and HR combined:\n",
"    suicide_and_geo_and_HR = suicide_and_geo.merge(\n",
"        hr_df, left_on=['country', 'year'],\n",
"        right_on=['Entity', 'Year'], how='left')\n",
"\n",
"    # assign() returns a new frame, so the caller's unemployment data\n",
"    # keeps its original Year values.\n",
"    unemp_int_year = unemp_df.assign(Year=unemp_df['Year'].astype(int))\n",
"    # NOTE(review): this join uses the 'Year' column that came from\n",
"    # hr_df, so rows without a human rights match cannot match an\n",
"    # unemployment row either - confirm this is intended.\n",
"    combined_data = suicide_and_geo_and_HR.merge(\n",
"        unemp_int_year, left_on=['country', 'Year'],\n",
"        right_on=['Country', 'Year'], how='left')\n",
"\n",
"    to_drop = ['SOVEREIGNT', 'Entity', 'year', 'index', 'country']\n",
"    combined_data = combined_data.drop(columns=to_drop)\n",
"    assert combined_data.shape == (2496, 9)\n",
"    assert_equal(list(combined_data.columns),\n",
"                 ['geometry', 'suicides/100k pop', 'gdp_per_capita ($)',\n",
"                  'HDI for year', 'Code', 'Year', 'HR_score', 'Country',\n",
"                  'Unemployment'])\n",
"    return combined_data"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "NyQfRD9hxmSh",
"colab_type": "text"
},
"source": [
"### Data Analysis\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "GzRL1ZHjCVOa",
"colab_type": "text"
},
"source": [
"Here we use `lin_reg` to compute a linear regression and its $R^2$ value for every comparison, both globally and for individual countries. The analysis covers all years in the data.\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "NkTzv_gCO5LR",
"colab_type": "code",
"colab": {}
},
"source": [
"def R_2(actual, predicted):\n",
"    '''\n",
"    Coefficient of determination (R^2) of the predicted y values\n",
"    against the actual y values, returned as np.float64.\n",
"    '''\n",
"    residuals = actual - predicted\n",
"    deviations = actual - actual.mean()\n",
"    unexplained = residuals.dot(residuals)\n",
"    total = deviations.dot(deviations)\n",
"    return 1 - unexplained / total"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "216nHYuylj5Z",
"colab_type": "code",
"colab": {}
},
"source": [
"def lin_reg(dataframe, interest_col_name='Year', country='global'):\n",
"    '''\n",
"    Fits a least-squares line of suicide rate against one column of the\n",
"    dataframe, plots and saves it, and prints the fit and its R^2.\n",
"\n",
"    dataframe: data with a 'suicides/100k pop' column, the interest\n",
"        column and, for country fits, a 'Country' column.\n",
"    interest_col_name: column used as x. Must be specified whenever\n",
"        country is passed positionally.\n",
"    country: 'global' uses every row; any other value restricts the\n",
"        fit to the rows whose 'Country' equals it.\n",
"\n",
"    Nothing is plotted or printed when the selection has no complete\n",
"    rows, or when x takes fewer than two distinct values (no line can\n",
"    be fitted then). Returns None.\n",
"    '''\n",
"    rows = dataframe\n",
"    if country != 'global':\n",
"        rows = dataframe.loc[dataframe['Country'] == country]\n",
"    relevant = rows[[interest_col_name, 'suicides/100k pop']].dropna()\n",
"    x = relevant[interest_col_name]\n",
"    suicide_rate = relevant['suicides/100k pop']\n",
"\n",
"    # With fewer than two distinct x values the denominator below is\n",
"    # zero and the slope is undefined; this also covers empty data.\n",
"    if x.nunique() < 2:\n",
"        return\n",
"\n",
"    # now we need to calculate the m and b\n",
"    denominator = x.dot(x) - x.mean() * x.sum()\n",
"    m = (x.dot(suicide_rate) - suicide_rate.mean() * x.sum()) / denominator\n",
"    b = (suicide_rate.mean() * x.dot(x)\n",
"         - x.mean() * x.dot(suicide_rate)) / denominator\n",
"    suicide_rate_pred = m * x + b\n",
"\n",
"    # Rsq value:\n",
"    R_sq = R_2(suicide_rate, suicide_rate_pred)\n",
"\n",
"    # plotted\n",
"    plt.scatter(x, suicide_rate)\n",
"    plt.plot(x, suicide_rate_pred, 'b')\n",
"    plt.title('Suicide rate v. ' + str(interest_col_name) + ', '\n",
"              + str(country))\n",
"    plt.savefig('Suicide_rate_regression' + str(interest_col_name).strip()\n",
"                + str(country).strip() + '.png')\n",
"    plt.show()\n",
"\n",
"    print(country, interest_col_name, 'R squared:', R_sq)\n",
"    print('linear correlation: suicide rate =', m, '*',\n",
"          interest_col_name, '+', b)\n",
"    if R_sq >= .85:\n",
"        verdict = 'This is a good model! :) It explains'\n",
"    elif R_sq >= .6:\n",
"        verdict = 'This is an okay model! It explains'\n",
"    else:\n",
"        verdict = 'This is a poor model--it explains'\n",
"    print(verdict, int(R_sq * 100),\n",
"          '% of the variation in suicide rate as', interest_col_name,\n",
"          'changes.')\n"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "5XO5CYrG3tEq",
"colab_type": "code",
"colab": {}
},
"source": [
""
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "AvvN1feWKacL",
"colab_type": "text"
},
"source": [
"Here we use `map_suicide_rates` to visualize how suicide rates have changed across the world from 1985 to 2015, by averaging the suicide rate of each country over ten-year periods (1985 to 1995, 1995 to 2005, and 2005 to 2015)."
]
},
{
"cell_type": "code",
"metadata": {
"id": "1eZOMh8I7ca9",
"colab_type": "code",
"colab": {}
},
"source": [
"def map_suicide_rates(combined_df, geo_df):\n",
"    '''\n",
"    Draws one figure with three stacked world maps of the average\n",
"    suicide rate per country, for 1985 to 1995, 1995 to 2005 and 2005\n",
"    onwards, and saves it as 'Suicide_rate_over_time.png'.\n",
"\n",
"    combined_df supplies Year, Country, geometry and suicide rate;\n",
"    geo_df supplies the grey base map drawn under each panel. The\n",
"    decade boundaries are inclusive, so 1995 and 2005 each count\n",
"    towards two maps. The averages for Albania and Russia are asserted\n",
"    against the values hard-coded below.\n",
"    '''\n",
"    fig, axes = plt.subplots(3, 1, constrained_layout=True,\n",
"                             figsize=(10,8))\n",
"    for ax in axes:\n",
"        geo_df.plot(ax=ax, color='#CFCFCF')\n",
"\n",
"    reduced_cdf = combined_df[['Year', 'Country',\n",
"                               'geometry', 'suicides/100k pop']]\n",
"    year = reduced_cdf['Year']\n",
"\n",
"    # (rows of the period, expected Albania mean, expected Russia mean)\n",
"    periods = [\n",
"        (reduced_cdf[(year >= 1985) & (year <= 1995)], 2.63, 38.92),\n",
"        (reduced_cdf[(year >= 1995) & (year <= 2005)], 4.29, 40.92),\n",
"        (reduced_cdf[year >= 2005], 2.48, 27.29),\n",
"    ]\n",
"\n",
"    grouped = []\n",
"    for rows, albania_mean, russia_mean in periods:\n",
"        means = rows.dropna().dissolve(by='Country', aggfunc='mean')\n",
"        assert_equal(\n",
"            round(means.loc['Albania', 'suicides/100k pop'], 2),\n",
"            albania_mean)\n",
"        assert_equal(\n",
"            round(means.loc['Russia', 'suicides/100k pop'], 2),\n",
"            russia_mean)\n",
"        grouped.append(means)\n",
"\n",
"    for ax, means in zip(axes, grouped):\n",
"        means.plot(column='suicides/100k pop', legend=True,\n",
"                   ax=ax, vmin=0, vmax=50, edgecolor='#EEEEEE')\n",
"\n",
"    titles = ['Average Suicide Rate per Country: 1985 to 1995',\n",
"              'Average Suicide Rate per Country: 1995 to 2005',\n",
"              'Average Suicide Rate per Country: 2005 to 2015']\n",
"    for ax, title in zip(axes, titles):\n",
"        ax.set_title(title, fontsize=14)\n",
"    fig.suptitle(\n",
"        'Number of Suicides per 100k Population Across Three Decades',\n",
"        fontsize=18)\n",
"    fig.savefig('Suicide_rate_over_time.png')\n",
"\n",
"    plt.show()"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "KdPNScvsWJY8",
"colab_type": "code",
"colab": {}
},
"source": [
"def plot_GDP_HR_Unemp_Suic(country, combined_df):\n",
"    '''\n",
"    Draws four stacked line plots of GDP per capita, human rights\n",
"    score, unemployment rate and suicide rate against year for one\n",
"    country, and saves the figure as\n",
"    'combined_country_data_ove_time.png'.\n",
"\n",
"    country: value looked up in the 'Country' column of combined_df.\n",
"    combined_df: the merged dataframe.\n",
"\n",
"    When the country is not in the data, a message is printed instead;\n",
"    it suggests a randomly chosen country that has a row without\n",
"    missing values, if there is one. Returns None in both cases.\n",
"    '''\n",
"    if country not in set(combined_df['Country']):\n",
"        candidates = list(combined_df.dropna()['Country'])\n",
"        if candidates:\n",
"            print(str(country), \"is not in dataset. Try\",\n",
"                  random.choice(candidates), '.')\n",
"        else:\n",
"            # random.choice raises IndexError on an empty list.\n",
"            print(str(country), \"is not in dataset.\")\n",
"        return None\n",
"\n",
"    fig, axes = plt.subplots(4, 1, figsize=(10,15),\n",
"                             constrained_layout=True)\n",
"    country_df = combined_df[combined_df['Country'] == country]\n",
"\n",
"    # (column to plot against Year, title prefix), top panel first\n",
"    panels = [\n",
"        ('gdp_per_capita ($)', 'GDP per Capita vs Year in '),\n",
"        ('HR_score', 'Human Rights Score vs Year in '),\n",
"        ('Unemployment', 'Unemployment vs Year in '),\n",
"        ('suicides/100k pop',\n",
"         'Suicide Rate (per 100k population) vs Year in '),\n",
"    ]\n",
"    for ax, (column, title) in zip(axes, panels):\n",
"        country_df[['Year', column]].plot(x='Year', y=column, ax=ax,\n",
"                                          legend=False)\n",
"        # Title every axes explicitly instead of relying on pyplot's\n",
"        # notion of the current axes for the last panel.\n",
"        ax.set_title(title + str(country))\n",
"\n",
"    fig.suptitle('Yearly Data for ' + str(country), fontsize=16)\n",
"    fig.savefig('combined_country_data_ove_time.png')\n",
"    plt.show()"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "QIdh-Uo41dT3",
"colab_type": "code",
"colab": {}
},
"source": [
"def accuraccy(y_true, y_pred, tolerance=1):\n",
"    '''\n",
"    Fraction of predictions that lie within `tolerance` of the true\n",
"    value (a difference equal to the tolerance counts as a match).\n",
"\n",
"    y_true, y_pred: equally long pandas Series or numpy arrays.\n",
"    tolerance: largest absolute error still counted as correct,\n",
"        default 1.\n",
"\n",
"    Pairs whose difference is NaN are counted as misses. Raises\n",
"    ZeroDivisionError when y_true is empty.\n",
"    '''\n",
"    # Wrapping in a Series lets plain numpy arrays be passed as well.\n",
"    differences = pd.Series(y_true - y_pred)\n",
"    within = differences.abs() <= tolerance\n",
"    return int(within.sum()) / len(y_true)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "Njk_8wPqFoWs",
"colab_type": "code",
"colab": {}
},
"source": [
"def ML_model(data, desire='mse', tolerance=1, predict_set=None,\n",
"             random_state=None):\n",
"    '''\n",
"    Trains a scikit-learn decision tree that predicts the suicide\n",
"    rate from every other column of the data, and evaluates it on a\n",
"    held-out 30% test split.\n",
"\n",
"    data: dataframe with a 'geometry' and a 'suicides/100k pop'\n",
"        column. The geometry column and all rows with missing values\n",
"        are dropped, and the inputs are passed through pd.get_dummies.\n",
"    desire: selects what is returned:\n",
"        'mse'        -> prints and returns (test MSE, accuracy)\n",
"        'model'      -> returns the fitted model\n",
"        'both'       -> prints the test MSE and returns the model\n",
"        'prediction' -> returns the predictions for predict_set\n",
"    tolerance: passed to accuraccy(), default 1.\n",
"    predict_set: dataframe of inputs formatted like the training\n",
"        inputs; only used when desire is 'prediction'.\n",
"    random_state: optional seed for the split and the tree so results\n",
"        can be reproduced. The default None keeps them random.\n",
"\n",
"    Returns None for any other combination of arguments.\n",
"    '''\n",
"    from sklearn.metrics import mean_squared_error\n",
"\n",
"    # get rid of the geometry column and of NaN rows\n",
"    data = data.drop(columns=['geometry']).dropna()\n",
"\n",
"    # The country names are strings--enter: get dummies\n",
"    input_dat = data.loc[:, data.columns != 'suicides/100k pop']\n",
"    dummied_dat = pd.get_dummies(input_dat)\n",
"    output = data['suicides/100k pop']\n",
"\n",
"    # Set some data aside for testing\n",
"    input_train, input_test, output_train, output_test = train_test_split(\n",
"        dummied_dat, output, test_size=0.3, random_state=random_state)\n",
"\n",
"    test_share = len(input_test) / (len(input_test) + len(input_train))\n",
"    if abs(test_share - .3) > 0.02:\n",
"        print('Test failed: check test length')\n",
"\n",
"    # Now, to make the tree and train it!\n",
"    model = DecisionTreeRegressor(random_state=random_state)\n",
"    model.fit(input_train, output_train)\n",
"\n",
"    # How good is it at predicting the held-out rows?\n",
"    output_test_pred = model.predict(input_test)\n",
"    test_mse = mean_squared_error(output_test, output_test_pred)\n",
"    acc = accuraccy(output_test, output_test_pred, tolerance)\n",
"\n",
"    # 'is not None' is required here: comparing a dataframe with\n",
"    # '!= None' is element-wise and cannot be used as an if-condition.\n",
"    if predict_set is not None and desire == 'prediction':\n",
"        to_predict = pd.get_dummies(predict_set.dropna())\n",
"        # Line the dummy columns up with the ones the model was trained\n",
"        # on; categories missing from predict_set become all-zero columns.\n",
"        to_predict = to_predict.reindex(columns=dummied_dat.columns,\n",
"                                        fill_value=0)\n",
"        return model.predict(to_predict)\n",
"\n",
"    if desire == 'mse':\n",
"        print('mse is:', test_mse)\n",
"        print('accuracy score is:', acc)\n",
"        return test_mse, acc\n",
"\n",
"    if desire == 'model':\n",
"        return model\n",
"\n",
"    if desire == 'both':\n",
"        print('MSE for test set is', test_mse)\n",
"        return model\n"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "4rl7CJYZzbBV",
"colab_type": "text"
},
"source": [
"### Main Method"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "dY-FdXS89otE",
"colab_type": "text"
},
"source": [
"We really need to make an initializer for our data, because we can't have it cleaned again on every test run."
]
},
{
"cell_type": "code",
"metadata": {
"id": "yNpD-Ia17esd",
"colab_type": "code",
"outputId": "5d61941f-6291-4519-d8e6-626db550bbb7",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 681
}
},
"source": [
"def main():\n",
"    '''\n",
"    Runs one round of the interactive analysis: cleans and merges the\n",
"    datasets, optionally scores the machine learning model, then shows\n",
"    either global or country specific regressions and plots.\n",
"\n",
"    Returns 'YES' when the user is done (answers N to the final\n",
"    question), 'NO' when the user asks for more information (answers\n",
"    Y), and None when that answer is neither.\n",
"    '''\n",
"    print('Cleaning the data...')\n",
"    geodata = clean_country_shape('ne_110m_admin_0_countries.shp')\n",
"    HR = clean_human_rights('human-rights-scores.csv')\n",
"    suicide = clean_suicide_data('master.csv')\n",
"    unemployment = clean_unemployment(\n",
"        'API_SL.UEM.TOTL.ZS_DS2_en_csv_v2_712954.csv')\n",
"    print('merging the data...')\n",
"    print()\n",
"    combined_df = merge_df(suicide, geodata, HR, unemployment)\n",
"\n",
"    decisionM = input(\n",
"        \"Would you like to see how accurate a machine learning model is? \"\n",
"        \"Input Y or N \")\n",
"    print()\n",
"    if decisionM == 'Y':\n",
"        decisionT = input(\n",
"            \"Choose a tolerance for accuracy? \"\n",
"            \"Input a float between 0 and 10 \")\n",
"        ML_model(combined_df, 'mse', float(decisionT))\n",
"        print()\n",
"\n",
"    redo_prompt = (\"Would you like to see more information on suicide? \"\n",
"                   \"Input Y or N \")\n",
"    decision = input(\n",
"        \"Would you like to see country specific data over time? \"\n",
"        \"Input Y or N \")\n",
"    if decision == 'N':\n",
"        print()\n",
"        print()\n",
"        print(\"Okay, here's some global data!\")\n",
"        for column in ('Year', 'gdp_per_capita ($)', 'HDI for year',\n",
"                       'HR_score', 'Unemployment'):\n",
"            lin_reg(combined_df, column)\n",
"            print()\n",
"        map_suicide_rates(combined_df, geodata)\n",
"    elif decision == 'Y':\n",
"        decision1 = input(\"What country would you like to see data for? \")\n",
"        print()\n",
"        print(\"Searching for\", decision1, \"data!\")\n",
"        plot_GDP_HR_Unemp_Suic(\n",
"            combined_df=combined_df, country=decision1)\n",
"        for column in ('Year', 'gdp_per_capita ($)', 'HDI for year',\n",
"                       'Unemployment', 'HR_score'):\n",
"            print()\n",
"            print()\n",
"            lin_reg(combined_df, column, country=decision1)\n",
"        print()\n",
"        print()\n",
"        redo_prompt = \"Would you like to see more information? Input Y or N \"\n",
"    else:\n",
"        # Only an answer other than Y or N is reported as not understood.\n",
"        print(\"I didn't get that!\")\n",
"\n",
"    decision_redo = input(redo_prompt)\n",
"    if decision_redo == 'N':\n",
"        return 'YES'\n",
"    if decision_redo == 'Y':\n",
"        return 'NO'\n",
"    return None\n",
"\n",
"if __name__ == '__main__':\n",
"    # Call main() exactly once per round. Calling it again in the loop\n",
"    # body would run a second round whose answer is ignored.\n",
"    while main() != 'YES':\n",
"        pass"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"Cleaning the data...\n",
"merging the data...\n",
"\n",
"Would you like to see how accurate a machine learning model is? Input
Y or N Y\n",
"\n",
"Choose a tolerance for accuracy? Input a float between 0 and 10 2\n",
"mse is: 27.69566245039682\n",
"accuracy score is: 0.6071428571428571\n",
"\n"
],
"name": "stdout"
},
{
"output_type": "error",
"ename": "KeyboardInterrupt",
"evalue": "ignored",
"traceback": [

"\u001b[0;31m----------------------------------------------------------------------
-----\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m
Traceback (most recent call last)",
"\u001b[0;32m/usr/local/lib/python3.6/dist-
packages/ipykernel/kernelbase.py\u001b[0m in
\u001b[0;36m_input_request\u001b[0;34m(self, prompt, ident, parent,
password)\u001b[0m\n\u001b[1;32m 729\u001b[0m
\u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u00
1b[0m\u001b[0m\n\u001b[0;32m--> 730\u001b[0;31m
\u001b[0mident\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreply\u001b[0m
\u001b[0;34m=\u001b[0m
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msession\u001b[0m\u001b[0;34m.\
u001b[0m\u001b[0mrecv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;
34m.\u001b[0m\u001b[0mstdin_socket\u001b[0m\u001b[0;34m,\u001b[0m
\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b
[0m\u001b[0m\n\u001b[0m\u001b[1;32m 731\u001b[0m
\u001b[0;32mexcept\u001b[0m
\u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\
u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.6/dist-
packages/jupyter_client/session.py\u001b[0m in \u001b[0;36mrecv\u001b[0;34m(self,
socket, mode, content, copy)\u001b[0m\n\u001b[1;32m 802\u001b[0m
\u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u00
1b[0m\u001b[0m\n\u001b[0;32m--> 803\u001b[0;31m
\u001b[0mmsg_list\u001b[0m \u001b[0;34m=\u001b[0m
\u001b[0msocket\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv_multipart\u001b[0m\u001
b[0;34m(\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m
\u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m)\u00
1b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m
804\u001b[0m \u001b[0;32mexcept\u001b[0m
\u001b[0mzmq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mZMQError\u001b[0m
\u001b[0;32mas\u001b[0m
\u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m
\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.6/dist-
packages/zmq/sugar/socket.py\u001b[0m in
\u001b[0;36mrecv_multipart\u001b[0;34m(self, flags, copy,
track)\u001b[0m\n\u001b[1;32m 465\u001b[0m \"\"\"\n\u001b[0;32m-->
466\u001b[0;31m \u001b[0mparts\u001b[0m \u001b[0;34m=\u001b[0m
\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv\u00
1b[0m\u001b[0;34m(\u001b[0m\u001b[0mflags\u001b[0m\u001b[0;34m,\u001b[0m
\u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m,\u00
1b[0m
\u001b[0mtrack\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtrack\u001b[0m\u001b[0;34m)\u
001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\
u001b[0m\u001b[1;32m 467\u001b[0m \u001b[0;31m# have first part already,
only loop while more to
receive\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u00
1b[0m\n",
"\u001b[0;32mzmq/backend/cython/socket.pyx\u001b[0m in
\u001b[0;36mzmq.backend.cython.socket.Socket.recv\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mzmq/backend/cython/socket.pyx\u001b[0m in
\u001b[0;36mzmq.backend.cython.socket.Socket.recv\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mzmq/backend/cython/socket.pyx\u001b[0m in
\u001b[0;36mzmq.backend.cython.socket._recv_copy\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.6/dist-
packages/zmq/backend/cython/checkrc.pxd\u001b[0m in
\u001b[0;36mzmq.backend.cython.checkrc._check_rc\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: ",
"\nDuring handling of the above exception, another exception
occurred:\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m
Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-66-b63a1bb39c2c>\u001b[0m in
\u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 80\u001b[0m
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 81\u001b[0m
\u001b[0;32mif\u001b[0m \u001b[0m__name__\u001b[0m \u001b[0;34m==\u001b[0m
\u001b[0;34m'__main__'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;
34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 82\u001b[0;31m \u001b[0;32mwhile\u001b[0m
\u001b[0mmain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!
=\u001b[0m
\u001b[0;34m'YES'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u
001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 83\u001b[0m
\u001b[0mmain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001
b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m<ipython-input-66-b63a1bb39c2c>\u001b[0m in
\u001b[0;36mmain\u001b[0;34m()\u001b[0m\n\u001b[1;32m 20\u001b[0m
\u001b[0;31m#print(fit_and_predict_degrees(combined_df))\u001b[0m\u001b[0;34m\u001b
[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m
21\u001b[0m decision = input(\n\u001b[0;32m---> 22\u001b[0;31m \"Would you
like to see country specific data over time? Input Y or
N \")\n\u001b[0m\u001b[1;32m 23\u001b[0m \u001b[0;32mif\u001b[0m
\u001b[0mdecision\u001b[0m \u001b[0;34m==\u001b[0m
\u001b[0;34m'N'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u00
1b[0m\u001b[0m\n\u001b[1;32m 24\u001b[0m
\u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u00
1b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.6/dist-
packages/ipykernel/kernelbase.py\u001b[0m in
\u001b[0;36mraw_input\u001b[0;34m(self, prompt)\u001b[0m\n\u001b[1;32m
703\u001b[0m
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parent_ident\u001b[0m\u001b[0
;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m
704\u001b[0m
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parent_header\u001b[0m\u001b[
0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-->
705\u001b[0;31m
\u001b[0mpassword\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;
34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1
;32m 706\u001b[0m )\n\u001b[1;32m 707\u001b[0m
\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.6/dist-
packages/ipykernel/kernelbase.py\u001b[0m in
\u001b[0;36m_input_request\u001b[0;34m(self, prompt, ident, parent,
password)\u001b[0m\n\u001b[1;32m 733\u001b[0m
\u001b[0;32mexcept\u001b[0m
\u001b[0mKeyboardInterrupt\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001
b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 734\u001b[0m
\u001b[0;31m# re-raise KeyboardInterrupt, to truncate
traceback\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u
001b[0m\n\u001b[0;32m--> 735\u001b[0;31m \u001b[0;32mraise\u001b[0m
\u001b[0mKeyboardInterrupt\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b
[0m\n\u001b[0m\u001b[1;32m 736\u001b[0m
\u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u0
01b[0m\u001b[0m\n\u001b[1;32m 737\u001b[0m
\u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "WmUokIq2FN5H",
"colab_type": "code",
"colab": {}
},
"source": [
""
],
"execution_count": 0,
"outputs": []
}
]
}

Vous aimerez peut-être aussi