From fbe46067fb6b7562d7a8686562cea49596e55e38 Mon Sep 17 00:00:00 2001
From: Russell Jarvis <rjjarvis@asu.edu>
Date: Mon, 22 Jun 2020 12:29:52 +1000
Subject: [PATCH] Delete original_distri.py

---
 original_distri.py | 396 ---------------------------------------------
 1 file changed, 396 deletions(-)
 delete mode 100644 original_distri.py

diff --git a/original_distri.py b/original_distri.py
deleted file mode 100644
index 523dd4a..0000000
--- a/original_distri.py
+++ /dev/null
@@ -1,396 +0,0 @@
-#!/usr/bin/env python
-# coding: utf-8
-
-# # Markdown Cell Example 
-# Markdown can be readily interleaved and dispersed between code cells in notebooks.
-# ## Explanation of code below
-# The histogram shows binned readability scores on the x-axis and, on the y-axis, counts of papers that fall into each readability bin.
-# 
-# The histogram is initially populated exclusively by the ART corpus, but the idea was that every time a new author was scraped from Google Scholar, their scores would be added in, such that with each person's new search our big picture of science readability would become better informed.
-# 
-# So the histogram changes by a modest, barely perceptible amount with each author scrape, but three dots marking the author's easiest read, hardest read, and mean read were added (as sketched below).
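-# 
-# A minimal sketch of that update, assuming scores are plain floats (the names ```corpus_scores``` and ```author_scores``` are illustrative, not from this notebook):
-# 
-# ```python
-# import numpy as np
-# 
-# corpus_scores = [12.0, 14.5, 9.8, 15.2]    # readability scores from the ART corpus
-# author_scores = [11.2, 16.0, 13.1]         # scores for a newly scraped author
-# corpus_scores.extend(author_scores)        # fold the author into the big picture
-# 
-# counts, edges = np.histogram(corpus_scores, bins=10)
-# # the three dots overlaid on the histogram:
-# dots = [min(author_scores), np.mean(author_scores), max(author_scores)]
-# ```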
-# 
-# These used to be ```[mean - standard dev, mean, mean + standard dev]```, but there was a flaw in implementing that. It may simply be that the plot looked far too busy around the mean and was harder to look at.
-# 
-# There is an issue with getting the dots to appear in the centre of histogram bins. I was working under the assumption that if I knew the ```[min, mean, max]``` readability scores for Rick Gerkin, I could add half the bin width to them and the dots would be centred. That is almost correct. I forgot that these calculations are not performed on pre-binned data, so the x-coordinates of ```[min, mean, max]``` need to be shifted to the nearest histogram bin start first.
-# 
-# Think of it as a bit like snapping something to a grid in Photoshop.
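-# 
-# A minimal sketch of that snapping step, assuming the left edges of the histogram bins are known (```bin_edges``` and ```scores``` are illustrative names, not from this notebook):
-# 
-# ```python
-# import numpy as np
-# 
-# bin_edges = np.array([0.0, 2.0, 4.0, 6.0])   # left edge of each histogram bin
-# half_width = (bin_edges[1] - bin_edges[0]) / 2.0
-# bin_centers = bin_edges + half_width          # [1.0, 3.0, 5.0, 7.0]
-# 
-# scores = [3.1, 5.9]                           # e.g. [min, max] readability scores
-# # snap each score to the nearest bin centre, like snapping to a grid
-# snapped = [bin_centers[np.abs(bin_centers - s).argmin()] for s in scores]
-# # the dots land at 3.0 and 5.0, the centres of their bins
-# ```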
-
-# It should be easy to hack this code to run on a local machine, using sudo.
-# Set up the environment. This is now done in the requirements file and the postBuild script.
-# ```python
-# !pip install matplotlib
-# !pip install pandas
-# !pip install seaborn
-# 
-# import os
-# 
-# if os.path.exists('traingDats.p?dl=0'):
-#     pass
-# 
-# else:
-#     !wget https://www.dropbox.com/s/3h12l5y2pn49c80/traingDats.p?dl=0
-#     !wget https://www.dropbox.com/s/crarli3772rf3lj/more_authors_results.p?dl=0
-#     !wget https://www.dropbox.com/s/x66zf52himmp5ox/benchmarks.p?dl=0
-# ```
-
-# In[1]:
-
-
-import warnings
-warnings.filterwarnings("ignore")
-import pickle
-import copy
-import matplotlib as mpl
-import numpy as np
-import pandas as pd
-import matplotlib.pyplot as plt
-import seaborn as sns
-import os
-#import plotly as py
-
-import glob
-files = glob.glob("*.p")
-try:
-    discontents = pickle.load(open("../SComplexity/_author_specificS S Phatak.p","rb"))
-except (FileNotFoundError, OSError):
-    # fall back to fetching the pickle from Dropbox; -O pins the output filename
-    # so the load below matches what wget saved
-    get_ipython().system('wget -O "_author_specificS S Phatak.p" "https://www.dropbox.com/s/behq9h5nc0ekh3t/_author_specifics%20s%20phatak.p?dl=0"')
-    discontents = pickle.load(open("_author_specificS S Phatak.p","rb"))
-
-df = discontents[0]  # per-paper readability results for this author
-
-
-# In[2]:
-
-
-try:
-    with open('scraped_new.p?dl=0','rb') as f:
-        texts = pickle.load(f)
-except FileNotFoundError:
-    get_ipython().system('wget https://www.dropbox.com/s/1kc7alp79h701hx/scraped_new.p?dl=0')
-    with open('scraped_new.p?dl=0','rb') as f:
-        texts = pickle.load(f)
-
-queries = set([t['query'] for t in texts])
-temp = [t for t in texts if 'standard' in t.keys() and 'wikipedia' in t['link']]
-# note: these strings must match the stored search queries verbatim,
-# 'photosysnthesis' spelling included, or the filter silently drops that topic
-science = ['cancer','Vaccines','evolution','climate change','Transgenic','photosysnthesis','GMO']
-res = [t['standard'] for t in temp if t['query'] in science]
-
-mwp = np.mean(res)  # mean readability of the Wikipedia science pages
-abstract_wiki = {'standard':mwp}
-
-
-# In[3]:
-
-
-#!pip install matplotlib
-#!pip install pandas
-#!pip install seaborn
-
-if not os.path.exists('traingDats.p?dl=0'):
-    get_ipython().system('wget https://www.dropbox.com/s/3h12l5y2pn49c80/traingDats.p?dl=0')
-    get_ipython().system('wget https://www.dropbox.com/s/crarli3772rf3lj/more_authors_results.p?dl=0')
-    get_ipython().system('wget https://www.dropbox.com/s/x66zf52himmp5ox/benchmarks.p?dl=0')
-
-with open('traingDats.p?dl=0','rb') as f:
-    trainingDats = pickle.load(f)
-
-bmark = pickle.load(open('benchmarks.p?dl=0','rb'))
-ar = discontents[2]  # the author's per-document readability results
-NAME = "Sayali S. Phatak"
-trainingDats.extend(bmark)  # fold the benchmark documents into the ART corpus distribution
-
-
-# In[4]:
-
-
-print([b['standard'] for b in bmark])
-
-
-# In[5]:
-
-
-
-def get_heights(stats_items, histogram_content, x_sub_set):
-    # Note: relies on the global ``xys`` list of (bin_x, bin_height) pairs
-    # built from the seaborn distplot patches further below.
-    # (``x_sub_set`` is accepted for call compatibility but is unused.)
-    vertical_position_indices = []
-    for i in stats_items:
-        vertical_position_indices.append(find_nearest(histogram_content, i))
-    bin_width_offset = (xys[1][0] - xys[0][0])/2.0
-
-    heights = []
-    for i in vertical_position_indices:
-        heights.append(xys[i][1])
-    return heights, bin_width_offset
-
-
-def find_nearest(array, value):
-    # index of the element of ``array`` nearest to ``value``
-    array = np.asarray(array)
-    idx = (np.abs(array - value)).argmin()
-    return idx
-
-
-def snap_to_grid(author_stats, bin_centers):
-    # shift each statistic onto the nearest histogram bin centre
-    author_stats_grid = []
-    for as_ in author_stats:
-        idx = find_nearest(bin_centers, as_)
-        author_stats_grid.append(bin_centers[idx])
-    return author_stats_grid
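-
-# A quick sanity check of the helpers above (toy values, illustrative only):
-_centers = [1.0, 3.0, 5.0]
-assert find_nearest(np.asarray(_centers), 3.4) == 1
-assert snap_to_grid([3.4, 4.9], _centers) == [3.0, 5.0]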
-
-
-# In[6]:
-
-
-max_ART = np.max([ t['standard'] for t in trainingDats ])
-publication = [ t['publication'] for t in trainingDats if t['standard'] == max_ART ]
-keys = [ t.keys() for t in trainingDats if t['standard'] == max_ART ]
-
-fname = [ t['file_name'] for t in trainingDats if t['standard'] == max_ART ]
-bmark_max_art = {'standard':max_ART}
-#max_ART
-
-
-# In[7]:
-
-
-# %%capture cell: build the histogram geometry and author statistics without
-# rendering the intermediate plots.
-get_ipython().run_cell_magic('capture', '', '''
-plt.ioff()
-
-standard_sci = [t['standard'] for t in trainingDats]
-ar = [t for t in ar if type(t) is type({})]
-ar = [t for t in ar if 'standard' in t.keys()]
-xys = [(h.get_x(), h.get_height()) for h in sns.distplot(standard_sci).patches]
-
-x_grid = [h.get_x() for h in sns.distplot(standard_sci).patches]
-offset = float((x_grid[1] - x_grid[0])/2.0)
-bin_centers = [gr + offset for gr in x_grid]
-# this plot is not used yet
-
-fig = plt.figure(figsize=(10, 8), dpi=80)
-ax1 = fig.add_subplot(111)
-mean_ = np.mean([a['standard'] for a in ar])
-min_ = np.min([a['standard'] for a in ar])
-max_ = np.max([a['standard'] for a in ar])
-std_ = np.std([a['standard'] for a in ar])
-stats_items = [mean_, min_, max_]
-
-g = sns.distplot(standard_sci, label="Readability Index")
-
-histogram_content = [x[0] for x in xys]
-height_content = np.array([x[1] for x in xys])
-hc = np.array(histogram_content)
-x_sub_set = histogram_content
-
-other_name = str('Phytochromobilin C15-Z,syn - C15-E,anti isomerization: concerted or stepwise?')
-worst_height, _ = get_heights([max_ART], hc, x_sub_set)
-mwp_height, _ = get_heights([mwp], hc, x_sub_set)
-worst_height = worst_height[0]
-
-worst_distance = snap_to_grid([max_ART], bin_centers)
-mwp_distance = snap_to_grid([mwp], bin_centers)
-x, y, z = (mwp_distance[0], mwp_height[0], str('mean wikipedia'))
-
-bmark_stats_items = list(set([b['standard'] for b in bmark]))
-bmark_stats_items.append(x)
-bmark_heights, _ = get_heights(bmark_stats_items, histogram_content, x_sub_set)
-heights, bwo = get_heights(stats_items, histogram_content, x_sub_set)
-bmark_stats_items = [i + bwo for i in bmark_stats_items]
-mean_a = mean_
-min_a = min_
-max_a = max_
-xticks = list(range(0, 45, 5))
-
-box_content = [a['standard'] for a in ar]
-''')
-
-
-# In[8]:
-
-
-
-bmark_stats_items_grid = snap_to_grid(bmark_stats_items, bin_centers)
-author_stats = [mean_, min_, max_]
-author_stats_grid = snap_to_grid(author_stats, bin_centers)
-mean_a_grid = snap_to_grid([mean_a], bin_centers)
-x_sub_set_grid = snap_to_grid(x_sub_set, bin_centers)
-
-print(bmark_stats_items_grid)
-
-
-# In[9]:
-
-
-categories = [ "Readibility of Science Declining Over Time", "Post Modern Essay Generator","upgoer 5","Science of Writing","Mean Wikipedia"]#"Mean Wikipedia"]#,other_name]# "wikipedia science"]
-bmark_stats_items_grid
-
-
-# In[10]:
-
-
-# recompute the dot heights at the snapped x positions
-recalibrate_heights, b = get_heights(author_stats_grid, hc, x_sub_set)
-
-heights[0] = np.max(recalibrate_heights)
-heights[2] = recalibrate_heights[2]
-
-
-# In[11]:
-
-
-np.max(height_content)
-heights
-
-
-# In[12]:
-
-
-bmark_heights
-print(len(bmark_heights))
-print(len(bin_centers))
-print(len(bmark_stats_items))
-
-
-# In[13]:
-
-
-print(len(bmark_stats_items_grid))
-
-
-# In[14]:
-
-
-ar = np.array(ar)
-
-
-# In[15]:
-
-
-from pylab import *
-
-
-# In[16]:
-
-
-bmark_stats_items
-
-
-# In[18]:
-
-
-x1, y1, z1 = (mwp_distance[0], mwp_height[0], str('mean wikipedia'))
-x1
-
-
-# In[19]:
-
-
-xinterval1 = copy.copy(bmark_stats_items_grid)
-print(len(bmark_heights))
-print(len(bmark_stats_items_grid))
-
-benchmarks = pd.DataFrame({
-    'benchmarks': bmark_stats_items_grid,
-    'CDF': bmark_heights
-})
-
-
-# In[20]:
-
-
-get_ipython().run_line_magic('matplotlib', 'inline')
-
-# numpy, pandas, seaborn, and pyplot are already imported above
-fig, axes = plt.subplots(figsize=(10, 10), nrows=2, ncols=1, sharex=True, dpi=100)
-
-
-g = sns.distplot(standard_sci, label="Readability Index")
-
-
-if 'data0' not in locals():
-    data0 = pd.DataFrame({
-        'mean, min, maximum': author_stats_grid,
-        'CDF': heights
-    })
-
-    data2 = pd.DataFrame({
-        'Standard Reading Level': mean_a_grid,
-        'CDF': np.max(height_content)
-    })
-
-
-
-
-legend_properties = {'weight': 'bold', 'size': 8}
-ax = sns.regplot(data=benchmarks, x="benchmarks", y="CDF", fit_reg=False, marker="o", color="green")
-ax = sns.regplot(data=data2, x="Standard Reading Level", y="CDF", fit_reg=False, marker="o", color="red")
-# each ax.legend() call replaces the previous legend, so only the last one is drawn
-legendMain = ax.legend(labels=[str("std deviation")], prop=legend_properties, loc='upper right')
-legendSide0 = ax.legend(labels=[NAME], prop=legend_properties, loc='center right')
-legendSide1 = ax.legend(labels=[str('Number of Documents: ' + str(len(ar)))], prop=legend_properties, loc='upper left')
-legendMain = ax.legend(labels=[str("Google Scholar author relative to ART Corpus distribution. Total docs: ") + str(len(trainingDats))], prop=legend_properties, loc='upper left')
-
-print(categories)
-x, y, z = (worst_distance[0], worst_height, other_name)
-data3 = pd.DataFrame({
-    'Standard Reading Level': [x1],
-    'CDF': [y1]
-})
-ax = sns.regplot(data=data3, x='Standard Reading Level', y="CDF", fit_reg=False, marker="o", color="green")
-
-
-# regplot draws on the current axes, which is axes[1] (the most recently created subplot)
-ax = sns.regplot(data=benchmarks, x="benchmarks", y="CDF", fit_reg=False, marker="o", color="green")
-
-ax2 = plt.twiny()
-xticks = list(range(0, 45, 5))
-ax2.set_xticks(xticks)
-
-axes[1].set_xticks(xinterval1)
-axes[1].set_xticklabels(categories, minor=False, rotation=90)
-
-axes[1].axvline(np.mean(standard_sci), color='red', alpha=.7, linewidth=1.5)
-axes[1].set_ylabel('Probability of Document Reading Level')
-axes[1].set_xlabel('Reading Grade Level')
-
-bp_dict = axes[0].boxplot(box_content, 0, 'gD', vert=False)
-
-for line in bp_dict['medians']:
-    x, y = line.get_xydata()[1]  # top of the median line (currently unused)
-
-for line in bp_dict['boxes']:
-    x0, y = line.get_xydata()[0]  # bottom-left corner of the box (Q1)
-    axes[0].text(x0, y, str(NAME) + ' Q1 ', horizontalalignment='center', verticalalignment='top', rotation=90)
-
-    x1, y = line.get_xydata()[3]  # bottom-right corner of the box (Q3)
-    axes[0].text(x1, y, str(NAME) + ' Q3 ', horizontalalignment='center', verticalalignment='top', rotation=90)
-
-    # label the box midpoint as an approximate mean marker
-    axes[0].text((x0 + x1) / 2.0, y, str(NAME) + r' $\mu$ ', horizontalalignment='center', verticalalignment='top', rotation=90)
-axes[0].axvline(np.mean(standard_sci), color='red', alpha=.7, linewidth=1.5)
-
-
-# In[21]:
-
-
-categories
-
-
-# In[22]:
-
-
-fig, axes = plt.subplots(figsize=(10, 10), nrows=1, ncols=1, sharex=True, dpi=100)
-
-sns.violinplot(data=box_content, palette="Set2", split=True,
-               scale="count", inner="stick", orient="h")
-
-fig, axes = plt.subplots(figsize=(10, 10), nrows=1, ncols=1, sharex=True, dpi=100)
-
-sns.violinplot(data=standard_sci, palette="Set2", split=True,
-               scale="count", inner="stick", orient="h")
-
-
-# In[23]:
-
-
-bmark_stats_items_grid
-
-
-- 
GitLab