From a148ad12a95d3c47156f427c6efeddddc9690c16 Mon Sep 17 00:00:00 2001
From: Russell Jarvis <colouredstatic@gmail.com>
Date: Sat, 18 Apr 2020 11:37:25 +1000
Subject: [PATCH] installable without Docker

---
 crawl.py                               | 173 +++++++++++
 enter_author_name.py                   |  30 ++
 install.sh                             |  55 ++++
 online_app_backend.py                  | 185 ++++++++++++
 plotting_author_versus_distribution.py | 215 ++++++++++++++
 scrape.py                              | 389 +++++++++++++++++++++++++
 t_analysis.py                          | 244 ++++++++++++++++
 utils.py                               | 296 +++++++++++++++++++
 8 files changed, 1587 insertions(+)
 create mode 100644 crawl.py
 create mode 100644 enter_author_name.py
 create mode 100644 install.sh
 create mode 100644 online_app_backend.py
 create mode 100644 plotting_author_versus_distribution.py
 create mode 100644 scrape.py
 create mode 100644 t_analysis.py
 create mode 100644 utils.py

diff --git a/crawl.py b/crawl.py
new file mode 100644
index 0000000..58077a0
--- /dev/null
+++ b/crawl.py
@@ -0,0 +1,173 @@
+
+## A lot of this code is informed by this multi-threading of web-grabbing example:
+# https://github.com/NikolaiT/GoogleScraper/blob/master/Examples/image_search.py
+# Probably the parallel architecture sucks, probably dask.bag mapping would be more readable and efficient.
+##
+#import threading,requests, os, urllib
+from bs4 import BeautifulSoup
+from natsort import natsorted, ns
+import glob
+import requests
+import os
+
+import selenium
+#from pyvirtualdisplay import Display
+from selenium import webdriver
+
+
+#display = Display(visible=0, size=(1024, 800))
+#display.start()
+
+
+chrome_options = webdriver.ChromeOptions()
+chrome_options.add_argument('--no-sandbox')
+chrome_options.add_argument('--headless')
+chrome_options.add_argument('--disable-gpu')
+#driver = webdriver.Chrome(chrome_options=chrome_options)
+driver = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver',chrome_options=chrome_options)
+
+driver.implicitly_wait(20)
+
+from selenium.common.exceptions import NoSuchElementException
+
+
+import pandas as pd
+import pycld2 as cld2
+
+
+import pdfminer
+from pdfminer.pdfparser import PDFParser
+from pdfminer.pdfdocument import PDFDocument
+from pdfminer.pdfpage import PDFPage
+from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
+from pdfminer.pdfdevice import PDFDevice
+from pdfminer.layout import LAParams
+from pdfminer.converter import  TextConverter
+
+import re
+import numpy as np
+
+
+from bs4 import BeautifulSoup
+import bs4 as bs
+import urllib.request
+
+from delver import Crawler
+C = Crawler()
+CWD = os.getcwd()
+
+from io import StringIO
+import io
+
+
+rsrcmgr = PDFResourceManager()
+retstr = StringIO()
+laparams = LAParams()
+codec = 'utf-8'
+device = TextConverter(rsrcmgr, retstr, laparams = laparams)
+interpreter = PDFPageInterpreter(rsrcmgr, device)
+
+
+
+def convert_pdf_to_txt(content):
+    try:
+        pdf = io.BytesIO(content.content)
+    except:
+        pdf = io.BytesIO(content)
+    parser = PDFParser(pdf)
+    document = PDFDocument(parser, password='')  # an empty-string password avoids the pdfminer failure seen with password=None
+    write_text = ''
+    for page in PDFPage.create_pages(document):
+        interpreter.process_page(page)
+        write_text +=  retstr.getvalue()
+        #write_text = write_text.join(retstr.getvalue())
+    # Process all pages in the document
+    text = str(write_text)
+    return text
+
+def html_to_txt(content):
+    soup = BeautifulSoup(content, 'html.parser')
+    #strip HTML
+    for script in soup(["script", "style"]):
+        script.extract()    # rip it out
+    text = soup.get_text()
+    #organize text
+    lines = (line.strip() for line in text.splitlines())  # break into lines and remove leading and trailing space on each
+    chunks = (phrase.strip() for line in lines for phrase in line.split("  ")) # break multi-headlines into a line each
+    text = '\n'.join(chunk for chunk in chunks if chunk) # drop blank lines
+    str_text = str(text)
+    return str_text
+
+def print_best_text(fileName):
+    # mirrors denver_to_text: pick the HTML or PDF extractor based on the file name
+    file = open(fileName)
+    if str('.html') in fileName:
+        text = html_to_txt(file)
+    else:
+        text = convert_pdf_to_txt(file)
+    return text
+
+def denver_to_text(url):
+    fileName = C.download(local_path=CWD, url=url, name='temp_file')
+    file = open(fileName)
+    if str('.html') in fileName:
+        text = html_to_txt(file)
+    else:
+        text = convert_pdf_to_txt(file)
+    file.close()
+    return text
+
+def collect_hosted_files(url):
+    '''
+    Used for scholar
+    '''
+    print(url)
+    try:
+        crude_html = denver_to_text(url)
+    except:
+        driver.get(url)
+        crude_html = driver.page_source
+    #soup0 = BeautifulSoup(crude_html, 'html.parser')
+    soup = BeautifulSoup(crude_html, 'lxml')
+    links = []
+    print(soup)
+    for link in soup.findAll('a'):
+        check_out = link.get('href')
+        links.append(check_out)
+        #print(link)
+    for link in soup.findAll('a', attrs={'href': re.compile("https://")}):
+        check_out = link.get('href')
+        #if '/citations?' in check_out:
+        links.append(check_out)
+    for link in soup.findAll('a', attrs={'href': re.compile("http://")}):
+        check_out = link.get('href')
+        #if '/citations?' in check_out:
+        links.append(check_out)
+
+    return links
+def collect_pubs(url):
+    '''
+    Used for scholar
+    '''
+    driver = webdriver.Firefox()
+    driver.get(url)
+    crude_html = driver.page_source
+    """
+    #print(url)
+    try:
+        crude_html = denver_to_text(url)
+    except:
+        if type(url) is type(str()):
+            print(url)
+            driver.get(url)
+        else:
+            return None
+        
+        print(crude_html)
+    """
+    soup = BeautifulSoup(crude_html, 'html.parser')
+    links = []
+    for link in soup.findAll('a', attrs={'href': re.compile("https://")}):
+        check_out = link.get('href')
+        #if '/citations?' in check_out:
+        links.append(check_out)
+    for link in soup.findAll('a', attrs={'href': re.compile("http://")}):
+        check_out = link.get('href')
+        #if '/citations?' in check_out:
+        links.append(check_out)
+
+    return links
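+
+# Hedged usage sketch (not called anywhere in this patch): collect_pubs gathers outbound
+# links from a Google Scholar results page; the query below is only a placeholder.
+def _example_collect_pubs(query='S S Phatak'):
+    url = str('https://scholar.google.com/scholar?hl=en&as_sdt=0%2C3&q=') + str(query)
+    return collect_pubs(url)[0:5]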
diff --git a/enter_author_name.py b/enter_author_name.py
new file mode 100644
index 0000000..26512ec
--- /dev/null
+++ b/enter_author_name.py
@@ -0,0 +1,30 @@
+#SComplexity.t_analysis
+from SComplexity import online_app_backend
+import argparse
+'''
+parser = argparse.ArgumentParser(description='Process some authors.')
+
+parser.add_argument("-H", "--Help", help = "Example: Help argument", required = False, default = "")
+
+parser.add_argument('author','--a',metavar='N', type=str, nargs='+',required=True,help='authors first name')
+parser.add_argument('t','--t', metavar='N', type=str, nargs='+',help='boolean to select a tournament between authors')
+
+parser.add_argument('a2', metavar='N', type=str, nargs='+',help='second author',required=False)
+parser.add_argument('v','--verbose', help='Print more data',action='store_true',required=False)
+parser.add_argument('an','--anonymize', help='Anonymize the losing author or both authors in the competition plot',action='store_true',default="True",required=False)
+args = parser.parse_args()
+NAME = args.author
+TOUR = args.t
+author2 = args.a2
+verbose = args.v
+anon = args.an
+print(NAME)
+'''
+TOUR = False
+if TOUR:
+    # The tournament path relies on the argparse block above being uncommented, so that
+    # NAME, author2 and verbose are defined; call_from_front_end takes no anonymize flag,
+    # so that option is not forwarded here.
+    online_app_backend.call_from_front_end(NAME,NAME1=author2,tour=TOUR,verbose=verbose)
+else:
+    NAME = "S S Phatak"
+    verbose = False
+    online_app_backend.call_from_front_end(NAME,verbose=verbose)
diff --git a/install.sh b/install.sh
new file mode 100644
index 0000000..4ce6fee
--- /dev/null
+++ b/install.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+# https://gist.github.com/mikesmullin/2636776
+#
+# download and install latest geckodriver for linux or mac.
+# required for selenium to drive a firefox browser.
+
+install_dir="/usr/local/bin"
+json=$(curl -s https://api.github.com/repos/mozilla/geckodriver/releases/latest)
+if [[ $(uname) == "Darwin" ]]; then
+    url=$(echo "$json" | jq -r '.assets[].browser_download_url | select(contains("macos"))')
+elif [[ $(uname) == "Linux" ]]; then
+    url=$(echo "$json" | jq -r '.assets[].browser_download_url | select(contains("linux64"))')
+else
+    echo "can't determine OS"
+    exit 1
+fi
+curl -s -L "$url" | tar -xz
+chmod +x geckodriver
+sudo mv geckodriver "$install_dir"
+echo "installed geckodriver binary in $install_dir"
+
+
+# download and install latest chromedriver for linux or mac.
+# required for selenium to drive a Chrome browser.
+
+install_dir="/usr/local/bin"
+version=$(curl -s -L https://chromedriver.storage.googleapis.com/LATEST_RELEASE)
+if [[ $(uname) == "Darwin" ]]; then
+    url=https://chromedriver.storage.googleapis.com/$version/chromedriver_mac64.zip
+elif [[ $(uname) == "Linux" ]]; then
+    url=https://chromedriver.storage.googleapis.com/$version/chromedriver_linux64.zip
+else
+    echo "can't determine OS"
+    exit 1
+fi
+# chromedriver ships as a zip archive, so unzip it rather than piping it through tar.
+curl -s -L -o chromedriver.zip "$url"
+unzip -o -q chromedriver.zip chromedriver
+rm -f chromedriver.zip
+chmod +x chromedriver
+sudo mv chromedriver "$install_dir"
+echo "installed chromedriver binary in $install_dir"
+sudo pip install PyPDF2
+sudo pip install pycld2
+sudo pip install nltk
+sudo pip install selenium
+sudo pip install delver
+sudo pip install pdfminer
+sudo pip install pyvirtualdisplay
+sudo pip install textstat
+sudo pip install "fsspec>=0.3.3"
+sudo pip install textblob
+sudo pip install twython
+python3 -c "import nltk;nltk.download('punkt')"
+python3 -c "import nltk;nltk.download('stopwords')"
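+
+# Hedged sanity check (optional): confirm that both drivers ended up on the PATH.
+geckodriver --version || echo "geckodriver not found on PATH"
+chromedriver --version || echo "chromedriver not found on PATH"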
diff --git a/online_app_backend.py b/online_app_backend.py
new file mode 100644
index 0000000..ac78425
--- /dev/null
+++ b/online_app_backend.py
@@ -0,0 +1,185 @@
+import copy
+import matplotlib.pyplot as plt
+import seaborn as sns
+#plt.backend("")
+import os.path
+import pdb
+import pickle
+from collections import OrderedDict
+
+import IPython.display as d
+import numpy as np
+import pandas as pd
+from bs4 import BeautifulSoup
+
+from crawl import collect_pubs#, collect_hosted_files
+from get_bmark_corpus import process
+from t_analysis import text_proc, perplexity, unigram_zipf
+# Put these results in a data frame, then in Markdown, using RGerkin's code.
+# https://gist.github.com/rgerkin/af5b27a0e30531c30f2bf628aa41a553
+# !pip install --user tabulate # Install the tabulate package
+from tabulate import tabulate
+
+
+
+def metricss(rg):
+    if isinstance(rg,list):
+        pub_count = len(rg)
+        mean_standard = np.mean([ r['standard'] for r in rg if 'standard' in r.keys()])
+        return mean_standard
+    else:
+        return None
+def metricsp(rg):
+    if isinstance(rg,list):
+        pub_count = len(rg)
+        # the perplexity mean supersedes the earlier penalty mean; only the value returned below is kept.
+        penalty = np.mean([ r['perplexity'] for r in rg if 'perplexity' in r.keys() ])
+
+        return penalty
+    else:
+        return None
+
+def filter_empty(the_list):
+    the_list = [ tl for tl in the_list if tl is not None ]
+    the_list = [ tl for tl in the_list if type(tl) is not type(str('')) ]
+
+    return [ tl for tl in the_list if 'standard' in tl.keys() ]
+
+
+def take_url_from_gui(author_link_scholar_link_list):
+    '''
+    inputs a URL that's full of publication orientated links, preferably the
+    authors scholar page.
+    '''
+    author_results = []
+    follow_links = collect_pubs(author_link_scholar_link_list)[0:5]
+    for r in follow_links:
+        try:
+            urlDat = process(r)
+        except:
+            follow_more_links = collect_pubs(r)
+            for r_more in follow_more_links:
+                urlDat = process(r_more)
+        print(urlDat)
+
+        if not isinstance(urlDat,type(None)):
+            author_results.append(urlDat)
+
+        # print(author_results[-1])
+        #with open('new.p','wb') as f:
+        #    pickle.dump(author_results,f)
+    return author_results
+
+def unigram_model(author_results):
+    '''
+    takes author results.
+    '''
+    terms = []
+    for k,v in author_results.items():
+        try:
+            #author_results_r[k] = list(s for s in v.values()  )
+            author_results[k]['files'] = list(s for s in v.values()  )
+
+            words = [ ws['tokens'] for ws in author_results[k]['files'] if ws is not None ]
+            author_results[k]['words'] = words
+            terms.extend(words)# if isinstance(terms,dict) ]
+        except:
+            print(terms[-1])
+    # `unigram` is not defined anywhere in this patch; unigram_zipf (imported above)
+    # builds the same word-frequency model, so it is used here instead.
+    big_model = unigram_zipf(terms)
+    with open('author_results_processed.p','wb') as file:
+        pickle.dump(author_results,file)
+    with open('big_model_science.p','wb') as file:
+        pickle.dump(list(big_model),file)
+
+    return big_model
+
+def info_models(author_results):
+    big_model = unigram_model(author_results)
+    compete_results = {}
+    for k,v in author_results.items():
+        per_doc = []  # was misspelled per_dpc, which made the append below fail
+        try:
+            for doc in author_results[k]['words']:
+                per_doc.append(perplexity(doc, big_model))
+        except:
+            pass
+        compete_results[k] = np.mean(per_doc)
+        author_results[k]['perplexity'] = compete_results[k]
+    return author_results, compete_results
+
+
+
+def update_web_form(url):
+    print(url)
+    #data = author_results = {}
+    author_results = take_url_from_gui(url)
+    ar =  copy.copy(author_results)
+    #data[name] = author_results
+    #for k,v in author_results.items():
+    datax = filter_empty(ar)
+    datay = metricss(ar)
+    print(datay)
+    df = pd.DataFrame(datax)
+    print(df)
+    return df, datay, author_results
+# Optionally give the dataframe's index a name
+#df.index.name = "my_index"
+# Create the markdown string
+
+def enter_name_here(scholar_page, name):
+    df, datay, author_results = update_web_form(scholar_page)
+    #author_results
+    '''
+    md = tabulate(df, headers='keys', tablefmt='pipe')
+    # Fix the markdown string; it will not render with an empty first table cell,
+    # so if the dataframe's index has no name, just place an 'x' there.
+    md = md.replace('|    |','| %s |' % (df.index.name if df.index.name else 'x'))
+    # Create the Markdown object
+    result = d.Markdown(md)
+    '''
+    return df, datay, author_results
+
+def find_nearest(array, value):
+    array = np.asarray(array)
+    idx = (np.abs(array - value)).argmin()
+    return idx
+
+def ar_manipulation(ar):
+    ar = [ tl for tl in ar if tl is not None ]
+    ar = [ tl for tl in ar if type(tl) is not type(str('')) ]
+    ar = [ tl for tl in ar if 'standard' in tl.keys() ]
+
+    #with open(str('more_authors_results.p'),'wb') as f:
+    #    pickle.dump([NAME,ar],f)
+
+    with open('traingDats.p','rb') as f:
+        trainingDats = pickle.load(f)
+        
+    trainingDats.extend(ar)
+    return (ar, trainingDats)
+
+def call_from_front_end(NAME,tour=None,NAME1=None,verbose=False):
+    if type(tour) is type(None):
+        scholar_link=str('https://scholar.google.com/scholar?hl=en&as_sdt=0%2C3&q=')+str(NAME)
+        df, datay, ar  = enter_name_here(scholar_link,NAME)
+        
+        with open('_author_specific'+str(NAME)+'.p','wb') as f: pickle.dump([NAME,ar,df,datay,scholar_link],f)
+
+        
+        (ar, trainingDats) = ar_manipulation(ar)
+        with open('traingDats.p','wb') as f:
+            pickle.dump(trainingDats,f)
+        import plotting_author_versus_distribution
+        return ar
+
+    else:
+        scholar_link=str('https://scholar.google.com/scholar?hl=en&as_sdt=0%2C3&q=')+str(NAME)
+        df, datay, ar  = enter_name_here(scholar_link,NAME)
+        (ar0, trainingDats) = ar_manipulation(ar)
+        scholar_link=str('https://scholar.google.com/scholar?hl=en&as_sdt=0%2C3&q=')+str(NAME1)
+        df, datay, ar  = enter_name_here(scholar_link,NAME1)
+        (ar1, trainingDats) = ar_manipulation(ar)
+        import plotting_author_versus_distribution
+        return [ar0,ar1]
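+
+# Hedged usage sketch (network-dependent, not called anywhere in this patch): fetch and
+# score one author's scholar results; the name is the placeholder used in enter_author_name.py.
+def _example_call_from_front_end(name='S S Phatak'):
+    return call_from_front_end(name, verbose=False)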
diff --git a/plotting_author_versus_distribution.py b/plotting_author_versus_distribution.py
new file mode 100644
index 0000000..c758cd6
--- /dev/null
+++ b/plotting_author_versus_distribution.py
@@ -0,0 +1,215 @@
+import pickle
+import copy
+import matplotlib as mpl
+import numpy as np
+import pandas as pd
+mpl.use("Agg")
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+
+def find_nearest(array, value):
+    array = np.asarray(array)
+    idx = (np.abs(array - value)).argmin()
+    return idx
+
+bmark = pickle.load(open('benchmarks.p','rb'))
+NAME,ar = pickle.load(open('more_authors_results.p','rb'))
+#print(ar)
+print(bmark)
+#import pdb; pdb.set_trace()  # breakpoint left disabled so the script can run unattended
+import os
+
+if os.path.exists('traingDats.p'):
+    with open('traingDats.p','rb') as f:
+        trainingDats = pickle.load(f)
+else:
+    from ..Examples import use_training
+    with open('traingDats.p','rb') as f:
+        trainingDats = pickle.load(f)
+
+def other_files():
+    NAME = str('J. Bryan Henderson')
+
+    brian = pickle.load(open('ben_results.p','rb'))
+    ar1 = brian.T.to_dict().values()
+
+    arp = pickle.load(open('author_results_processed.p','rb'))
+    arp.pop('rgerkin',None)
+
+    #import pdb; pdb.set_trace()
+    for NAME in arp.keys():
+        ar = []
+        for i in range(0,len(arp[NAME].values())):
+            ar.append(list(arp[NAME].values())[i])
+        trainingDats.extend(ar)
+
+standard_sci = [ t['standard'] for t in trainingDats ]
+ar = [ t for t in ar if type(t) is type({})]
+ar = [ t for t in ar if 'standard' in t.keys()]
+#print(ar)
+xys = [ (h.get_x(),h.get_height()) for h in sns.distplot(standard_sci).patches ]
+# this plot not used yet.
+
+fig = plt.figure()
+ax1 = fig.add_subplot(111)
+#print(ar)
+mean_ = np.mean([a['standard'] for a in ar])
+min_ = np.min([a['standard'] for a in ar])
+max_ = np.max([a['standard'] for a in ar])
+std_ = np.std([a['standard'] for a in ar])
+stats_items = [mean_,min_,max_]
+
+#import pdb
+#pdb.set_trace()
+g = sns.distplot(standard_sci, label="Readability Index")
+
+
+histogram_content = [x[0] for x in xys]
+height_content = np.array([x[1] for x in xys])
+
+hc = np.array(histogram_content)
+
+# code for plotting std deviation.
+sub_set = np.where((hc>=mean_-std_) & (hc<=mean_+std_))
+x_sub_set = hc[sub_set]
+
+std_plot_ind = height_content[sub_set]
+sub_set = sub_set[0].tolist()
+
+assert len(sub_set) < len(histogram_content)
+
+#vertical_postions = map()
+def get_heights(stats_items,histogram_content,x_sub_set):
+    vertical_positions_indexes = []
+    for i in stats_items:
+        vertical_positions_indexes.append(find_nearest(histogram_content, i))
+    bin_width_offset = (xys[1][0] - xys[0][0])/2.0
+    x_sub_set = [ i+bin_width_offset for i in x_sub_set ]
+
+
+    heights = []
+    for i in vertical_positions_indexes:
+        heights.append(xys[i][1])
+    return heights, bin_width_offset
+
+bmark_stats_items = [ b['standard'] for b in bmark ]
+categories = [ "upgoer 5", "Readability Declining Over Time","Science of Writing","Post Modern Essay Generator","G Nicholas"]
+#categories = [b['link'] for b in bmark]
+bmark_heights, bwo = get_heights(bmark_stats_items,histogram_content,x_sub_set)
+heights, bwo = get_heights(stats_items,histogram_content,x_sub_set)
+bmark_stats_items = [i+bwo for i in bmark_stats_items]
+sub_set = [i+bwo for i in sub_set]
+print(heights)
+mean_a = mean_# + bin_width_offset
+min_a = min_ #+ bin_width_offset
+max_a = max_ #+ bin_width_offset
+#std_a = mean_ + bin_width_offset
+index = find_nearest(histogram_content, mean_)
+#mean_link = histogram_content[index]['link']
+index = find_nearest(histogram_content, min_)
+#min_link = histogram_content[index]['link']
+index = find_nearest(histogram_content, max_)
+#max_link = histogram_content[index]['link']
+
+#print(bmarks)
+#import pdb; pdb.set_trace()
+
+#bmark_stats_items = [ b['standard'] for b in bmark ]
+#bmark_heights = get_heights(bmark_stats_items,histogram_content,x_sub_set)
+
+
+benchmarks = pd.DataFrame({
+'benchmarks': bmark_stats_items,
+    'CDF': bmark_heights
+    })
+
+author_stats =[i+bwo for i in [mean_,min_,max_]]
+data0 = pd.DataFrame({
+'mean, min, maximum': author_stats,
+    'CDF': heights
+    })
+
+
+data2 = pd.DataFrame({
+'Standard Reading Level': [mean_a+bwo],
+    'CDF': [heights[0]]
+    })
+
+
+data1 = pd.DataFrame({
+'Standard Reading Level': x_sub_set,
+    'CDF': std_plot_ind
+    })
+
+legend_properties = {'weight':'bold','size':8}
+
+ax = sns.regplot(data=benchmarks, x="benchmarks", y="CDF", fit_reg=False, marker="o", color="green")
+
+
+#bbox_props = dict(boxstyle="rarrow", fc=(0.8,0.9,0.9), ec="b", lw=2)
+
+#t = ax.text(0, 0, "Direction", ha="center", va="center", rotation=90,
+#            size=15,
+#            bbox=bbox_props)
+#import pdb; pdb.set_trace()
+#bmark_heights.reverse()
+for i in bmark_heights:
+    print(i)
+#import pdb
+#pdb.set_trace()
+cnt=0
+for i,j,k in zip(bmark_stats_items[0:-1],bmark_heights[0:-1],categories):
+    print(i,j,k)
+    if cnt==1:
+        j=j+0.02
+        ax.text(i+bwo,j,k, rotation=0)
+
+    else:
+        #j=j+0.1
+        print(j)
+        ax.text(i,j,k, rotation=0)
+    cnt +=1
+cnt = 0
+for i,j,k in zip(author_stats,heights,[str(NAME)+' mean',str(NAME)+' min',str(NAME)+' max']):
+    print(i,j,k)
+    #if cnt==3:
+    #    ax.text(i+bwo,j,k)
+
+    #else:
+    #j=j+0.15
+    print(j)
+    ax.text(i,j,k, rotation=0)
+    cnt +=1
+
+ax = sns.regplot(data=data0, x="mean, min, maximum", y="CDF", fit_reg=False, marker="o", color="blue")
+#ax = sns.regplot(x='Standard Reading Level', y='CDF',data=data1, fit_reg=False, marker="o", color="green")#, data=fmri)
+ax = sns.regplot(data=data2, x="Standard Reading Level", y="CDF", fit_reg=False, marker="o", color="red")
+
+legendMain=ax.legend(labels=[str("std deviation")], prop=legend_properties,loc='upper right')
+
+legendSide0=ax.legend(labels=[NAME],prop=legend_properties,loc='center right')
+legendSide1=ax.legend(labels=[str('Number of Documents: '+str(len(ar)))],prop=legend_properties,loc='upper left')
+#print(len(ar))
+
+
+legendMain=ax.legend(labels=[str("ART Corpus+ other scholar authors")], prop=legend_properties,loc='upper right')
+ax.add_artist(legendMain)
+ax.add_artist(legendSide0)
+ax.add_artist(legendSide1)
+#g=sns.clustermap(corrmat, vmax=.8, square=True)
+print(ax.get_xticklabels())
+rotation = 90
+#for i, ax in enumerate(legendMain.fig.axes):   ## getting all axes of the fig object
+#     ax.set_xticklabels([a['link'] for a in ar], rotation = rotation)
+
+
+#g.fig.show()
+
+
+locs, labels = plt.xticks()
+plt.setp(labels, rotation=45)
+#print(NAME)
+#import pdb; pdb.set_trace()
+plt.savefig(str(NAME)+'_author_readability.png')
+plt.show()
diff --git a/scrape.py b/scrape.py
new file mode 100644
index 0000000..1a3cd76
--- /dev/null
+++ b/scrape.py
@@ -0,0 +1,389 @@
+# Scientific readability project
+# authors: other authors,
+# ...,
+# Russell Jarvis
+# https://github.com/russelljjarvis/
+# rjjarvis@asu.edu
+
+# Patrick McGurrin
+# patrick.mcgurrin@gmail.com
+from numpy import random
+import os
+from bs4 import BeautifulSoup
+import pickle
+import _pickle as cPickle #Using cPickle will result in performance gains
+#from GoogleScraper import scrape_with_config, GoogleSearchError
+import dask.bag as db
+
+import pdfminer
+from pdfminer.pdfparser import PDFParser
+from pdfminer.pdfdocument import PDFDocument
+from pdfminer.pdfpage import PDFPage
+from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
+from pdfminer.pdfdevice import PDFDevice
+from pdfminer.layout import LAParams
+from pdfminer.converter import  TextConverter
+
+from crawl import convert_pdf_to_txt
+from crawl import print_best_text
+from crawl import collect_pubs
+import scholar_scrape.scholar as scholar
+#scholar = scholar_scrape.scholar
+
+from delver import Crawler
+C = Crawler()
+import requests
+
+
+import io
+
+import selenium
+
+from selenium import webdriver
+#from pyvirtualdisplay import Display
+
+#display = Display(visible=0, size=(1024, 768))
+#display.start()
+
+
+from selenium.webdriver.firefox.options import Options
+
+import re
+from bs4 import BeautifulSoup
+import bs4 as bs
+import urllib.request
+from io import StringIO
+import io
+
+
+#options = Options()
+#options.headless = True
+chrome_options = webdriver.ChromeOptions()
+chrome_options.add_argument('--no-sandbox')
+chrome_options.add_argument('--headless')
+chrome_options.add_argument('--disable-gpu')
+driver = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver',chrome_options=chrome_options)
+#driver = webdriver.Chrome(chrome_options=chrome_options)
+driver.implicitly_wait(10)
+from selenium.common.exceptions import NoSuchElementException
+
+
+
+rsrcmgr = PDFResourceManager()
+retstr = StringIO()
+laparams = LAParams()
+codec = 'utf-8'
+device = TextConverter(rsrcmgr, retstr, laparams = laparams)
+interpreter = PDFPageInterpreter(rsrcmgr, device)
+
+
+
+
+#from pyPdf import PdfFileReader
+
+#from StringIO import StringIO
+
+from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
+from pdfminer.converter import TextConverter
+from pdfminer.layout import LAParams
+from pdfminer.pdfpage import PDFPage
+import os
+import sys, getopt
+from io import StringIO
+
+#converts pdf, returns its text content as a string
+def pdf_to_txt_(infile):#, pages=None):
+    #if not pages:
+    pagenums = set()  # an empty set makes PDFPage.get_pages yield every page
+
+    output = StringIO()
+    manager = PDFResourceManager()
+    converter = TextConverter(manager, output, laparams=LAParams())
+    interpreter = PDFPageInterpreter(manager, converter)
+
+    #infile = file(fname, 'rb')
+    for page in PDFPage.get_pages(infile, pagenums):
+        interpreter.process_page(page)
+    infile.close()
+    converter.close()
+    text = output.getvalue()
+    output.close()
+    return text
+
+
+import PyPDF2
+from PyPDF2 import PdfFileReader
+
+#mport textract
+
+#from nltk.tokenize import word_tokenize
+#from nltk.corpus import stopwords
+
+def pdf_to_txt(content):
+    # `content` is either a requests/delver response object or raw PDF bytes,
+    # mirroring convert_pdf_to_txt in crawl.py (the caller passes page content, not a URL).
+    if str(content) == str('<Response [404]>'):
+        return None
+    else:
+        # from
+        # https://medium.com/@rqaiserr/how-to-convert-pdfs-into-searchable-key-words-with-python-85aab86c544f
+        try:
+            pdfFileObj = io.BytesIO(content.content)
+        except AttributeError:
+            pdfFileObj = io.BytesIO(content)
+        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
+
+        num_pages = pdfReader.numPages
+        count = 0
+        text = ""
+        while count < num_pages:
+            pageObj = pdfReader.getPage(count)
+            count +=1
+            text += pageObj.extractText()
+        if text == "":
+            # OCR fallback from the original recipe; it needs the (commented-out) textract
+            # import and a local file path, so it is left disabled here.
+            # text = textract.process(fileurl, method='tesseract', language='eng')
+            pass
+    return text
+    '''
+    parser = PDFParser(pdf)
+    document = PDFDocument(parser, password=None)
+    write_text = ''
+    for page in PDFPage.create_pages(document):
+        interpreter.process_page(page)
+        write_text = write_text.join(retstr.getvalue())
+
+    text = str(write_text)
+    '''
+
+def html_to_txt(content):
+    soup = BeautifulSoup(content, 'html.parser')
+    #strip HTML
+
+    for script in soup(["script", "style"]):
+        script.extract()    # rip it out
+    text = soup.get_text()
+    wt = str(text)  # untouched copy of the raw text; avoids needing the copy module here
+    #organize text
+    lines = (line.strip() for line in text.splitlines())  # break into lines and remove leading and trailing space on each
+    chunks = (phrase.strip() for line in lines for phrase in line.split("  ")) # break multi-headlines into a line each
+    text = '\n'.join(chunk for chunk in chunks if chunk) # drop blank lines
+    str_text = str(text)
+
+    return str_text
+
+def convert(content,link):
+    # This is really ugly, but it's proven to be both fault tolerant and effective.
+    try:
+        if str('.html') in link:
+            text = html_to_txt(content)
+            print(text)
+
+        elif str('.pdf') in link:
+            text = pdf_to_txt(content)
+        else:
+            try:
+                text = html_to_txt(content)
+                print(text)
+            except:
+                text = None
+    except:
+        text = None
+    return text
+
+
+def url_to_text(link_tuple):
+    se_b, page_rank, link, category, buff = link_tuple
+    if str('pdf') not in link:
+        if C.open(link) is not None:
+            content = C.open(link).content
+            buff = convert(content,link)
+        else:
+            print('problem')
+    else:
+        pdf_file = requests.get(link, stream=True)
+        f = io.BytesIO(pdf_file.content)
+        reader = PdfFileReader(f)
+        buff = reader.getPage(0).extractText().split('\n')
+
+
+    print(buff)
+    link_tuple = ( se_b, page_rank, link, category, buff )
+    return link_tuple
+
+#@jit
+def buffer_to_pickle(link_tuple):
+    se_b, page_rank, link, category, buff = link_tuple
+    link_tuple = se_b, page_rank, link, category, buff
+    fname = 'results_dir/{0}_{1}_{2}.p'.format(category,se_b,page_rank)
+    if buff is not None:  # type(buff) is never None, so the original check always passed
+        with open(fname,'wb') as f:
+            pickle.dump(link_tuple,f)
+    return
+
+def process(item):
+    text = url_to_text(item)
+    buffer_to_pickle(text)
+    return
+
+
+# this should not be hard coded, it should be set in the class init, but can't be bothered refactoring.
+NUM_LINKS = 10
+
+
+
+# this should be a class method with self and self.NUM_LINKS but can't be bothered refactoring.
+def wiki_get(get_links):
+    # wikipedia is robot friendly
+    # surfraw is fine.
+    se_,index,link,category,buff = get_links
+    url_of_links = str('https://en.wikipedia.org/w/index.php?search=')+str(category)
+    links = collect_pubs(url_of_links)
+    if len(links) > NUM_LINKS: links = links[0:NUM_LINKS]
+    [ process((se_,index,l,category,buff)) for index,l in enumerate(links) ]
+
+# this should be a class method with self and self.NUM_LINKS but can't be bothered refactoring.
+
+def scholar_pedia_get(get_links):
+    # wikipedia is robot friendly
+    # surfraw is fine.
+    se_,index,link,category,buff = get_links
+    url_of_links = str('http://www.scholarpedia.org/w/index.php?search=')+str(category)+str('&title=Special%3ASearch')
+    links = collect_pubs(url_of_links)
+    if len(links) > NUM_LINKS: links = links[0:NUM_LINKS]
+    [ process((se_,index,l,category,buff)) for index,l in enumerate(links) ]
+
+# this should be a class method with self and self.NUM_LINKS but can't be bothered refactoring.
+def search_scholar(get_links):
+    # from https://github.com/ckreibich/scholar.py/issues/80
+    se_,index,category,category,buff = get_links
+    querier = scholar.ScholarQuerier()
+    settings = scholar.ScholarSettings()
+    querier.apply_settings(settings)
+    query = scholar.SearchScholarQuery()
+
+    query.set_words(category)
+    querier.send_query(query)
+    links = [ a.attrs['url'][0] for a in querier.articles if a.attrs['url'][0] is not None ]
+    #links = query.get_url()
+    #print(links)
+    #if len(links) > NUM_LINKS: links = links[0:NUM_LINKS]
+
+    [ process((se_,index,l,category,buff)) for index,l in enumerate(links) ]
+
+def search_author(get_links):
+    # from https://github.com/ckreibich/scholar.py/issues/80
+    se_,index,category,category,buff = get_links
+    querier = scholar.ScholarQuerier()
+    settings = scholar.ScholarSettings()
+    querier.apply_settings(settings)
+    query = scholar.SearchScholarQuery()
+
+    query.set_words(category)
+    querier.send_query(query)
+    links = [ a.attrs['url'][0] for a in querier.articles if a.attrs['url'][0] is not None ]
+    #links = query.get_url()
+    #print(links)
+    #if len(links) > NUM_LINKS: links = links[0:NUM_LINKS]
+
+    [ process((se_,index,l,category,buff)) for index,l in enumerate(links) ]
+
+class SW(object):
+    def __init__(self,sengines,sterms,nlinks=10):
+        self.NUM_LINKS = nlinks
+        self.links = None
+        if not os.path.exists('results_dir'):
+            os.makedirs('results_dir')
+        self.iterable = [ (v,category) for category in sterms for v in sengines.values() ]
+        random.shuffle(self.iterable)
+
+    def slat_(self,config):
+        try:
+            if str('wiki') in config['search_engines']:
+                get_links = (str('wikipedia'),0,None,config['keyword'],None)
+                wiki_get(get_links)
+
+            elif str('info_wars') in config['search_engines']:
+                get_links = (str('info_wars'),0,None,config['keyword'],None)
+                info_wars_get(get_links)
+
+            elif str('scholar') in config['search_engines']:
+                get_links = (str('scholar'),0,None,config['keyword'],None)
+                search_scholar(get_links)
+
+            elif str('scholarpedia') in config['search_engines']:
+                get_links = (str('scholar'),0,None,config['keyword'],None)
+                scholar_pedia_get(get_links)
+
+            else:
+                search = scrape_with_config(config)
+                links = []
+                for serp in search.serps:
+                    print(serp)
+                    links.extend([link.link for link in serp.links])
+
+                # This code block jumps over gate two:
+                # the possibly private, or hosted, server acting as a gatekeeper.
+                if len(links) > self.NUM_LINKS: links = links[0:self.NUM_LINKS]
+                if len(links) > 0:
+                    print(links)
+                    buffer = None
+                    se_ = config['search_engines']
+                    category = config['keyword']
+                    get_links = ((se_,index,link,category,buffer) for index, link in enumerate(links) )
+                    for gl in get_links:
+                        process(gl)
+                    # map over the function in parallel since it's 2018
+                    #b = db.from_sequence(get_links,npartitions=8)
+                    #_ = list(b.map(process).compute())
+        except GoogleSearchError as e:
+            print(e)
+            return None
+        print('done scraping')
+
+    #@jit
+    def scrapelandtext(self,fi):
+        se_,category = fi
+        config = {}
+        #driver = rotate_profiles()
+        # This code block, jumps over gate one (the search engine as a gatekeeper)
+        # google scholar or wikipedia is not supported by google scraper
+        # duckduckgo bang expansion _cannot_ be used as to access engines that GS does not support
+        # without significant development. Redirection to the right search results does occur,
+        # but google scrape also has tools for reading links out of web pages, and it needs to know
+        # which brand of SE to expect in order to deal with idiosyncratic formatting.
+        # it's easier not to use bang expansion, for that reason.
+        # for example twitter etc
+
+        config['keyword'] = str(category)
+
+
+        config['search_engines'] = se_
+        #config['scrape_method'] = 'http'
+
+        config['scrape_method'] = 'selenium'
+        config['num_pages_for_keyword'] = 1
+        config['use_own_ip'] = True
+        config['sel_browser'] = 'chrome'
+        config['do_caching'] = False # bloat warning.
+
+        # Google scrape + selenium implements a lot of human centric browser masquerading tools.
+        # Search Engine: 'who are you?' code: 'I am an honest human centric browser, and certainly not a robot surfing in the nude'. Search Engine: 'good, here are some pages'.
+        # Time elapses and the reality is exposed, just like in 'The Emperor's New Clothes'.
+        # The file crawl.py contains methods for crawling the scraped links.
+        # For this reason, a subsequent action, c.download (crawl download), is necessary.
+
+        config['output_filename'] = '{0}_{1}.csv'.format(category,se_)
+
+        self.slat_(config)
+        return
+
+    def run(self):
+        # someone should write a unit test.
+        # one reason I have not is that I would want to use Travis CI, and scraping probably violates its policies.
+        # a unit test might begin like this:
+        # self.iterable.insert(0,(str("scholar"),str("arbitrary test")))
+        # self.iterable.insert(0,(str("wiki"),str("arbitrary test")))
+
+        _ = list(map(self.scrapelandtext,self.iterable))
+        return
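+
+# Hedged usage sketch (network-dependent, not called anywhere in this patch): drive the
+# scraper over two of the search terms from utils.search_params using the engines
+# defined in utils.engine_dict_list.
+def _example_run_scraper():
+    from utils import engine_dict_list
+    engines, _ = engine_dict_list()
+    sw = SW(engines, ['bacteriophage', 'neuromorphic hardware'], nlinks=5)
+    sw.run()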
diff --git a/t_analysis.py b/t_analysis.py
new file mode 100644
index 0000000..edf6a15
--- /dev/null
+++ b/t_analysis.py
@@ -0,0 +1,244 @@
+# Scientific readability project
+# authors ...,
+# Russell Jarvis
+# https://github.com/russelljjarvis/
+# rjjarvis@asu.edu
+# Patrick McGurrin
+# patrick.mcgurrin@gmail.com
+
+
+import base64
+import copy
+import math
+import os
+import pickle
+import re
+import sys
+import time
+import collections
+
+import matplotlib  # This file is not itself responsible for plotting, but it imports modules that are, so the backend must be set pre-emptively.
+matplotlib.use('Agg')
+
+import numpy as np
+import pandas as pd
+from nltk import pos_tag, sent_tokenize, word_tokenize
+from nltk.classify import NaiveBayesClassifier
+from nltk.corpus import cmudict, stopwords, subjectivity
+from nltk.probability import FreqDist
+from nltk.sentiment import SentimentAnalyzer
+from nltk.tag.perceptron import PerceptronTagger
+import nltk
+# english_check
+from utils import (black_string, clue_links, clue_words,
+                               comp_ratio, publication_check)
+from tabulate import tabulate
+from textblob import TextBlob
+from textstat.textstat import textstat
+tagger = PerceptronTagger(load=False)
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+def unigram_zipf(tokens):
+    '''
+    Build a unigram frequency model for a corpus.
+    Unseen words fall back to a small pseudo-probability so perplexity never divides by zero.
+    '''
+    model = collections.defaultdict(lambda: 0.01)
+    tokens = [ term for t in tokens for term in t ]
+
+    for word in tokens:
+        model[word] = model.get(word, 0) + 1
+    '''
+    normalize observations relative to number of words in the model
+    '''
+    # compute the total once; the original divided by a running sum that changed during the loop
+    total = float(sum(model.values()))
+    for word in list(model):
+        model[word] = model[word]/total
+    return model
+    
+    
+#    https://github.com/nltk/nltk/blob/model/nltk/model/ngram.py
+
+def entropy(self, text):
+    """
+    https://github.com/nltk/nltk/blob/model/nltk/model/ngram.py
+    Calculate the approximate cross-entropy of the n-gram model for a
+    given evaluation text.
+    This is the average log probability of each word in the text.
+    :param text: words to use for evaluation
+    :type text: Iterable[str]
+    """
+
+    normed_text = (self._check_against_vocab(word) for word in text)
+    H = 0.0     # entropy is conventionally denoted by "H"
+    processed_ngrams = 0
+    for ngram in self.ngram_counter.to_ngrams(normed_text):
+        context, word = tuple(ngram[:-1]), ngram[-1]
+        H += self.logscore(word, context)
+        processed_ngrams += 1
+    return - (H / processed_ngrams)
+
+def perplexity(self, text):
+    """
+    Calculates the perplexity of the given text.
+    This is simply 2 ** cross-entropy for the text.
+    :param text: words to calculate perplexity of
+    :type text: Iterable[str]
+    """
+
+    return pow(2.0, self.entropy(text))   
+
+
+def zipf_plot(tokens):
+    # https://www.kaggle.com/kaitlyn/zipf-s-law
+    df = pd.DataFrame(tokens, columns=['text'])
+    df['clean_text'] = df.text.apply(lambda x: re.sub('[^A-Za-z\']', ' ', x.lower()))
+    # Create a word count dataframe
+    word_list = ' '.join(df.clean_text.values).split(' ')
+    words = pd.DataFrame(word_list, columns=['word'])
+    word_counts = words.word.value_counts().reset_index()
+    word_counts.columns = ['word', 'n']
+    word_counts['word_rank'] = word_counts.n.rank(ascending=False)    
+    f, ax = plt.subplots(figsize=(7, 7))
+    ax.set(xscale="log", yscale="log")
+    sns.regplot("n", "word_rank", word_counts, ax=ax, scatter_kws={"s": 100})
+    return
+
+
+def perplexity(testset, model):
+    # https://stackoverflow.com/questions/33266956/nltk-package-to-estimate-the-unigram-perplexity
+    # standard unigram perplexity, accumulated in log space so long documents do not overflow
+    log_prob = 0.0
+    N = 0
+    for word in testset:
+        N += 1
+        log_prob += math.log(model[word])
+    if N == 0:
+        return float('inf')
+    return math.exp(-log_prob / N)
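+
+# Hedged usage sketch (not called anywhere in this patch): build a unigram model over toy
+# token lists with unigram_zipf, then score a held-out token list with perplexity
+# (lower values mean the text is more predictable under the model).
+def _example_perplexity():
+    corpus_tokens = [['the', 'cat', 'sat'], ['the', 'dog', 'sat']]
+    big_model = unigram_zipf(corpus_tokens)
+    return perplexity(['the', 'cat'], big_model)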
+
+def bi_log_value(value):
+    # Bi-symmetric log-like transformation, from:
+    # http://iopscience.iop.org/article/10.1088/0957-0233/24/2/027001/pdf
+    trans = np.sign(value)*np.log(1+np.abs(value*2.302585))
+    return trans
+    #df[col] = trans
+
+
+DEBUG = False
+#from numba import jit
+
+# word limit smaller than 1000 gets product/merchandise sites.
+def text_proc(corpus, urlDat = {}, WORD_LIM = 100):
+
+    #remove unreadable characters
+    if type(corpus) is str and str('privacy policy') not in corpus:
+        corpus = corpus.replace("-", " ") #remove characters that nltk can't read
+        textNum = re.findall(r'\d', corpus) #locate numbers that nltk cannot see to analyze
+        tokens = word_tokenize(corpus)
+
+        stop_words = stopwords.words('english')
+        # keep only the words that are NOT in stop_words (punctuation is stripped later with isalpha).
+
+        tokens = [ word for word in tokens if not word in stop_words]
+        tokens = [ w.lower() for w in tokens ] #make everything lower case
+
+        # the kind of change that might break everything
+        urlDat['wcount'] = textstat.lexicon_count(str(tokens))
+        word_lim = bool(urlDat['wcount']  > WORD_LIM)
+
+        ## Remove the search term from the tokens somehow.
+        urlDat['tokens'] = tokens
+
+        if 'big_model' in urlDat.keys():
+            urlDat['perplexity'] = perplexity(corpus, urlDat['big_model'])
+        else:
+            urlDat['perplexity'] = None
+        # Word limits can be used to filter out product merchandise websites, which otherwise dominate scraped results.
+        # Search engine business model is revenue orientated, so most links will be for merchandise.
+
+        urlDat['publication'] = publication_check(str(tokens))[1]
+        urlDat['clue_words'] = clue_words(str(tokens))[1]
+        if str('link') in urlDat.keys():
+            urlDat['clue_links'] = clue_links(urlDat['link'])[1]
+
+            temp = len(urlDat['clue_words'])+len(urlDat['publication'])+len(urlDat['clue_links'])
+            if temp  > 10 and str('wiki') not in urlDat['link']:
+                urlDat['science'] = True
+            else:
+                urlDat['science'] = False
+            if str('wiki') in urlDat['link']:
+                urlDat['wiki'] = True
+            else:
+                urlDat['wiki'] = False
+        # The post modern essay generator is so obfuscated, that ENGLISH classification fails, and this criteria needs to be relaxed.
+        not_empty = bool(len(tokens) != 0)
+
+        if not_empty and word_lim: #  and server_error:
+
+            tokens = [ w.lower() for w in tokens if w.isalpha() ]
+            #fdist = FreqDist(tokens) #frequency distribution of words only
+            # The larger the ratio of unique words to repeated words, the more colourful the language.
+            lexicon = textstat.lexicon_count(corpus, True)
+            urlDat['uniqueness'] = len(set(tokens))/float(len(tokens))
+            # It's harder to have a good unique ratio in a long document, as 'and', 'the' and 'a', will dominate.
+            # big deltas mean redundancy / sparse information density
+
+
+            urlDat['info_density'] =  comp_ratio(corpus)
+
+            #Sentiment and Subjectivity analysis
+            testimonial = TextBlob(corpus)
+            urlDat['sp'] = testimonial.sentiment.polarity
+            urlDat['ss'] = testimonial.sentiment.subjectivity
+            urlDat['sp_norm'] = np.abs(testimonial.sentiment.polarity)
+            urlDat['ss_norm'] = np.abs(testimonial.sentiment.subjectivity)
+            urlDat['gf'] = textstat.gunning_fog(corpus)
+
+            # explanation of metrics
+            # https://github.com/shivam5992/textstat
+
+            urlDat['standard'] = textstat.text_standard(corpus, float_output=True)
+            #urlDat['standard_'] = copy.copy(urlDat['standard'] )
+            # special sauce
+            # Good writing should be readable, objective, concise.
+            # The writing should be articulate/expressive enough not to have to repeat phrases,
+            # thereby seeming redundant. Articulate expressive writing then employs
+            # many unique words, and does not yield high compression savings.
+            # Good writing should not be obfuscated either. The reading level is a check for obfuscation.
+            # The resulting metric is a balance of concision, low obfuscation, and expression.
+
+            wc = float(1.0/urlDat['wcount'])
+            # compressed/uncompressed. Smaller is better.
+            # as it means writing was low entropy, redundant, and easily compressible.
+            urlDat['scaled'] = wc * urlDat['standard']
+            urlDat['conciseness'] = urlDat['wcount']*(urlDat['uniqueness']) + \
+            urlDat['wcount']*(urlDat['info_density'])
+
+            urlDat['conciseness'] = bi_log_value(urlDat['conciseness'])
+            if urlDat['perplexity'] is not None:
+                urlDat['perplexity'] = bi_log_value(urlDat['perplexity'])
+
+                penalty = (urlDat['standard'] + urlDat['conciseness']+\
+                urlDat['scaled'] + urlDat['perplexity'])/4.0
+            else:
+                penalty = (urlDat['standard'] + urlDat['conciseness']+urlDat['scaled'] )/3.0
+
+            #computes perplexity of the unigram model on a testset
+            urlDat['penalty'] = penalty
+
+        return urlDat
+
+
+def process_dics(urlDats):
+    dfs = []
+    for urlDat in urlDats:
+        # pandas Data frames are best data container for maths/stats, but steep learning curve.
+        # Other exclusion criteria. Exclude reading levels above grade 100,
+        # as this is most likely a problem with the metric algorithm, and or rubbish data in.
+        # TODO: speed everything up by applying the exclusion criteria above, not here.
+        if len(dfs) == 0:
+            dfs = pd.DataFrame(pd.Series(urlDat)).T
+        else:
+            dfs = pd.concat([ dfs, pd.DataFrame(pd.Series(urlDat)).T ])
+    return dfs
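+
+# Hedged usage sketch (needs the nltk data downloaded in install.sh; not called anywhere in
+# this patch): score a block of prose with text_proc and collect the metrics into a dataframe.
+# The URL is only a placeholder so that the link-based clues have something to inspect.
+def _example_text_proc():
+    sample = str('Scientific writing should be readable, objective and concise. ') * 40
+    urlDat = text_proc(sample, {'link': 'https://example.org/sample.html'})
+    return process_dics([urlDat])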
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..454d360
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,296 @@
+# Scientific readability project
+# authors: other_authors
+# Russell Jarvis
+# https://github.com/russelljjarvis/
+# rjjarvis@asu.edu
+
+# Patrick McGurrin
+# patrick.mcgurrin@gmail.com
+
+import os
+import io
+
+# nltk, io and BeautifulSoup are needed by isPassive, convert_pdf_to_txt and html_to_txt below
+import nltk
+import pycld2 as cld2
+import lzma
+from bs4 import BeautifulSoup
+
+
+def isPassive(sentence):
+    # https://github.com/flycrane01/nltk-passive-voice-detector-for-English/blob/master/Passive-voice.py
+    beforms = ['am', 'is', 'are', 'been', 'was', 'were', 'be', 'being']               # all forms of "be"
+    aux = ['do', 'did', 'does', 'have', 'has', 'had']                                  # NLTK tags "do" and "have" as verbs, which can be misleading in the following section.
+    words = nltk.word_tokenize(sentence)
+    tokens = nltk.pos_tag(words)
+    tags = [i[1] for i in tokens]
+    if tags.count('VBN') == 0:                                                            # no PP, no passive voice.
+        return False
+    elif tags.count('VBN') == 1 and 'been' in words:                                    # one PP "been", still no passive voice.
+        return False
+    else:
+        pos = [i for i in range(len(tags)) if tags[i] == 'VBN' and words[i] != 'been']  # gather all the PPs that are not "been".
+        for end in pos:
+            chunk = tags[:end]
+            start = 0
+            for i in range(len(chunk), 0, -1):
+                last = chunk.pop()
+                if last == 'NN' or last == 'PRP':
+                    start = i                                                             # get the chunk between PP and the previous NN or PRP (which in most cases are subjects)
+                    break
+            sentchunk = words[start:end]
+            tagschunk = tags[start:end]
+            verbspos = [i for i in range(len(tagschunk)) if tagschunk[i].startswith('V')] # get all the verbs in between
+            if verbspos != []:                                                            # if there are no verbs in between, it's not passive
+                for i in verbspos:
+                    if sentchunk[i].lower() not in beforms and sentchunk[i].lower() not in aux:  # check if they are all forms of "be" or auxiliaries such as "do" or "have".
+                        break
+                else:
+                    return True
+    return False
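+
+# Hedged usage sketch (needs the nltk punkt and averaged_perceptron_tagger data; not called
+# anywhere in this patch): the first sentence should be flagged as passive, the second not.
+def _example_is_passive():
+    return isPassive('The ball was thrown by the boy.'), isPassive('The boy threw the ball.')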
+
+
+
+
+def argument_density(sentence0,sentence1):
+    # https://github.com/flycrane01/nltk-passive-voice-detector-for-English/blob/master/Passive-voice.py
+    CLAIMS = ['I think that', 'I believe that']               # explicit claim markers
+    CAUSAL = ['because','so','thus','therefore','since']      # causal connectives that suggest an argument is being made
+    terms = nltk.word_tokenize(sentence1)
+    #tokens = nltk.pos_tag(terms)
+    befores = []
+    for t in terms:
+        if t in CAUSAL:
+            befores.append(sentence0)
+    return befores
+
+
+
+        #for C in CAUSAL:
+        #    if
+
+    '''
+    tags = [i[1] for i in tokens]
+    if tags.count('VBN') == 0:                                                            # no PP, no passive voice.
+        return False
+    elif tags.count('VBN') == 1:                                    # one PP "been", still no passive voice.
+        return False
+    else:
+        pos = [i for i in range(len(tags)) if tags[i] == 'VBN' and words[i] != 'been']  # gather all the PPs that are not "been".
+        for end in pos:
+            chunk = tags[:end]
+            start = 0
+            for i in range(len(chunk), 0, -1):
+                last = chunk.pop()
+                if last == 'NN' or last == 'PRP':
+                    start = i                                                             # get the chunk between PP and the previous NN or PRP (which in most cases are subjects)
+                    break
+            sentchunk = words[start:end]
+            tagschunk = tags[start:end]
+            verbspos = [i for i in range(len(tagschunk)) if tagschunk[i].startswith('V')] # get all the verbs in between
+            if verbspos != []:                                                            # if there are no verbs in between, it's not passive
+                for i in verbspos:
+                    if sentchunk[i].lower() not in beforms and sentchunk[i].lower() not in aux:  # check if they are all forms of "be" or auxiliaries such as "do" or "have".
+                        break
+                else:
+                    return True
+    return False
+    '''
+
+def convert_pdf_to_txt(content):
+    # duplicate of crawl.convert_pdf_to_txt; the pdfminer objects are built locally here because utils.py does not share crawl.py's module-level parser state.
+    from pdfminer.pdfparser import PDFParser
+    from pdfminer.pdfdocument import PDFDocument
+    from pdfminer.pdfpage import PDFPage
+    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
+    from pdfminer.converter import TextConverter
+    from pdfminer.layout import LAParams
+    rsrcmgr = PDFResourceManager()
+    retstr = io.StringIO()
+    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
+    interpreter = PDFPageInterpreter(rsrcmgr, device)
+    pdf = io.BytesIO(content.content)
+    parser = PDFParser(pdf)
+    document = PDFDocument(parser, password='')  # empty-string password avoids the failure seen with password=None
+    write_text = ''
+    for page in PDFPage.create_pages(document):
+        interpreter.process_page(page)
+        write_text += retstr.getvalue()
+    # Process all pages in the document
+    text = str(write_text)
+    return text
+
+def html_to_txt(content):
+    soup = BeautifulSoup(content, 'html.parser')
+    #strip HTML
+    for script in soup(["script", "style"]):
+        script.extract()    # rip it out
+    text = soup.get_text()
+    #organize text
+    lines = (line.strip() for line in text.splitlines())  # break into lines and remove leading and trailing space on each
+    chunks = (phrase.strip() for line in lines for phrase in line.split("  ")) # break multi-headlines into a line each
+    text = '\n'.join(chunk for chunk in chunks if chunk) # drop blank lines
+    str_text = str(text)
+    return str_text
+
+def comp_ratio(test_string):
+    # If we are agnostic about what the symbols are, and we just observe the relative frequency of each symbol,
+    # the distribution of frequencies would make some texts harder to compress, even if we don't know what the symbols mean.
+    # http://www.beamreach.org/data/101/Science/processing/Nora/Papers/Information%20entropy%20o%20fjumpback%20whale%20songs.pdf
+
+    c = lzma.LZMACompressor()
+    bytes_in = bytes(test_string,'utf-8')
+    bytes_out = c.compress(bytes_in) + c.flush()  # flush() is required, or the compressor may buffer and return nothing
+    return len(bytes_out)/len(bytes_in)
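+
+# Hedged usage sketch (not called anywhere in this patch): repetitive text compresses well
+# (low ratio), while short varied text does not (ratio near or above one).
+def _example_comp_ratio():
+    redundant = str('the cat sat on the mat ') * 50
+    varied = str('Diverse vocabulary resists compression far more than simple repetition does.')
+    return comp_ratio(redundant), comp_ratio(varied)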
+
+def english_check(corpus):
+
+    # It's not that we are cultural imperialists, but the people at textstat, and nltk may have been,
+    # so we are also forced into this tacit agreement.
+    # Japanese characters massively distort information theory estimates, as they are potentially very concise.
+    _, _, details = cld2.detect(' '.join(corpus), bestEffort=True)
+    detectedLangName, _ = details[0][:2]
+    return bool(detectedLangName == 'ENGLISH')
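+
+# Hedged usage sketch (not called anywhere in this patch): english_check expects an iterable
+# of words and returns True when cld2's best-effort guess for the joined text is English.
+def _example_english_check():
+    return english_check(['this', 'is', 'plainly', 'english', 'text'])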
+
+
+
+
+def engine_dict_list():
+    se = {0:"google",1:"yahoo",2:"duckduckgo",3:"wikipedia",4:"scholar",5:"bing"}
+    return se, list(se.values())
+
+def search_params():
+    SEARCHLIST = ["autosomes","respiration", "bacteriophage",'Neutron','Vaccine','Transgenic','GMO','Genetically Modified Organism','neuromorphic hardware', 'mustang unicorn', 'scrook rgerkin neuron', 'prancercise philosophy', 'play dough delicious deserts']
+    _, ses = engine_dict_list()
+    WEB = len(ses) #how many search engines to include (many possible- google google scholar bing yahoo)
+    LINKSTOGET= 10 #number of links to pull from each search engine (this can be any value, but more processing with higher number)
+    return SEARCHLIST, WEB, LINKSTOGET
+
+def search_known_corpus():
+    '''
+    hardcoded links to get. journal seek is a data base of known academic journals.
+    '''
+    LINKSTOGET = []
+    PUBLISHERS = str('https://journalseek.net/publishers.htm')
+    LINKSTOGET.append(str('https://academic.oup.com/beheco/article-abstract/29/1/264/4677340'))
+    LINKSTOGET.append(str('http://splasho.com/upgoer5/library.php'))
+    LINKSTOGET.append(str('https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvMjc3MjUvZWxpZmUtMjc3MjUtdjIucGRm/elife-27725-v2.pdf?_hash=WA%2Fey48HnQ4FpVd6bc0xCTZPXjE5ralhFP2TaMBMp1c%3D'))
+    LINKSTOGET.append(str('https://scholar.google.com/scholar?hl=en&as_sdt=0%2C3&q=Patrick+mcgurrin+ASU&btnG='))
+    LINKSTOGET.append(str('https://scholar.google.com/citations?user=GzG5kRAAAAAJ&hl=en&oi=sra'))
+    LINKSTOGET.append(str('https://scholar.google.com/citations?user=xnsDhO4AAAAJ&hl=en&oe=ASCII&oi=sra'))
+    LINKSTOGET.append(str('https://scholar.google.com/citations?user=2agHNksAAAAJ&hl=en&oi=sra'))
+    #_, ses = engine_dict_list()
+    WEB = 1
+    #WEB = len(ses) #how many search engines to include (many possible- google google scholar bing yahoo)
+    LINKSTOGET= 10 #number of links to pull from each search engine (this can be any value, but more processing with higher number)
+    return PUBLISHERS, WEB, LINKSTOGET
+
+
+
+
+
+def clue_links(check_with):
+    '''
+    The goal of this function/string comparison
+    is just to give a clue about whether the text is
+    an official scientific publication, a blog, or a pseudo-science publication.
+    It is not meant to act as a definitive classifier.
+    '''
+    # TODO query with the python crossref api
+
+    # https://pypi.org/project/crossrefapi/1.0.7/
+    CHECKS = [str('.fda'),str('.epa'),str('.gov'),str('.org'),str('.nih'),str('.nasa'),str('.pdf')]
+    assume_false = []
+    for check in CHECKS:
+        if check in check_with:
+            assume_false.append(check)
+    if len(assume_false) == 1:
+        return (True, assume_false)
+    else:
+        return (False, assume_false)
+
+
+def publication_check(wt):
+    '''
+    The goal of this function/string comparison
+    is just to give a clue about whether the text is
+    an official scientific publication, a blog, or a pseudo-science publication.
+    It is not meant to act as a definitive classifier.
+    '''
+    publication = {}
+    if 'issn' in wt:
+        publication['issn'] = wt.split("issn",1)[1]
+    if 'isbn' in wt:
+        publication['isbn'] = wt.split("isbn",1)[1]
+    if 'pmid' in wt:
+        publication['pmid'] = wt.split("pmid",1)[1]
+    for k,v in publication.items():
+        publication[k] = v[0:15]
+
+    if len(publication) >= 1:
+        return (True, publication)
+    else:
+        return (False, publication)
+
+
+def clue_words(check_with):
+    '''
+    The goal of this function/string comparison
+    is just to give a clue about whether the text is an official scientific publication, a blog, or a pseudo-science publication.
+    It is not meant to act as a definitive classifier.
+    To get ISSN (for any format) there is a national center in each country.
+    It may be National Library in some cases. List of National ISSN centers are listed in issn website.
+    For DOI, there are representatives in western countries, also you can apply to doi.org or crossref.org.
+    How are the e-ISSN Number, DOI and abbreviation provided for a new journal ?.
+    Available from: https://www.researchgate.net/post/How_are_the_e-ISSN_Number_DOI_and_abbreviation_provided_for_a_new_journal [accessed Apr 7, 2015].
+    '''
+    #TODO query with the python crossref api
+    # https://pypi.org/project/crossrefapi/1.0.7/
+    # check_with should be lower case by now
+
+    CHECKS = [str('isbn'),str("issn"),str("doi"),str('volume'),str('issue'), \
+    str("journal of"),str("abstract"),str("materials and methods"),str("nature"), \
+    str("conflict of interest"), str("objectives"), str("significance"), \
+    str("published"), str("references"), str("acknowledgements"), str("authors"), str("hypothesis"), \
+    str("nih"),str('article'),str('affiliations'),str('et al')]
+    assume_false = []
+    for check in CHECKS:
+        if check in check_with:
+            assume_false.append(check)
+    if len(assume_false) >= 6:
+        return (True, assume_false)
+    else:
+        return (False, assume_false)
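+
+# Hedged usage sketch (not called anywhere in this patch): publication_check and clue_words
+# both return a (looks_like_a_publication, matched_evidence) tuple over lower-cased text.
+def _example_clue_checks():
+    snippet = str('abstract introduction materials and methods references doi issn journal of testing')
+    return publication_check(snippet), clue_words(snippet)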
+
+
+def argument_density(check_with):
+    # note: this simpler string-based version shadows the sentence-pair argument_density defined above.
+    density_histogram = {}
+    # https://github.com/flycrane01/nltk-passive-voice-detector-for-English/blob/master/Passive-voice.py
+    CLAIMS = ['I think that', 'I believe that']               # explicit claim markers
+    CAUSAL = ['because','so','thus','therefore','since']      # causal connectives that suggest an argument is being made
+    for c in CLAIMS:
+        if c in check_with:
+            density_histogram[c] = density_histogram.get(c, 0) + 1
+    for c in CAUSAL:
+        if c in check_with:
+            density_histogram[c] = density_histogram.get(c, 0) + 1
+    return density_histogram
+
+
+
+def black_string(check_with):
+    if len(check_with) == 1145:
+        return True
+    #check="Privacy_policy"
+    #if check in check_with:
+    #    return True
+    check="Our systems have detected unusual traffic from your computer network.\\nThis page checks to see if it\'s really you sending the requests, and not a robot.\\nWhy did this happen?\\nThis page appears when Google automatically detects requests coming from your computer network which appear to be in violation of the Terms of Service. The block will expire shortly after those requests stop.\\nIn the meantime, solving the above CAPTCHA will let you continue to use our services.This traffic may have been sent by malicious software, a browser plug in, or a script that sends automated requests.\\nIf you share your network connection, ask your administrator for help  a different computer using the same IP address may be responsible.\\nLearn moreSometimes you may be asked to solve the CAPTCHA if you are using advanced terms that robots are known to use, or sending requests very quickly."
+    if check in check_with:
+        return True
+    check="Google ScholarLoading...The system can't perform the operation now."
+    if check in check_with:
+        return True
+    check="Please show you're not a robotSorry, we can't verify that you're not a robot when"
+    if check in check_with:
+        return True
+    check=" JavaScript is turned off.Please enable JavaScript in your browser and reload this page.HelpPrivacyTerms"
+    if check in check_with:
+        return True
+    check = "\\x00\\x00\\x00\\x00"
+    if check in check_with:
+        return True
+    check = "Please click here if you are not redirected within a few seconds."
+    if check in check_with:
+        return True
+    check="DuckDuckGo  Privacy, simplified.\\nAbout DuckDuckGo\\nDuck it!\\nThe search engine that doesn\'t track you.\\nLearn More."
+    if check in check_with:
+        return True
+    return False
-- 
GitLab