From b15adc3f34be25e29b0d767ae85e385ef17ee706 Mon Sep 17 00:00:00 2001
From: Russell Jarvis <russelljarvis@protonmail.com>
Date: Thu, 11 Mar 2021 14:49:13 +1100
Subject: [PATCH] Reformat with Black; fix typos and minor bugs in text processing

---
 science_access/enter_author_name.py      |  19 +-
 science_access/online_app_backend.py     |  50 ++-
 science_access/t_analysis.py             | 515 ++++++++++++-----------
 science_access/word_cloud_by_word_len.py |  19 +-
 4 files changed, 313 insertions(+), 290 deletions(-)

diff --git a/science_access/enter_author_name.py b/science_access/enter_author_name.py
index 2cab4ff..9a9ea82 100644
--- a/science_access/enter_author_name.py
+++ b/science_access/enter_author_name.py
@@ -110,7 +110,7 @@ def zipf_plot(word_counts_fz):
 
 
 # @st.cache
-def art_cloud_wl(acorpus:str=""):
+def art_cloud_wl(acorpus: str = ""):
     WC = WordCloud(background_color="white")
     WC.generate_from_lengths = MethodType(generate_from_lengths, WC)
     fig = plt.figure()
@@ -121,7 +121,7 @@ def art_cloud_wl(acorpus:str=""):
     if type(acorpus) is type(""):
         wordcloud = WC.generate_from_lengths(acorpus)
 
-    if not 'wordcloud' in locals():
+    if "wordcloud" not in locals():
         return None, None, None
     biggest_words = WC.biggest_words
 
@@ -142,7 +142,7 @@ def zipf_wrapper(acorpus):
 # @st.cache
 
 
-def art_cloud(acorpus:str=""):
+def art_cloud(acorpus: str = ""):
 
     # Generate a word cloud image
     WC = WordCloud(background_color="white")
@@ -158,11 +158,12 @@ def art_cloud(acorpus:str=""):
     return wordcloud, fig, plt
 
 
-def fast_art_cloud(acorpus:str=""):
+def fast_art_cloud(acorpus: str = ""):
     wordcloud, fig, plt = art_cloud(acorpus)
     st.pyplot(fig)
     return fig
 
+
 def create_giant_strings(ar, not_want_list):
     sci_corpus = ""
     first_pass = []
@@ -327,11 +328,13 @@ def grand_distribution_plot(ar, scraped_labels, standard_sci, df0, author_name="
     fig.update_layout(width=900, height=600)  # , hovermode='x')
     return df1, fig
 
-from typing import List,Any
+
+from typing import List, Any
 import pandas as pd
-#import streamlit as st
-#List
-def push_frame_to_screen(contents:Any, readability_vector:List)->pd.DataFrame():
+
+# import streamlit as st
+# List
+def push_frame_to_screen(contents: Any, readability_vector: List) -> pd.DataFrame:
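+    # Collect the scraped web links into a pandas DataFrame for display.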
     if type(contents) is type(list()):
         df_links = pd.DataFrame()
         df_links["Web_Link"] = pd.Series(contents)
diff --git a/science_access/online_app_backend.py b/science_access/online_app_backend.py
index 0c6d6ff..1783f87 100644
--- a/science_access/online_app_backend.py
+++ b/science_access/online_app_backend.py
@@ -2,8 +2,9 @@ from typing import List
 import PyPDF2
 from pathlib import Path
 import copy
-#import matplotlib.pyplot as plt
-#import seaborn as sns
+
+# import matplotlib.pyplot as plt
+# import seaborn as sns
 import semanticscholar as sch
 
 import os.path
@@ -11,7 +12,7 @@ import pdb
 import pickle
 from collections import OrderedDict
 
-#import IPython.display as d
+# import IPython.display as d
 import numpy as np
 import pandas as pd
 from bs4 import BeautifulSoup
@@ -188,6 +189,7 @@ def author_to_urls(NAME):
             dois.append(li[1])
     return dois, coauthors, titles, visit_urls
 
+
 def visit_link(NAME, tns, more_links):
     """
     inputs a URL that's full of publication orientated links, preferably the
@@ -209,6 +211,7 @@ def visit_link(NAME, tns, more_links):
 
     return author_results, visit_urls
 
+
 def visit_semantic_scholar_abstracts(NAME, tns, more_links):
     """
     inputs a URL that's full of publication orientated links, preferably the
@@ -219,18 +222,18 @@ def visit_semantic_scholar_abstracts(NAME, tns, more_links):
     aliases = None
     dois, coauthors, titles, visit_urls = author_to_urls(NAME)
     for d in dois:
-        paper =  sch.paper(d, timeout=6)
+        paper = sch.paper(d, timeout=6)
         urlDat = {}
-        urlDat["link"] = paper['url']
+        urlDat["link"] = paper["url"]
         urlDat["semantic"] = True
         if aliases is None:
             try:
-                aliases = get_aliases_and_papers(paper,NAME)
+                aliases = get_aliases_and_papers(paper, NAME)
                 urlDat["aliases"] = aliases
-                print(urlDat["aliases"],'aliases')
+                print(urlDat["aliases"], "aliases")
             except:
                 pass
-        urlDat = text_proc(str(paper['abstract']), urlDat)
+        urlDat = text_proc(str(paper["abstract"]), urlDat)
         author_results.append(urlDat)
     author_results = [
         urlDat for urlDat in author_results if not isinstance(urlDat, type(None))
@@ -239,13 +242,15 @@ def visit_semantic_scholar_abstracts(NAME, tns, more_links):
     return author_results, visit_urls
 
 
-def get_aliases_and_papers(paper,NAME):
-    if 'authors' in paper.keys():
-        for author_ in paper['authors']:
+def get_aliases_and_papers(paper, NAME):
+    # Look up NAME in the paper's author list and return any recorded aliases.
+    aliases = None  # avoid UnboundLocalError when no matching author is found
+    if "authors" in paper.keys():
+        for author_ in paper["authors"]:
             if NAME in author_:
-                if 'aliases' in author_.keys():
-                    aliases = author_['aliases']
+                if "aliases" in author_.keys():
+                    aliases = author_["aliases"]
     return aliases
+
+
 def visit_link_unpaywall(NAME, tns, visit_urls):
     """
     inputs a URL that's full of publication orientated links, preferably the
@@ -303,7 +308,6 @@ def unpaywall_semantic_links(NAME, tns):
         r0 = str("https://api.semanticscholar.org/") + str(doi_)
         visit_more_urls.append(r0)
 
-
         r = (
             str("https://api.unpaywall.org/v2/")
             + str(doi_)
@@ -327,7 +331,8 @@ def unpaywall_semantic_links(NAME, tns):
             visit_more_urls.append(res)
     return visit_more_urls
 
-def convert_pdf_to_txt(content,verbose=False):
+
+def convert_pdf_to_txt(content, verbose=False):
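+    # Extract plain text from PDF bytes with pdfminer, page by page, and return
+    # an empty string if the extraction looks garbled (absurdly long "words").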
     # https://github.com/allenai/science-parse/blob/master/server/README.md
     # os.subprocess(curl -v -H "Content-type: application/pdf" --data-binary @paper.pdf "http://scienceparse.allenai.org/v1")
     try:
@@ -340,11 +345,11 @@ def convert_pdf_to_txt(content,verbose=False):
         write_text = ""
         for page in PDFPage.create_pages(document):
             interpreter.process_page(page)
-            write_text += " "+retstr.getvalue()+" "
+            write_text += " " + retstr.getvalue() + " "
         # Process all pages in the document
         text = str(write_text)
-        mean_word_len = np.mean([ len(t) for t in text ])
-        if mean_word_len>33:
+        mean_word_len = np.mean([len(t) for t in text.split()])
+        if mean_word_len > 33:
             return str("")
 
         if verbose:
@@ -410,7 +415,9 @@ def process(link, driver):  # , REDIRECT=False):
 
 def update_web_form(NAME, tns):
     more_links = unpaywall_semantic_links(NAME, tns)
-    author_results_temp, visit_urls_temp = visit_semantic_scholar_abstracts(NAME, tns, more_links)
+    author_results_temp, visit_urls_temp = visit_semantic_scholar_abstracts(
+        NAME, tns, more_links
+    )
     author_results, visit_urls = visit_link(NAME, tns, more_links)
     author_results.extend(author_results_temp)
     ar = copy.copy(author_results)
@@ -431,7 +438,7 @@ def find_nearest(array, value):
     return idx
 
 
-def ar_manipulation(ar:List=[]):
+def ar_manipulation(ar: List = []):
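+    # Drop malformed scrape results: None entries, bare strings, and records
+    # without a "standard" readability score.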
     ar = [tl for tl in ar if tl is not None]
     ar = [tl for tl in ar if type(tl) is not type(str(""))]
     ar = [tl for tl in ar if "standard" in tl.keys()]
@@ -443,11 +450,12 @@ def ar_manipulation(ar:List=[]):
     return (ar, trainingDats)
 
 
-def call_from_front_end(NAME:str="", OPENACCESS:bool=True, tns:int=16):
+def call_from_front_end(NAME: str = "", OPENACCESS: bool = True, tns: int = 16):
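+    # Entry point for the web front end: scrape NAME's publications and return
+    # the processed per-document readability records.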
     df, datay, ar = update_web_form(NAME, tns)
     (ar, trainingDats) = ar_manipulation(ar)
     return ar
 
+
 def metricss(rg):
     if isinstance(rg, list):
         pub_count = len(rg)
diff --git a/science_access/t_analysis.py b/science_access/t_analysis.py
index 54ae8ab..c3f3688 100644
--- a/science_access/t_analysis.py
+++ b/science_access/t_analysis.py
@@ -25,7 +25,8 @@ from nltk.sentiment import SentimentAnalyzer
 from nltk.tag.perceptron import PerceptronTagger
 import nltk
 from nltk.corpus import words as english_words
-#from nltk.tokenize import word_tokenize
+
+# from nltk.tokenize import word_tokenize
 
 from nltk.tokenize import sent_tokenize, word_tokenize
 import numpy as np
@@ -46,262 +47,274 @@ from science_access.readabilityFunctions import countWordsSentSyl, NDC, FRE
 
 tagger = PerceptronTagger(load=False)
 not_want_list = [
-	"article",
-	"articlepubmedpubmed",
-	"et",
-	"al",
-	"text",
-	"crossref",
-	"isigoogle",
-	"cross",
-	"ref",
-	"google",
-	"scholar",
-	"article",
-	"pubmed",
-	"full",
-	"doi",
-	"org",
-	"http",
-	"copyright",
-	"org",
-	"figure",
-	"pubmed",
-	"accessshoping",
-	"articlepubmedpubmed",
-	"author",
+    "article",
+    "articlepubmedpubmed",
+    "et",
+    "al",
+    "text",
+    "crossref",
+    "isigoogle",
+    "cross",
+    "ref",
+    "google",
+    "scholar",
+    "article",
+    "pubmed",
+    "full",
+    "doi",
+    "org",
+    "http",
+    "copyright",
+    "org",
+    "figure",
+    "pubmed",
+    "accessshoping",
+    "articlepubmedpubmed",
+    "author",
 ]
-not_want_list.extend(["link","librarian","issue","abstract","science","cookie","publication"])
+not_want_list.extend(
+    ["link", "librarian", "issue", "abstract", "science", "cookie", "publication"]
+)
 
 
 def create_giant_strings(ar, not_want_list):
-	sci_corpus = ""
-	first_pass = []
-	for t in ar:
-		if "tokens" in t.keys():
-			for s in t["tokens"]:
-				if s not in not_want_list:
-					first_pass.append(s)
-	first_pass = set(first_pass)
-	for s in first_pass:
-		if "/" in s:
-			temp = s.split("/")  # , " ")
-			sci_corpus += str(" ") + temp[0]
-			sci_corpus += str(" ") + temp[1]
-		if "." in s:
-			temp = s.split(".")  # , " ")
-			sci_corpus += str(" ") + temp[0]
-			sci_corpus += str(" ") + temp[1]
-		if s not in set(not_want_list):
-			sci_corpus += str(" ") + s  # +str(' ')
-	return sci_corpus
-
-
-#ENGLISH_STOPWORDS = set(nltk.corpus.stopwords.words("english"))
+    sci_corpus = ""
+    first_pass = []
+    for t in ar:
+        if "tokens" in t.keys():
+            for s in t["tokens"]:
+                if s not in not_want_list:
+                    first_pass.append(s)
+    first_pass = set(first_pass)
+    for s in first_pass:
+        if "/" in s:
+            temp = s.split("/")  # , " ")
+            sci_corpus += str(" ") + temp[0]
+            sci_corpus += str(" ") + temp[1]
+        if "." in s:
+            temp = s.split(".")  # , " ")
+            sci_corpus += str(" ") + temp[0]
+            sci_corpus += str(" ") + temp[1]
+        if s not in set(not_want_list):
+            sci_corpus += str(" ") + s  # +str(' ')
+    return sci_corpus
+
+
+# ENGLISH_STOPWORDS = set(nltk.corpus.stopwords.words("english"))
 
 from typing import Union
-def complexityAlongtheText(text:str, chunk_length:int=5)->Union[float,float,str]:
-	words = sent_tokenize(text)
-	#words = #text.split()
-	cur = 0
-	stds = []
-	hardest_chunk_index = 0
-	while cur < len(words):
-		sub = words[cur : cur + 5]
-		sub_text = " ".join(sub)
-		std = textstat.text_standard(sub_text, float_output=True)
-		cur += chunk_length
-		if std>hardest_chunk_index:
-			hardest_chunk_index = cur
-		stds.append(std)
-	hard_snippet = words[hardest_chunk_index : hardest_chunk_index + chunk_length]
-	hs=""
-	for h in hard_snippet:
-		hs+=h+str(" ")
-	#st.text(hs)
-	return np.mean(stds), textstat.text_standard(text, float_output=True), hs
-
-def freeAlongtheText(text:str, chunk_length:int=5)->float:
-	#words = text.split()
-	words = sent_tokenize(text)
-
-	cur = 0
-	stds = []
-	fres = []
-	while cur < len(words):
-		sub = words[cur : cur + chunk_length]
-		sub_text = " ".join(sub)
-		wc, sc, sylCount, remainingText, wordLen = countWordsSentSyl(
-			sub_text, ignoreSingleSentences=1
-		)
-		try:
-			fre = FRE(wc, sc, sylCount)
-			fres.append(fre)
-		except:
-			pass
-		cur += chunk_length
-	return np.mean(fres)
-
-
-def get_ref(references:str):
-	for nubmer, line in enumerate(references, 1):  # skip last element with page number
-		line = line.strip()
-		if line:  # skip empty line
-			authors_and_year = re.match("((.*)\. (\d{4})\.)", line)
-			if type(authors_and_year) is not type(None):
-				text, authors, year = authors_and_year.groups()
-				names = re.split(",[ ]*and |,[ ]*| and ", authors)
-				names = [(name, name.split(" ")[-1]) for name in names]
-
-def text_proc(corpus, urlDat={}, WORD_LIM=40,verbose=False):
-	if type(corpus) is type(str()) and corpus not in str(
-		"Redirecting"
-	):  # and not str("privacy policy") in corpus:
-
-
-		if str("some error has occurred while processing your request") in corpus:
-			return {}
-		if str("We apologize for the inconvenience...") in corpus:
-			return {}
-		if np.mean([ len(w) for w in corpus ])>35:
-			return {}
-
-		corpus = corpus.replace("/", " ")  # remove characters that nltk can't read
-		corpus = corpus.lower()
-		corpus = corpus.replace(u"\xa0", u" ")
-		corpus = corpus.replace(u"\\", u" ")
-
-		if "abstract" in corpus[0:250]:
-			posa = corpus.lower().find("abstract ")
-			corpus = corpus[posa:]
-		else:
-			posa = False
-
-		if "references" in corpus:
-			posr = corpus.lower().find("references ")
-			corpus = corpus[:posr]
-		else:
-			posr = False
-		if "bibliography" in corpus:
-			posb = corpus.lower().find("bibliography ")
-			corpus = corpus[:posb]
-		else:
-			posb = False
-		if "significance" in corpus:
-			poss = corpus.lower().find("significance ")
-			corpus = corpus[poss:]
-		else:
-			poss = False;
-		if "purpose" in corpus[0:250]:
-			posp = corpus.lower().find("purpose")
-			corpus = corpus[:posp]
-		else:
-			posp = False
-
-		if (posa and (posb or posr)) or poss and posp:
-			this_is_science = True
-		else:
-			# if its not science its probably a junk web page.
-			this_is_science = False
-		if "semantic" in urlDat.keys():
-			if urlDat["semantic"]:
-				this_is_science = True
-		print(corpus)
-		print(this_is_science,'this_is_science')
-		urlDat["big_words"] = [word for word in corpus if len(word) > 40]
-		ignoreSingleSentences = 1
-
-		corpus = cleanup_pretagger_all(corpus)
-		if verbose:
-			st.text('pretagger all')
-			st.text(type(corpus))
-
-		tokens = word_tokenize(corpus)
-		if verbose:
-			st.text("token input")
-			st.text(tokens)
-		tokens = [ t for t in tokens if t not in not_want_list]
-		if np.mean([ len(t) for t in tokens ])>40:
-			return {}
-		tokens = [ t for t in tokens if len(t)<40 ]
-		if verbose:
-			st.text("token input")
-			st.text(tokens)
-		wc, sc, sylCount, remainingText, wordLen = countWordsSentSyl(
-			tokens, ignoreSingleSentences=1
-		)
-
-		if len(tokens) < WORD_LIM:
-			return {}
-		if len(tokens) >= WORD_LIM:
-
-			remainingText = " ".join(remainingText)
-			remainingText = remainingText.lower()
-			if wc > 0 and sc > 0:
-				meanv,total,hard_snippet = complexityAlongtheText(corpus, chunk_length=128)
-				urlDat["standard_unbiased"] = meanv
-				urlDat["standard"] = total
-				if this_is_science:
-					urlDat["hard_snippet"] = hard_snippet
-				else:
-					urlDat["hard_snippet"] = None
-				#urlDat["fre_unbiased"] = freeAlongtheText(corpus)
-				#fre = FRE(wc, sc, sylCount)
-				#ndc = NDC(
-				#	remainingText, wc, sc
-				#)  # calc NDC Index and Perctage Diff Words                                         #calc NDC index
-				#urlDat["fre"] = fre  # textstat.text_standard(corpus, float_output=True)
-				#urlDat["ndc"] = ndc[0]
-				# textstat.text_standard(corpus, float_output=True)
-				# https://stackoverflow.com/questions/62492797/get-bibliography-list-and-its-count-from-text-python
-
-			#if urlDat["fre_unbiased"]< urlDat["standard"] and urlDat["fre_unbiased"]>0:
-			#	urlDat["standard"] = urlDat["fre_unbiased"]
-			if urlDat["standard_unbiased"]< urlDat["standard"]  and urlDat["standard_unbiased"]>0:
-				urlDat["standard"] = urlDat["standard_unbiased"]
-
-			#urlDat["concensus"] = np.mean(
-			#	[
-			#		np.mean(urlDat["fre"]),
-			#		np.mean(urlDat["ndc"]),
-			#		np.mean(urlDat["standard_unbiased"]),
-			#	]
-			#)
-			tokens = [w.lower() for w in tokens if w.isalpha()]
-			tokens = [w.lower() for w in tokens]  # make everything lower case
-			urlDat["wcount"] = textstat.lexicon_count(str(tokens))
-			word_lim = bool(urlDat["wcount"] > WORD_LIM)
-			urlDat["tokens"] = tokens
-
-			if len(tokens):
-				lexicon = textstat.lexicon_count(corpus, True)
-				urlDat["uniqueness"] = len(set(tokens)) / float(len(tokens))
-				urlDat["unique_words"] = len(set(tokens))
-
-				# It's harder to have a good unique ratio in a long document, as 'and', 'the' and 'a', will dominate.
-				# big deltas mean redudancy/sparse information/information/density
-
-				testimonial = TextBlob(corpus)
-				urlDat["sp"] = testimonial.sentiment.polarity
-				urlDat["ss"] = testimonial.sentiment.subjectivity
-				urlDat["sp_norm"] = np.abs(testimonial.sentiment.polarity)
-				urlDat["ss_norm"] = np.abs(testimonial.sentiment.subjectivity)
-				urlDat["gf"] = textstat.gunning_fog(corpus)
-	if "standard" in urlDat.keys():
-		if urlDat["standard"] == 0:
-			return None
-
-	return urlDat
+from typing import Tuple
+
+
+def complexityAlongtheText(
+    text: str, chunk_length: int = 5
+) -> Tuple[float, float, str]:
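+    # Score the text in sentence chunks with textstat.text_standard, remember
+    # where the hardest chunk starts, and return (mean chunk score,
+    # whole-text score, hardest snippet).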
+    words = sent_tokenize(text)
+    # words = #text.split()
+    cur = 0
+    stds = []
+    hardest_chunk_index = 0
+    hardest_std = 0.0
+    while cur < len(words):
+        sub = words[cur : cur + chunk_length]
+        sub_text = " ".join(sub)
+        std = textstat.text_standard(sub_text, float_output=True)
+        # track where the hardest chunk starts, not the score itself
+        if std > hardest_std:
+            hardest_std = std
+            hardest_chunk_index = cur
+        cur += chunk_length
+        stds.append(std)
+    hard_snippet = words[hardest_chunk_index : hardest_chunk_index + chunk_length]
+    hs = ""
+    for h in hard_snippet:
+        hs += h + str(" ")
+    # st.text(hs)
+    return np.mean(stds), textstat.text_standard(text, float_output=True), hs
+
+
+def freeAlongtheText(text: str, chunk_length: int = 5) -> float:
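+    # Mean Flesch Reading Ease (FRE) over successive sentence chunks of the text.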
+    # words = text.split()
+    words = sent_tokenize(text)
+
+    cur = 0
+    stds = []
+    fres = []
+    while cur < len(words):
+        sub = words[cur : cur + chunk_length]
+        sub_text = " ".join(sub)
+        wc, sc, sylCount, remainingText, wordLen = countWordsSentSyl(
+            sub_text, ignoreSingleSentences=1
+        )
+        try:
+            fre = FRE(wc, sc, sylCount)
+            fres.append(fre)
+        except:
+            pass
+        cur += chunk_length
+    return np.mean(fres)
+
+
+def get_ref(references: str):
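+    # Pull author names and the publication year out of reference lines that
+    # match an "Authors. YYYY." pattern.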
+    for number, line in enumerate(references, 1):  # skip last element with page number
+        line = line.strip()
+        if line:  # skip empty line
+            authors_and_year = re.match(r"((.*)\. (\d{4})\.)", line)
+            if type(authors_and_year) is not type(None):
+                text, authors, year = authors_and_year.groups()
+                names = re.split(",[ ]*and |,[ ]*| and ", authors)
+                names = [(name, name.split(" ")[-1]) for name in names]
+
+
+def text_proc(corpus, urlDat={}, WORD_LIM=40, verbose=False):
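+    # Clean a scraped document (trim front matter, references/bibliography),
+    # tokenize it, and fill urlDat with readability and sentiment measures.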
+    if type(corpus) is type(str()) and str("Redirecting") not in corpus:
+        # and not str("privacy policy") in corpus:
+
+        if str("some error has occurred while processing your request") in corpus:
+            return {}
+        if str("We apologize for the inconvenience...") in corpus:
+            return {}
+        # skip garbled extractions whose "words" are absurdly long
+        if np.mean([len(w) for w in corpus.split()]) > 35:
+            return {}
+
+        corpus = corpus.replace("/", " ")  # remove characters that nltk can't read
+        corpus = corpus.lower()
+        corpus = corpus.replace(u"\xa0", u" ")
+        corpus = corpus.replace(u"\\", u" ")
+
+        if "abstract" in corpus[0:250]:
+            posa = corpus.lower().find("abstract ")
+            corpus = corpus[posa:]
+        else:
+            posa = False
+
+        if "references" in corpus:
+            posr = corpus.lower().find("references ")
+            corpus = corpus[:posr]
+        else:
+            posr = False
+        if "bibliography" in corpus:
+            posb = corpus.lower().find("bibliography ")
+            corpus = corpus[:posb]
+        else:
+            posb = False
+        if "significance" in corpus:
+            poss = corpus.lower().find("significance ")
+            corpus = corpus[poss:]
+        else:
+            poss = False
+        if "purpose" in corpus[0:250]:
+            posp = corpus.lower().find("purpose")
+            corpus = corpus[:posp]
+        else:
+            posp = False
+
+        if (posa and (posb or posr)) or (poss and posp):
+            this_is_science = True
+        else:
+            # if it's not science, it's probably a junk web page.
+            this_is_science = False
+        if "semantic" in urlDat.keys():
+            if urlDat["semantic"]:
+                this_is_science = True
+        if verbose:
+            print(corpus)
+            print(this_is_science, "this_is_science")
+        urlDat["big_words"] = [word for word in corpus if len(word) > 40]
+        ignoreSingleSentences = 1
+
+        corpus = cleanup_pretagger_all(corpus)
+        if verbose:
+            st.text("pretagger all")
+            st.text(type(corpus))
+
+        tokens = word_tokenize(corpus)
+        if verbose:
+            st.text("token input")
+            st.text(tokens)
+        tokens = [t for t in tokens if t not in not_want_list]
+        if np.mean([len(t) for t in tokens]) > 40:
+            return {}
+        tokens = [t for t in tokens if len(t) < 40]
+        if verbose:
+            st.text("token input")
+            st.text(tokens)
+        wc, sc, sylCount, remainingText, wordLen = countWordsSentSyl(
+            tokens, ignoreSingleSentences=1
+        )
+
+        if len(tokens) < WORD_LIM:
+            return {}
+        if len(tokens) >= WORD_LIM:
+
+            remainingText = " ".join(remainingText)
+            remainingText = remainingText.lower()
+            if wc > 0 and sc > 0:
+                meanv, total, hard_snippet = complexityAlongtheText(
+                    corpus, chunk_length=128
+                )
+                urlDat["standard_unbiased"] = meanv
+                urlDat["standard"] = total
+                if this_is_science:
+                    urlDat["hard_snippet"] = hard_snippet
+                else:
+                    urlDat["hard_snippet"] = None
+                # urlDat["fre_unbiased"] = freeAlongtheText(corpus)
+                # fre = FRE(wc, sc, sylCount)
+                # ndc = NDC(
+                # 	remainingText, wc, sc
+                # )  # calc NDC index and percentage of difficult words
+                # urlDat["fre"] = fre  # textstat.text_standard(corpus, float_output=True)
+                # urlDat["ndc"] = ndc[0]
+                # textstat.text_standard(corpus, float_output=True)
+                # https://stackoverflow.com/questions/62492797/get-bibliography-list-and-its-count-from-text-python
+
+            # if urlDat["fre_unbiased"]< urlDat["standard"] and urlDat["fre_unbiased"]>0:
+            # 	urlDat["standard"] = urlDat["fre_unbiased"]
+            if (
+                "standard_unbiased" in urlDat
+                and urlDat["standard_unbiased"] < urlDat["standard"]
+                and urlDat["standard_unbiased"] > 0
+            ):
+                urlDat["standard"] = urlDat["standard_unbiased"]
+
+            # urlDat["concensus"] = np.mean(
+            # 	[
+            # 		np.mean(urlDat["fre"]),
+            # 		np.mean(urlDat["ndc"]),
+            # 		np.mean(urlDat["standard_unbiased"]),
+            # 	]
+            # )
+            tokens = [w.lower() for w in tokens if w.isalpha()]
+            tokens = [w.lower() for w in tokens]  # make everything lower case
+            urlDat["wcount"] = textstat.lexicon_count(str(tokens))
+            word_lim = bool(urlDat["wcount"] > WORD_LIM)
+            urlDat["tokens"] = tokens
+
+            if len(tokens):
+                lexicon = textstat.lexicon_count(corpus, True)
+                urlDat["uniqueness"] = len(set(tokens)) / float(len(tokens))
+                urlDat["unique_words"] = len(set(tokens))
+
+                # It's harder to have a good uniqueness ratio in a long document, as 'and', 'the' and 'a' will dominate.
+                # Big deltas mean redundancy / sparse information density.
+
+                testimonial = TextBlob(corpus)
+                urlDat["sp"] = testimonial.sentiment.polarity
+                urlDat["ss"] = testimonial.sentiment.subjectivity
+                urlDat["sp_norm"] = np.abs(testimonial.sentiment.polarity)
+                urlDat["ss_norm"] = np.abs(testimonial.sentiment.subjectivity)
+                urlDat["gf"] = textstat.gunning_fog(corpus)
+    if "standard" in urlDat.keys():
+        if urlDat["standard"] == 0:
+            return None
+
+    return urlDat
 
 
 def process_dics(urlDats):
-	dfs = []
-	for urlDat in tqdm(urlDats):
-		# pandas Data frames are best data container for maths/stats, but steep learning curve.
-		# Other exclusion criteria. Exclude reading levels above grade 100,
-		# as this is most likely a problem with the metric algorithm, and or rubbish data in.
-		# TODO: speed everything up, by performing exclusion criteri above not here.
-		if len(dfs) == 0:
-			dfs = pd.DataFrame(pd.Series(urlDat)).T
-		dfs = pd.concat([dfs, pd.DataFrame(pd.Series(urlDat)).T])
-	return dfs
+    dfs = []
+    for urlDat in tqdm(urlDats):
+        # pandas DataFrames are the best data container for maths/stats, but they have a steep learning curve.
+        # Other exclusion criteria: exclude reading levels above grade 100,
+        # as this most likely indicates a problem with the metric algorithm and/or rubbish input data.
+        # TODO: speed everything up by applying the exclusion criteria above, not here.
+        if len(dfs) == 0:
+            dfs = pd.DataFrame(pd.Series(urlDat)).T
+        dfs = pd.concat([dfs, pd.DataFrame(pd.Series(urlDat)).T])
+    return dfs
diff --git a/science_access/word_cloud_by_word_len.py b/science_access/word_cloud_by_word_len.py
index 735b875..65cf7ab 100644
--- a/science_access/word_cloud_by_word_len.py
+++ b/science_access/word_cloud_by_word_len.py
@@ -86,7 +86,9 @@ import copy
 
 from nltk.tokenize import word_tokenize
 import streamlit as st
-def generate_from_lengths(self, words, max_font_size=None,verbose=False):  # noqa: C901
+
+
+def generate_from_lengths(self, words, max_font_size=None, verbose=False):  # noqa: C901
     """Create a word_cloud from words and frequencies.
     Parameters
     ----------
@@ -110,9 +112,9 @@ def generate_from_lengths(self, words, max_font_size=None,verbose=False):  # noq
     self.max_words = 50
     words = word_tokenize(words)
     wordss = list(set(words))
-    wordss = [word for word in wordss if len(word)<20]
+    wordss = [word for word in wordss if len(word) < 20]
 
-    sizes = [len(word) for word in wordss if len(word)<20]
+    sizes = [len(word) for word in wordss if len(word) < 20]
 
     if verbose:
         st.text(wordss)
@@ -120,10 +122,7 @@ def generate_from_lengths(self, words, max_font_size=None,verbose=False):  # noq
 
     max_len = np.max(sizes)
 
-    frequencies = [
-        (word, word_len / max_len)
-        for word, word_len in zip(words, sizes)
-    ]
+    frequencies = [(word, word_len / max_len) for word, word_len in zip(words, sizes)]
     frequencies = sorted(frequencies, key=lambda item: item[1], reverse=True)
     max_frequency = float(frequencies[0][1])
 
@@ -168,10 +167,10 @@ def generate_from_lengths(self, words, max_font_size=None,verbose=False):  # noq
             # we only have one word. We make it big!
             font_size = self.height
         else:
-            #font_size = self.height
-            #self.generate_from_frequencies(
+            # font_size = self.height
+            # self.generate_from_frequencies(
             #    dict(frequencies), max_font_size=self.height
-            #)
+            # )
             # find font sizes
             sizes = [x[1] for x in self.layout_]
             try:
-- 
GitLab