diff --git a/science_access/enter_author_name.py b/science_access/enter_author_name.py
index 2cab4ff143758bb0ce9f53915b5bfcd675d87fdf..9a9ea82d6bb7a96bf5fe6aafb01e3cfcd213685b 100644
--- a/science_access/enter_author_name.py
+++ b/science_access/enter_author_name.py
@@ -110,7 +110,7 @@ def zipf_plot(word_counts_fz):
# @st.cache
-def art_cloud_wl(acorpus:str=""):
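+# word cloud whose font sizes track word length, via the monkey-patched generate_from_lengths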
+def art_cloud_wl(acorpus: str = ""):
WC = WordCloud(background_color="white")
WC.generate_from_lengths = MethodType(generate_from_lengths, WC)
fig = plt.figure()
@@ -121,7 +121,7 @@ def art_cloud_wl(acorpus:str=""):
if type(acorpus) is type(""):
wordcloud = WC.generate_from_lengths(acorpus)
- if not 'wordcloud' in locals():
+    if "wordcloud" not in locals():
return None, None, None
biggest_words = WC.biggest_words
@@ -142,7 +142,7 @@ def zipf_wrapper(acorpus):
# @st.cache
-def art_cloud(acorpus:str=""):
+def art_cloud(acorpus: str = ""):
# Generate a word cloud image
WC = WordCloud(background_color="white")
@@ -158,11 +158,12 @@ def art_cloud(acorpus:str=""):
return wordcloud, fig, plt
-def fast_art_cloud(acorpus:str=""):
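+# convenience wrapper: build the word cloud and render it straight into the Streamlit app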
+def fast_art_cloud(acorpus: str = ""):
wordcloud, fig, plt = art_cloud(acorpus)
st.pyplot(fig)
return fig
+
def create_giant_strings(ar, not_want_list):
sci_corpus = ""
first_pass = []
@@ -327,11 +328,13 @@ def grand_distribution_plot(ar, scraped_labels, standard_sci, df0, author_name="
fig.update_layout(width=900, height=600) # , hovermode='x')
return df1, fig
-from typing import List,Any
+
+from typing import List, Any
import pandas as pd
-#import streamlit as st
-#List
-def push_frame_to_screen(contents:Any, readability_vector:List)->pd.DataFrame():
+
+# import streamlit as st
+# List
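+# collect the scraped links and readability scores into a pandas DataFrame for display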
+def push_frame_to_screen(contents: Any, readability_vector: List) -> pd.DataFrame:
if type(contents) is type(list()):
df_links = pd.DataFrame()
df_links["Web_Link"] = pd.Series(contents)
diff --git a/science_access/online_app_backend.py b/science_access/online_app_backend.py
index 0c6d6ff5afbcc9881f22fadb9da754b6b7897773..1783f87f7e0a9e76326667fa53b0af655448e02a 100644
--- a/science_access/online_app_backend.py
+++ b/science_access/online_app_backend.py
@@ -2,8 +2,9 @@ from typing import List
import PyPDF2
from pathlib import Path
import copy
-#import matplotlib.pyplot as plt
-#import seaborn as sns
+
+# import matplotlib.pyplot as plt
+# import seaborn as sns
import semanticscholar as sch
import os.path
@@ -11,7 +12,7 @@ import pdb
import pickle
from collections import OrderedDict
-#import IPython.display as d
+# import IPython.display as d
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
@@ -188,6 +189,7 @@ def author_to_urls(NAME):
dois.append(li[1])
return dois, coauthors, titles, visit_urls
+
def visit_link(NAME, tns, more_links):
"""
inputs a URL that's full of publication orientated links, preferably the
@@ -209,6 +211,7 @@ def visit_link(NAME, tns, more_links):
return author_results, visit_urls
+
def visit_semantic_scholar_abstracts(NAME, tns, more_links):
"""
inputs a URL that's full of publication orientated links, preferably the
@@ -219,18 +222,18 @@ def visit_semantic_scholar_abstracts(NAME, tns, more_links):
aliases = None
dois, coauthors, titles, visit_urls = author_to_urls(NAME)
for d in dois:
- paper = sch.paper(d, timeout=6)
+ paper = sch.paper(d, timeout=6)
urlDat = {}
- urlDat["link"] = paper['url']
+ urlDat["link"] = paper["url"]
urlDat["semantic"] = True
if aliases is None:
try:
- aliases = get_aliases_and_papers(paper,NAME)
+ aliases = get_aliases_and_papers(paper, NAME)
urlDat["aliases"] = aliases
- print(urlDat["aliases"],'aliases')
+ print(urlDat["aliases"], "aliases")
except:
pass
- urlDat = text_proc(str(paper['abstract']), urlDat)
+ urlDat = text_proc(str(paper["abstract"]), urlDat)
author_results.append(urlDat)
author_results = [
urlDat for urlDat in author_results if not isinstance(urlDat, type(None))
@@ -239,13 +242,15 @@ def visit_semantic_scholar_abstracts(NAME, tns, more_links):
return author_results, visit_urls
-def get_aliases_and_papers(paper,NAME):
- if 'authors' in paper.keys():
- for author_ in paper['authors']:
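+# pull the author's name aliases out of a Semantic Scholar paper record, when present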
+def get_aliases_and_papers(paper, NAME):
+    aliases = None  # stays None if the author or their aliases are missing
+    if "authors" in paper.keys():
+        for author_ in paper["authors"]:
if NAME in author_:
- if 'aliases' in author_.keys():
- aliases = author_['aliases']
+ if "aliases" in author_.keys():
+ aliases = author_["aliases"]
return aliases
+
+
def visit_link_unpaywall(NAME, tns, visit_urls):
"""
inputs a URL that's full of publication orientated links, preferably the
@@ -303,7 +308,6 @@ def unpaywall_semantic_links(NAME, tns):
r0 = str("https://api.semanticscholar.org/") + str(doi_)
visit_more_urls.append(r0)
-
r = (
str("https://api.unpaywall.org/v2/")
+ str(doi_)
@@ -327,7 +331,8 @@ def unpaywall_semantic_links(NAME, tns):
visit_more_urls.append(res)
return visit_more_urls
-def convert_pdf_to_txt(content,verbose=False):
+
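+# extract plain text from a downloaded PDF (pdfminer-style page interpreter); returns "" when the text looks garbled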
+def convert_pdf_to_txt(content, verbose=False):
# https://github.com/allenai/science-parse/blob/master/server/README.md
# os.subprocess(curl -v -H "Content-type: application/pdf" --data-binary @paper.pdf "http://scienceparse.allenai.org/v1")
try:
@@ -340,11 +345,11 @@ def convert_pdf_to_txt(content,verbose=False):
write_text = ""
for page in PDFPage.create_pages(document):
interpreter.process_page(page)
- write_text += " "+retstr.getvalue()+" "
+ write_text += " " + retstr.getvalue() + " "
# Process all pages in the document
text = str(write_text)
- mean_word_len = np.mean([ len(t) for t in text ])
- if mean_word_len>33:
+        mean_word_len = np.mean([len(t) for t in text.split()])
+ if mean_word_len > 33:
return str("")
if verbose:
@@ -410,7 +415,9 @@ def process(link, driver): # , REDIRECT=False):
def update_web_form(NAME, tns):
more_links = unpaywall_semantic_links(NAME, tns)
- author_results_temp, visit_urls_temp = visit_semantic_scholar_abstracts(NAME, tns, more_links)
+ author_results_temp, visit_urls_temp = visit_semantic_scholar_abstracts(
+ NAME, tns, more_links
+ )
author_results, visit_urls = visit_link(NAME, tns, more_links)
author_results.extend(author_results_temp)
ar = copy.copy(author_results)
@@ -431,7 +438,7 @@ def find_nearest(array, value):
return idx
-def ar_manipulation(ar:List=[]):
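+# filter the scraped results: drop None/str entries and keep only dicts with a "standard" score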
+def ar_manipulation(ar: List = []):
ar = [tl for tl in ar if tl is not None]
ar = [tl for tl in ar if type(tl) is not type(str(""))]
ar = [tl for tl in ar if "standard" in tl.keys()]
@@ -443,11 +450,12 @@ def ar_manipulation(ar:List=[]):
return (ar, trainingDats)
-def call_from_front_end(NAME:str="", OPENACCESS:bool=True, tns:int=16):
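+# front-end entry point; e.g. ar = call_from_front_end("Jane Doe")
+# (the author name above is purely illustrative)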
+def call_from_front_end(NAME: str = "", OPENACCESS: bool = True, tns: int = 16):
df, datay, ar = update_web_form(NAME, tns)
(ar, trainingDats) = ar_manipulation(ar)
return ar
+
def metricss(rg):
if isinstance(rg, list):
pub_count = len(rg)
diff --git a/science_access/t_analysis.py b/science_access/t_analysis.py
index 54ae8abe0a070b31c9a32358372c1752a0991ac6..c3f3688b04285a6e68b15883c580b5e67a3a5992 100644
--- a/science_access/t_analysis.py
+++ b/science_access/t_analysis.py
@@ -25,7 +25,8 @@ from nltk.sentiment import SentimentAnalyzer
from nltk.tag.perceptron import PerceptronTagger
import nltk
from nltk.corpus import words as english_words
-#from nltk.tokenize import word_tokenize
+
+# from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np
@@ -46,262 +47,274 @@ from science_access.readabilityFunctions import countWordsSentSyl, NDC, FRE
tagger = PerceptronTagger(load=False)
not_want_list = [
- "article",
- "articlepubmedpubmed",
- "et",
- "al",
- "text",
- "crossref",
- "isigoogle",
- "cross",
- "ref",
- "google",
- "scholar",
- "article",
- "pubmed",
- "full",
- "doi",
- "org",
- "http",
- "copyright",
- "org",
- "figure",
- "pubmed",
- "accessshoping",
- "articlepubmedpubmed",
- "author",
+ "article",
+ "articlepubmedpubmed",
+ "et",
+ "al",
+ "text",
+ "crossref",
+ "isigoogle",
+ "cross",
+ "ref",
+ "google",
+ "scholar",
+ "article",
+ "pubmed",
+ "full",
+ "doi",
+ "org",
+ "http",
+ "copyright",
+ "org",
+ "figure",
+ "pubmed",
+ "accessshoping",
+ "articlepubmedpubmed",
+ "author",
]
-not_want_list.extend(["link","librarian","issue","abstract","science","cookie","publication"])
+not_want_list.extend(
+ ["link", "librarian", "issue", "abstract", "science", "cookie", "publication"]
+)
def create_giant_strings(ar, not_want_list):
- sci_corpus = ""
- first_pass = []
- for t in ar:
- if "tokens" in t.keys():
- for s in t["tokens"]:
- if s not in not_want_list:
- first_pass.append(s)
- first_pass = set(first_pass)
- for s in first_pass:
- if "/" in s:
- temp = s.split("/") # , " ")
- sci_corpus += str(" ") + temp[0]
- sci_corpus += str(" ") + temp[1]
- if "." in s:
- temp = s.split(".") # , " ")
- sci_corpus += str(" ") + temp[0]
- sci_corpus += str(" ") + temp[1]
- if s not in set(not_want_list):
- sci_corpus += str(" ") + s # +str(' ')
- return sci_corpus
-
-
-#ENGLISH_STOPWORDS = set(nltk.corpus.stopwords.words("english"))
+ sci_corpus = ""
+ first_pass = []
+ for t in ar:
+ if "tokens" in t.keys():
+ for s in t["tokens"]:
+ if s not in not_want_list:
+ first_pass.append(s)
+ first_pass = set(first_pass)
+ for s in first_pass:
+ if "/" in s:
+ temp = s.split("/") # , " ")
+ sci_corpus += str(" ") + temp[0]
+ sci_corpus += str(" ") + temp[1]
+ if "." in s:
+ temp = s.split(".") # , " ")
+ sci_corpus += str(" ") + temp[0]
+ sci_corpus += str(" ") + temp[1]
+ if s not in set(not_want_list):
+ sci_corpus += str(" ") + s # +str(' ')
+ return sci_corpus
+
+
+# ENGLISH_STOPWORDS = set(nltk.corpus.stopwords.words("english"))
from typing import Union
-def complexityAlongtheText(text:str, chunk_length:int=5)->Union[float,float,str]:
- words = sent_tokenize(text)
- #words = #text.split()
- cur = 0
- stds = []
- hardest_chunk_index = 0
- while cur < len(words):
- sub = words[cur : cur + 5]
- sub_text = " ".join(sub)
- std = textstat.text_standard(sub_text, float_output=True)
- cur += chunk_length
- if std>hardest_chunk_index:
- hardest_chunk_index = cur
- stds.append(std)
- hard_snippet = words[hardest_chunk_index : hardest_chunk_index + chunk_length]
- hs=""
- for h in hard_snippet:
- hs+=h+str(" ")
- #st.text(hs)
- return np.mean(stds), textstat.text_standard(text, float_output=True), hs
-
-def freeAlongtheText(text:str, chunk_length:int=5)->float:
- #words = text.split()
- words = sent_tokenize(text)
-
- cur = 0
- stds = []
- fres = []
- while cur < len(words):
- sub = words[cur : cur + chunk_length]
- sub_text = " ".join(sub)
- wc, sc, sylCount, remainingText, wordLen = countWordsSentSyl(
- sub_text, ignoreSingleSentences=1
- )
- try:
- fre = FRE(wc, sc, sylCount)
- fres.append(fre)
- except:
- pass
- cur += chunk_length
- return np.mean(fres)
-
-
-def get_ref(references:str):
- for nubmer, line in enumerate(references, 1): # skip last element with page number
- line = line.strip()
- if line: # skip empty line
- authors_and_year = re.match("((.*)\. (\d{4})\.)", line)
- if type(authors_and_year) is not type(None):
- text, authors, year = authors_and_year.groups()
- names = re.split(",[ ]*and |,[ ]*| and ", authors)
- names = [(name, name.split(" ")[-1]) for name in names]
-
-def text_proc(corpus, urlDat={}, WORD_LIM=40,verbose=False):
- if type(corpus) is type(str()) and corpus not in str(
- "Redirecting"
- ): # and not str("privacy policy") in corpus:
-
-
- if str("some error has occurred while processing your request") in corpus:
- return {}
- if str("We apologize for the inconvenience...") in corpus:
- return {}
- if np.mean([ len(w) for w in corpus ])>35:
- return {}
-
- corpus = corpus.replace("/", " ") # remove characters that nltk can't read
- corpus = corpus.lower()
- corpus = corpus.replace(u"\xa0", u" ")
- corpus = corpus.replace(u"\\", u" ")
-
- if "abstract" in corpus[0:250]:
- posa = corpus.lower().find("abstract ")
- corpus = corpus[posa:]
- else:
- posa = False
-
- if "references" in corpus:
- posr = corpus.lower().find("references ")
- corpus = corpus[:posr]
- else:
- posr = False
- if "bibliography" in corpus:
- posb = corpus.lower().find("bibliography ")
- corpus = corpus[:posb]
- else:
- posb = False
- if "significance" in corpus:
- poss = corpus.lower().find("significance ")
- corpus = corpus[poss:]
- else:
- poss = False;
- if "purpose" in corpus[0:250]:
- posp = corpus.lower().find("purpose")
- corpus = corpus[:posp]
- else:
- posp = False
-
- if (posa and (posb or posr)) or poss and posp:
- this_is_science = True
- else:
- # if its not science its probably a junk web page.
- this_is_science = False
- if "semantic" in urlDat.keys():
- if urlDat["semantic"]:
- this_is_science = True
- print(corpus)
- print(this_is_science,'this_is_science')
- urlDat["big_words"] = [word for word in corpus if len(word) > 40]
- ignoreSingleSentences = 1
-
- corpus = cleanup_pretagger_all(corpus)
- if verbose:
- st.text('pretagger all')
- st.text(type(corpus))
-
- tokens = word_tokenize(corpus)
- if verbose:
- st.text("token input")
- st.text(tokens)
- tokens = [ t for t in tokens if t not in not_want_list]
- if np.mean([ len(t) for t in tokens ])>40:
- return {}
- tokens = [ t for t in tokens if len(t)<40 ]
- if verbose:
- st.text("token input")
- st.text(tokens)
- wc, sc, sylCount, remainingText, wordLen = countWordsSentSyl(
- tokens, ignoreSingleSentences=1
- )
-
- if len(tokens) < WORD_LIM:
- return {}
- if len(tokens) >= WORD_LIM:
-
- remainingText = " ".join(remainingText)
- remainingText = remainingText.lower()
- if wc > 0 and sc > 0:
- meanv,total,hard_snippet = complexityAlongtheText(corpus, chunk_length=128)
- urlDat["standard_unbiased"] = meanv
- urlDat["standard"] = total
- if this_is_science:
- urlDat["hard_snippet"] = hard_snippet
- else:
- urlDat["hard_snippet"] = None
- #urlDat["fre_unbiased"] = freeAlongtheText(corpus)
- #fre = FRE(wc, sc, sylCount)
- #ndc = NDC(
- # remainingText, wc, sc
- #) # calc NDC Index and Perctage Diff Words #calc NDC index
- #urlDat["fre"] = fre # textstat.text_standard(corpus, float_output=True)
- #urlDat["ndc"] = ndc[0]
- # textstat.text_standard(corpus, float_output=True)
- # https://stackoverflow.com/questions/62492797/get-bibliography-list-and-its-count-from-text-python
-
- #if urlDat["fre_unbiased"]< urlDat["standard"] and urlDat["fre_unbiased"]>0:
- # urlDat["standard"] = urlDat["fre_unbiased"]
- if urlDat["standard_unbiased"]< urlDat["standard"] and urlDat["standard_unbiased"]>0:
- urlDat["standard"] = urlDat["standard_unbiased"]
-
- #urlDat["concensus"] = np.mean(
- # [
- # np.mean(urlDat["fre"]),
- # np.mean(urlDat["ndc"]),
- # np.mean(urlDat["standard_unbiased"]),
- # ]
- #)
- tokens = [w.lower() for w in tokens if w.isalpha()]
- tokens = [w.lower() for w in tokens] # make everything lower case
- urlDat["wcount"] = textstat.lexicon_count(str(tokens))
- word_lim = bool(urlDat["wcount"] > WORD_LIM)
- urlDat["tokens"] = tokens
-
- if len(tokens):
- lexicon = textstat.lexicon_count(corpus, True)
- urlDat["uniqueness"] = len(set(tokens)) / float(len(tokens))
- urlDat["unique_words"] = len(set(tokens))
-
- # It's harder to have a good unique ratio in a long document, as 'and', 'the' and 'a', will dominate.
- # big deltas mean redudancy/sparse information/information/density
-
- testimonial = TextBlob(corpus)
- urlDat["sp"] = testimonial.sentiment.polarity
- urlDat["ss"] = testimonial.sentiment.subjectivity
- urlDat["sp_norm"] = np.abs(testimonial.sentiment.polarity)
- urlDat["ss_norm"] = np.abs(testimonial.sentiment.subjectivity)
- urlDat["gf"] = textstat.gunning_fog(corpus)
- if "standard" in urlDat.keys():
- if urlDat["standard"] == 0:
- return None
-
- return urlDat
+from typing import Tuple
+
+
+def complexityAlongtheText(
+    text: str, chunk_length: int = 5
+) -> Tuple[float, float, str]:
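+    # score the text in windows of chunk_length sentences with textstat.text_standard,
+    # keeping the mean grade, the whole-text grade, and the hardest window's snippet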
+ words = sent_tokenize(text)
+ # words = #text.split()
+ cur = 0
+ stds = []
+ hardest_chunk_index = 0
+    while cur < len(words):
+        sub = words[cur : cur + chunk_length]
+        sub_text = " ".join(sub)
+        std = textstat.text_standard(sub_text, float_output=True)
+        # remember where the hardest (highest-grade) chunk starts
+        if not stds or std > max(stds):
+            hardest_chunk_index = cur
+        stds.append(std)
+        cur += chunk_length
+ hard_snippet = words[hardest_chunk_index : hardest_chunk_index + chunk_length]
+ hs = ""
+ for h in hard_snippet:
+ hs += h + str(" ")
+ # st.text(hs)
+ return np.mean(stds), textstat.text_standard(text, float_output=True), hs
+
+
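+# mean FRE (Flesch Reading Ease) over chunk_length-sentence windows of the text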
+def freeAlongtheText(text: str, chunk_length: int = 5) -> float:
+ # words = text.split()
+ words = sent_tokenize(text)
+
+ cur = 0
+ stds = []
+ fres = []
+ while cur < len(words):
+ sub = words[cur : cur + chunk_length]
+ sub_text = " ".join(sub)
+ wc, sc, sylCount, remainingText, wordLen = countWordsSentSyl(
+ sub_text, ignoreSingleSentences=1
+ )
+ try:
+ fre = FRE(wc, sc, sylCount)
+ fres.append(fre)
+        except Exception:
+ pass
+ cur += chunk_length
+ return np.mean(fres)
+
+
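+# rough bibliography parser: pull "Authors. YYYY." patterns and split out individual author names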
+def get_ref(references: str):
+    for number, line in enumerate(references, 1):  # skip last element with page number
+ line = line.strip()
+ if line: # skip empty line
+            authors_and_year = re.match(r"((.*)\. (\d{4})\.)", line)
+ if type(authors_and_year) is not type(None):
+ text, authors, year = authors_and_year.groups()
+ names = re.split(",[ ]*and |,[ ]*| and ", authors)
+ names = [(name, name.split(" ")[-1]) for name in names]
+
+
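+# main readability pipeline: clean the scraped corpus, tokenize it, and fill urlDat with readability and sentiment metrics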
+def text_proc(corpus, urlDat=None, WORD_LIM=40, verbose=False):
+    if urlDat is None:
+        urlDat = {}  # avoid a shared mutable-default dict accumulating keys across calls
+    # skip non-string input and pages that are just redirect notices
+    if isinstance(corpus, str) and str("Redirecting") not in corpus:
+        # and not str("privacy policy") in corpus:
+
+ if str("some error has occurred while processing your request") in corpus:
+ return {}
+ if str("We apologize for the inconvenience...") in corpus:
+ return {}
+        if np.mean([len(w) for w in corpus.split()]) > 35:
+ return {}
+
+ corpus = corpus.replace("/", " ") # remove characters that nltk can't read
+ corpus = corpus.lower()
+ corpus = corpus.replace(u"\xa0", u" ")
+ corpus = corpus.replace(u"\\", u" ")
+
+ if "abstract" in corpus[0:250]:
+ posa = corpus.lower().find("abstract ")
+ corpus = corpus[posa:]
+ else:
+ posa = False
+
+ if "references" in corpus:
+ posr = corpus.lower().find("references ")
+ corpus = corpus[:posr]
+ else:
+ posr = False
+ if "bibliography" in corpus:
+ posb = corpus.lower().find("bibliography ")
+ corpus = corpus[:posb]
+ else:
+ posb = False
+ if "significance" in corpus:
+ poss = corpus.lower().find("significance ")
+ corpus = corpus[poss:]
+ else:
+ poss = False
+ if "purpose" in corpus[0:250]:
+ posp = corpus.lower().find("purpose")
+ corpus = corpus[:posp]
+ else:
+ posp = False
+
+ if (posa and (posb or posr)) or poss and posp:
+ this_is_science = True
+ else:
+            # if it's not science, it's probably a junk web page.
+ this_is_science = False
+ if "semantic" in urlDat.keys():
+ if urlDat["semantic"]:
+ this_is_science = True
+ print(corpus)
+ print(this_is_science, "this_is_science")
+ urlDat["big_words"] = [word for word in corpus if len(word) > 40]
+ ignoreSingleSentences = 1
+
+ corpus = cleanup_pretagger_all(corpus)
+ if verbose:
+ st.text("pretagger all")
+ st.text(type(corpus))
+
+ tokens = word_tokenize(corpus)
+ if verbose:
+ st.text("token input")
+ st.text(tokens)
+ tokens = [t for t in tokens if t not in not_want_list]
+ if np.mean([len(t) for t in tokens]) > 40:
+ return {}
+ tokens = [t for t in tokens if len(t) < 40]
+ if verbose:
+ st.text("token input")
+ st.text(tokens)
+ wc, sc, sylCount, remainingText, wordLen = countWordsSentSyl(
+ tokens, ignoreSingleSentences=1
+ )
+
+ if len(tokens) < WORD_LIM:
+ return {}
+ if len(tokens) >= WORD_LIM:
+
+ remainingText = " ".join(remainingText)
+ remainingText = remainingText.lower()
+ if wc > 0 and sc > 0:
+ meanv, total, hard_snippet = complexityAlongtheText(
+ corpus, chunk_length=128
+ )
+ urlDat["standard_unbiased"] = meanv
+ urlDat["standard"] = total
+ if this_is_science:
+ urlDat["hard_snippet"] = hard_snippet
+ else:
+ urlDat["hard_snippet"] = None
+ # urlDat["fre_unbiased"] = freeAlongtheText(corpus)
+ # fre = FRE(wc, sc, sylCount)
+ # ndc = NDC(
+ # remainingText, wc, sc
+ # ) # calc NDC Index and Perctage Diff Words #calc NDC index
+ # urlDat["fre"] = fre # textstat.text_standard(corpus, float_output=True)
+ # urlDat["ndc"] = ndc[0]
+ # textstat.text_standard(corpus, float_output=True)
+ # https://stackoverflow.com/questions/62492797/get-bibliography-list-and-its-count-from-text-python
+
+ # if urlDat["fre_unbiased"]< urlDat["standard"] and urlDat["fre_unbiased"]>0:
+ # urlDat["standard"] = urlDat["fre_unbiased"]
+ if (
+ urlDat["standard_unbiased"] < urlDat["standard"]
+ and urlDat["standard_unbiased"] > 0
+ ):
+ urlDat["standard"] = urlDat["standard_unbiased"]
+
+ # urlDat["concensus"] = np.mean(
+ # [
+ # np.mean(urlDat["fre"]),
+ # np.mean(urlDat["ndc"]),
+ # np.mean(urlDat["standard_unbiased"]),
+ # ]
+ # )
+ tokens = [w.lower() for w in tokens if w.isalpha()]
+ tokens = [w.lower() for w in tokens] # make everything lower case
+ urlDat["wcount"] = textstat.lexicon_count(str(tokens))
+ word_lim = bool(urlDat["wcount"] > WORD_LIM)
+ urlDat["tokens"] = tokens
+
+ if len(tokens):
+ lexicon = textstat.lexicon_count(corpus, True)
+ urlDat["uniqueness"] = len(set(tokens)) / float(len(tokens))
+ urlDat["unique_words"] = len(set(tokens))
+
+            # It's harder to have a good uniqueness ratio in a long document, as 'and', 'the' and 'a' will dominate.
+            # Big deltas mean redundancy / sparse information density.
+
+ testimonial = TextBlob(corpus)
+ urlDat["sp"] = testimonial.sentiment.polarity
+ urlDat["ss"] = testimonial.sentiment.subjectivity
+ urlDat["sp_norm"] = np.abs(testimonial.sentiment.polarity)
+ urlDat["ss_norm"] = np.abs(testimonial.sentiment.subjectivity)
+ urlDat["gf"] = textstat.gunning_fog(corpus)
+ if "standard" in urlDat.keys():
+ if urlDat["standard"] == 0:
+ return None
+
+ return urlDat
def process_dics(urlDats):
- dfs = []
- for urlDat in tqdm(urlDats):
- # pandas Data frames are best data container for maths/stats, but steep learning curve.
- # Other exclusion criteria. Exclude reading levels above grade 100,
- # as this is most likely a problem with the metric algorithm, and or rubbish data in.
- # TODO: speed everything up, by performing exclusion criteri above not here.
- if len(dfs) == 0:
- dfs = pd.DataFrame(pd.Series(urlDat)).T
- dfs = pd.concat([dfs, pd.DataFrame(pd.Series(urlDat)).T])
- return dfs
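+# stack the per-document urlDat dicts into a DataFrame, one row per document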
+    dfs = []
+    for urlDat in tqdm(urlDats):
+        # pandas DataFrames are the best container for maths/stats, but have a steep learning curve.
+        # Other exclusion criteria: exclude reading levels above grade 100,
+        # as this is most likely a problem with the metric algorithm, and/or rubbish data in.
+        # TODO: speed everything up by performing the exclusion criteria above, not here.
+        if len(dfs) == 0:
+            dfs = pd.DataFrame(pd.Series(urlDat)).T
+        else:
+            dfs = pd.concat([dfs, pd.DataFrame(pd.Series(urlDat)).T])
+    return dfs
diff --git a/science_access/word_cloud_by_word_len.py b/science_access/word_cloud_by_word_len.py
index 735b8757344352c78b86112e8cafec23a3a7200a..65cf7ab727c94deeed14a0e8801f71753fe6378a 100644
--- a/science_access/word_cloud_by_word_len.py
+++ b/science_access/word_cloud_by_word_len.py
@@ -86,7 +86,9 @@ import copy
from nltk.tokenize import word_tokenize
import streamlit as st
-def generate_from_lengths(self, words, max_font_size=None,verbose=False): # noqa: C901
+
+
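+# behaves like WordCloud.generate_from_frequencies, but each word's weight is its normalized length rather than its count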
+def generate_from_lengths(self, words, max_font_size=None, verbose=False): # noqa: C901
"""Create a word_cloud from words and frequencies.
Parameters
----------
@@ -110,9 +112,9 @@ def generate_from_lengths(self, words, max_font_size=None,verbose=False): # noq
self.max_words = 50
words = word_tokenize(words)
wordss = list(set(words))
- wordss = [word for word in wordss if len(word)<20]
+ wordss = [word for word in wordss if len(word) < 20]
- sizes = [len(word) for word in wordss if len(word)<20]
+ sizes = [len(word) for word in wordss if len(word) < 20]
if verbose:
st.text(wordss)
@@ -120,10 +122,7 @@ def generate_from_lengths(self, words, max_font_size=None,verbose=False): # noq
max_len = np.max(sizes)
- frequencies = [
- (word, word_len / max_len)
- for word, word_len in zip(words, sizes)
- ]
+    frequencies = [(word, word_len / max_len) for word, word_len in zip(wordss, sizes)]
frequencies = sorted(frequencies, key=lambda item: item[1], reverse=True)
max_frequency = float(frequencies[0][1])
@@ -168,10 +167,10 @@ def generate_from_lengths(self, words, max_font_size=None,verbose=False): # noq
# we only have one word. We make it big!
font_size = self.height
else:
- #font_size = self.height
- #self.generate_from_frequencies(
+ # font_size = self.height
+ # self.generate_from_frequencies(
# dict(frequencies), max_font_size=self.height
- #)
+ # )
# find font sizes
sizes = [x[1] for x in self.layout_]
try: