diff --git a/app.py b/app.py
index 392f89638caf4165fec4bcc136394b94b4376164..17c56ee309318f071373665243d3c09704833409 100644
--- a/app.py
+++ b/app.py
@@ -6,7 +6,6 @@ In the age of growing science communication, this tendency for scientists to use
 To address this, we created a tool that uses a data-driven approach to provide authors with insights into the readability of the entirety of their published scholarly work with regard to other text repositories. The tool first quantifies an existing text repository [@Soldatova:2007] with complexity shown to be comparable to that of other scientific journals.
 The tool subsequently uses this output as a reference to show how the readability of user-selected written work compares to this source. Ultimately, this tool will expand upon current readability metrics by computing a more detailed and comparative look at the complexity of written text.
 We hope that this will allow scientists and other experts to better monitor the complexity of their writing relative to other text types, leading to the creation of more accessible online material. And perhaps more broadly contribute to an improved global communication and understanding of complex topics.
 
 Author: [Russell Jarvis](https://github.com/russelljjarvis)\n
-Author: [Patrick McGurrin](https://github.com/mcgurrgurr)\n
 """
@@ -35,10 +34,7 @@ from typing import List, Any
 from science_access.t_analysis import not_want_list
 not_want_list.extend(["link","librarian","issue","abstract","science","cookie","publication"])
-from science_access.online_app_backend import call_from_front_end
-from science_access.online_app_backend import ar_manipulation
-
-# from science_access import bokeh_word_cloud
+from science_access.online_app_backend import call_from_front_end,ar_manipulation
 
 from science_access.enter_author_name import (
     art_cloud,
@@ -73,7 +69,11 @@ rd_df["Origin"] = ["ReadabilityScienceDeclining" for i in rd_df["Origin"]]
 rd_labels = rd_df["Origin"]
 rd_level = rd_df["Reading_Level"]
+max = np.max(rd_df["Reading_Level"])
+
+#rd_df = rd_df.loc[sample(list(rd_df.index), 999)]
 rd_df = rd_df.loc[sample(list(rd_df.index), 999)]
+rd_df = rd_df[(rd_df["Reading_Level"] > 0)]
 
 with open("data/trainingDats.p", "rb") as f:
     trainingDats = pickle.load(f)
@@ -81,13 +81,14 @@ with open("data/trainingDats.p", "rb") as f:
 biochem_labels = art_df["Origin"]
 bio_chem_level = art_df["Reading_Level"]
 
-#@st.cache(suppress_st_warning=True)
+@st.cache(suppress_st_warning=True)
 def check_cache(author_name: str,verbose=0):  # ->Union[]
     with shelve.open("fast_graphs_splash.p") as db:
         flag = author_name in db
     if not flag:
         ar = call_from_front_end(author_name)
         scraped_labels, author_score = frame_to_lists(ar)
+
         ##
         # This shelve
         # caching wont scale on heroku.
@@ -130,7 +131,9 @@ def show_hardest_passage(ar:List=[])->str:
         if "hard_snippet" in ar[i].keys() and ar[i]["hard_snippet"] is not None:
             st.markdown("A hard to read passage from the authors work.")
             if str("can log in with their society credentials") not in ar[i]["hard_snippet"]:
-                st.error(ar[i]["hard_snippet"])
+                if len(ar[i]["hard_snippet"]):
+                    if "semantic" in ar[i].keys():
+                        st.error(ar[i]["hard_snippet"])
                 return ar[i]
@@ -153,7 +156,7 @@ def main():
     st.title("Search Reading Complexity of an Author")
     author_name = st.text_input("Enter Author Name:")
     st.markdown("""Entering a middle initial followed by ```.``` can change the accuracy of results.""")
-    st.markdown("""Eg. ```Sayali S. Phatak```""")
+    st.markdown("""Eg. Sayali S```.``` Phatak""")
 
     if author_name:
@@ -162,7 +165,6 @@ def main():
         df_author, merged_df = data_frames_from_scrape(
             ar, author_name, scraped_labels, author_score, art_df
         )
-        hard = show_hardest_passage(ar)
 
         """
         ### Links to articles obtained from the queried author.
@@ -203,7 +205,6 @@ def main():
                 round(np.mean(author_score)), 3
             )
         )
-        #try:
         st.markdown("""
         ### Word Frequency Word Cloud""")
         """
@@ -215,27 +216,32 @@ def main():
         """
 
         grab_setr = []
-        grab_set1 = []
+        grab_set_auth = []
 
-        for block in trainingDats:
-            grab_setr.extend(block["tokens"])
-        for block in ar:
-            grab_set1.extend(block["tokens"])
+        for paper in trainingDats:
+            grab_setr.extend(paper["tokens"])
+        for paper in ar:
+            grab_set_auth.extend(paper["tokens"])
 
         artset = list(grab_setr)
         artset.extend(not_want_list)
-        auth_set = list(set(grab_set1))
-        exclusive = [i for i in auth_set if i not in artset]
-        fig = fast_art_cloud(exclusive)
+        #auth_set = grab_set_auth
+        #exclusive = [i for i in grab_set_auth if i not in artset]
+        fig = fast_art_cloud(grab_set_auth)
+        hard = show_hardest_passage(ar)
+        st.markdown("-----")
         #fast_art_cloud(sci_corpus)
         clouds_by_big_words = True
         if clouds_by_big_words:
-            try:
-                sci_corpus = create_giant_strings(ar, not_want_list)
-                clouds_big_words(sci_corpus)
-            except:
-                pass
+            grab_set_auth = []
+            for paper in ar:
+                if "semantic" in paper.keys():
+                    grab_set_auth.extend(paper["tokens"])
+            sci_corpus = create_giant_strings(grab_set_auth, not_want_list)
+            clouds_big_words(sci_corpus)
+            #except:
+            #    pass
         if verbose:
             st.text(sci_corpus)
@@ -305,12 +311,6 @@ def main():
         #exclusive = create_giant_strings(ar, exclusive)
 
-        sentiment = []
-        uniqueness = []
-        for block in trainingDats:
-            uniqueness.append(block["uniqueness"])
-            sentiment.append(block["sp"])
-        temp = np.mean(sentiment) < np.mean([r["sp"] for r in ar])
         if "reading_time" in ar[0].keys():
             average_reading_time = [np.mean([r["reading_time"] for r in ar])]
@@ -323,21 +323,8 @@ def main():
             )
         )
 
-        st.markdown("""### Sentiment""")
-        st.markdown(
-            """It is {} that the mean sentiment of {}'s writing is more postive relative to that of Readability of the ART Corpus.
-            """.format(
-                temp, author_name
-            )
-        )
-
-        temp = "{0} positive sentiment".format(author_name)
-        labels = [temp, "ART Corpus positive sentiment"]
-        values = [np.mean([r["sp"] for r in ar]), np.mean(sentiment)]
-
-        # urlDat["reading_time"]
-        fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=0.3)])
-        st.write(fig)
+        df, met, author_results = update_web_form_full_text(NAME, tns)
+        (ar, trainingDats) = ar_manipulation(ar)
 
         """
         Here are a few additional established text sources of known complexity:
@@ -359,11 +346,11 @@ def main():
     st.markdown("-----")
     st.markdown("\n")
    st.markdown(
-        "Code Author: [Github](https://github.com/russelljjarvis/)"
+        "[Code Author: Russell J. Jarvis](https://github.com/russelljjarvis/)"
     )
     st.markdown(
-        "Source Code: [Github](https://github.com/russelljjarvis/ScienceAccess)"
+        "[Source Code: Github](https://github.com/russelljjarvis/ScienceAccess)"
     )
     st.markdown(
         """Note: Search applies [dissmin](https://dissemin.readthedocs.io/en/latest/api.html) semantic scholar and unpaywall APIs"""
     )
diff --git a/orcid_scrape.py b/orcid_scrape.py
index 1e27a2827180b8f4ffccff9928a881ed82f5b314..033448a7d5a07ba9c869a37c676fb06a137b9100 100644
--- a/orcid_scrape.py
+++ b/orcid_scrape.py
@@ -1,7 +1,7 @@
 import orcid
+
 api = orcid.PublicAPI(institution_key, institution_secret, sandbox=True)
-search_results = api.search('text:English', access_token=Token)
-#While creating a search query, it is possible to use a generator in order to reduce time needed to fetch a record.
-search_results = api.search_generator('text:English',
-                                      pagination=20)
+search_results = api.search("text:English", access_token=Token)
+# While creating a search query, it is possible to use a generator in order to reduce time needed to fetch a record.
+search_results = api.search_generator("text:English", pagination=20)
 first_result = next(search_results)
diff --git a/requirements.txt b/requirements.txt
index 16c5a80df0e4acf99780c9c2911f4d759d9c2be6..57701279d22da05880bcc96ee645d87913dd0aaf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,6 +5,7 @@ nltk
 selenium
 delver
 pdfminer
+semanticscholar
 pyvirtualdisplay
 textstat
 fsspec>=0.3.3
diff --git a/science_access/enter_author_name.py b/science_access/enter_author_name.py
index 9a9ea82d6bb7a96bf5fe6aafb01e3cfcd213685b..687cbe56f34ebc20bac94b562ab65d31d53c76d4 100644
--- a/science_access/enter_author_name.py
+++ b/science_access/enter_author_name.py
@@ -110,16 +110,21 @@ def zipf_plot(word_counts_fz):
 
 
 # @st.cache
-def art_cloud_wl(acorpus: str = ""):
+from typing import List
+def art_cloud_wl(acorpus):
     WC = WordCloud(background_color="white")
     WC.generate_from_lengths = MethodType(generate_from_lengths, WC)
     fig = plt.figure()
     if type(acorpus) is not type(str()):
-        tokens = word_tokenize(acorpus)
-        if len(tokens):
-            wordcloud = WC.generate_from_lengths(tokens)
-    if type(acorpus) is type(""):
-        wordcloud = WC.generate_from_lengths(acorpus)
+        temp_str = ""
+        for a in acorpus:
+            temp_str+=a+" "
+        acorpus = temp_str
+    #tokens = word_tokenize(acorpus)
+    #if len(tokens):
+    #    wordcloud = WC.generate_from_lengths(tokens)
+    #if type(acorpus) is type(""):
+    wordcloud = WC.generate_from_lengths(acorpus)
 
     if not "wordcloud" in locals():
         return None, None, None
@@ -133,16 +138,16 @@ def art_cloud_wl(acorpus: str = ""):
     st.pyplot(fig)
     return biggest_words, word_counts_fz, fig
 
-
+'''
 def zipf_wrapper(acorpus):
     tokens = list(word_tokenize(acorpus))
     zipf_plot(tokens)
 
-
+'''
 # @st.cache
-def art_cloud(acorpus: str = ""):
+def art_cloud(acorpus):
 
     # Generate a word cloud image
     WC = WordCloud(background_color="white")
@@ -150,6 +155,12 @@ def art_cloud(acorpus: str = ""):
     fig = plt.figure()
     # increase resolution by changing figure size
     # figsize=(25,25))
+    #if type(acorpus) is type(list()):
+    if type(acorpus) is not type(str()):
+        temp_str = ""
+        for a in acorpus:
+            temp_str+=a+" "
+        acorpus = temp_str
     wordcloud = WC.generate(acorpus)
     # interpolation "nearest decreases resolution."
     plt.imshow(wordcloud, aspect="auto", interpolation="bilinear")
@@ -157,8 +168,8 @@ def art_cloud(acorpus: str = ""):
     plt.tight_layout(pad=0)
     return wordcloud, fig, plt
 
-
-def fast_art_cloud(acorpus: str = ""):
+from typing import Any
+def fast_art_cloud(acorpus):
     wordcloud, fig, plt = art_cloud(acorpus)
     st.pyplot(fig)
     return fig
diff --git a/science_access/online_app_backend.py b/science_access/online_app_backend.py
index 1783f87f7e0a9e76326667fa53b0af655448e02a..3de4414a89b8bda81f2c324a147c005ab0793530 100644
--- a/science_access/online_app_backend.py
+++ b/science_access/online_app_backend.py
@@ -3,16 +3,13 @@ import PyPDF2
 from pathlib import Path
 import copy
 
-# import matplotlib.pyplot as plt
-# import seaborn as sns
 import semanticscholar as sch
 import os.path
 
-import pdb
-import pickle
+#import pdb
+#import pickle
 
 from collections import OrderedDict
 
-# import IPython.display as d
 import numpy as np
 import pandas as pd
 from bs4 import BeautifulSoup
@@ -414,6 +411,17 @@ def process(link, driver):  # , REDIRECT=False):
 
 
 def update_web_form(NAME, tns):
+    more_links = unpaywall_semantic_links(NAME, tns)
+    author_results_temp, visit_urls_temp = visit_semantic_scholar_abstracts(
+        NAME, tns, more_links
+    )
+    ar = copy.copy(author_results)
+    datax = filter_empty(ar)
+    met = metricss(ar)
+    df = pd.DataFrame(datax)
+    return df, met, author_results
+
+def update_web_form_full_text(NAME, tns):
     more_links = unpaywall_semantic_links(NAME, tns)
     author_results_temp, visit_urls_temp = visit_semantic_scholar_abstracts(
         NAME, tns, more_links
@@ -427,6 +435,7 @@ def update_web_form(NAME, tns):
     return df, met, author_results
 
 
+
 def enter_name_here(scholar_page, name, tns):
     df, datay, author_results = update_web_form(scholar_page, tns)
     return df, datay, author_results
diff --git a/science_access/t_analysis.py b/science_access/t_analysis.py
index c3f3688b04285a6e68b15883c580b5e67a3a5992..b05ffb7a6206642ae212b0f150dce5801b13ec3d 100644
--- a/science_access/t_analysis.py
+++ b/science_access/t_analysis.py
@@ -160,7 +160,41 @@ def get_ref(references: str):
         text, authors, year = authors_and_year.groups()
         names = re.split(",[ ]*and |,[ ]*| and ", authors)
         names = [(name, name.split(" ")[-1]) for name in names]
-
+def extract_science_block(corpus):
+    if "abstract" in corpus:
+        posa = corpus.lower().find("abstract ")
+        corpus = corpus[posa:]
+    else:
+        posa = False
+
+    if "references" in corpus:
+        posr = corpus.lower().find("references ")
+        corpus = corpus[:posr]
+    else:
+        posr = False
+    if "bibliography" in corpus:
+        posb = corpus.lower().find("bibliography ")
+        corpus = corpus[:posb]
+    else:
+        posb = False
+    if "significance" in corpus:
+        poss = corpus.lower().find("significance ")
+        corpus = corpus[poss:]
+    else:
+        poss = False
+    if "purpose" in corpus[0:250]:
+        posp = corpus.lower().find("purpose")
+        corpus = corpus[:posp]
+    else:
+        posp = False
+
+    if (posa and (posb or posr)) or poss and posp:
+        this_is_science = True
+    else:
+        # if its not science its probably a junk web page.
+        this_is_science = False
+
+    return corpus, this_is_science
 
 def text_proc(corpus, urlDat={}, WORD_LIM=40, verbose=False):
     if type(corpus) is type(str()) and corpus not in str(
@@ -178,42 +212,11 @@ def text_proc(corpus, urlDat={}, WORD_LIM=40, verbose=False):
         corpus = corpus.lower()
         corpus = corpus.replace(u"\xa0", u" ")
         corpus = corpus.replace(u"\\", u" ")
-
-        if "abstract" in corpus[0:250]:
-            posa = corpus.lower().find("abstract ")
-            corpus = corpus[posa:]
-        else:
-            posa = False
-
-        if "references" in corpus:
-            posr = corpus.lower().find("references ")
-            corpus = corpus[:posr]
-        else:
-            posr = False
-        if "bibliography" in corpus:
-            posb = corpus.lower().find("bibliography ")
-            corpus = corpus[:posb]
-        else:
-            posb = False
-        if "significance" in corpus:
-            poss = corpus.lower().find("significance ")
-            corpus = corpus[poss:]
-        else:
-            poss = False
-        if "purpose" in corpus[0:250]:
-            posp = corpus.lower().find("purpose")
-            corpus = corpus[:posp]
-        else:
-            posp = False
-
-        if (posa and (posb or posr)) or poss and posp:
-            this_is_science = True
-        else:
-            # if its not science its probably a junk web page.
-            this_is_science = False
+        corpus, this_is_science = extract_science_block(corpus)
         if "semantic" in urlDat.keys():
            if urlDat["semantic"]:
                this_is_science = True
+        print(corpus)
         print(this_is_science, "this_is_science")
         urlDat["big_words"] = [word for word in corpus if len(word) > 40]
@@ -255,31 +258,37 @@ def text_proc(corpus, urlDat={}, WORD_LIM=40, verbose=False):
             urlDat["hard_snippet"] = hard_snippet
         else:
             urlDat["hard_snippet"] = None
-        # urlDat["fre_unbiased"] = freeAlongtheText(corpus)
-        # fre = FRE(wc, sc, sylCount)
-        # ndc = NDC(
-        #    remainingText, wc, sc
-        # )  # calc NDC Index and Perctage Diff Words #calc NDC index
-        # urlDat["fre"] = fre  # textstat.text_standard(corpus, float_output=True)
-        # urlDat["ndc"] = ndc[0]
+        urlDat["fre_unbiased"] = freeAlongtheText(corpus)
+        fre = FRE(wc, sc, sylCount)
+        if "semantic" in urlDat.keys():
+            if urlDat["semantic"]:
+                ndc = NDC(
+                    remainingText, wc, sc
+                )  # calc NDC Index and Perctage Diff Words #calc NDC index
+                # urlDat["fre"] = fre  # textstat.text_standard(corpus, float_output=True)
+                urlDat["standard"] = ndc[0]
         # textstat.text_standard(corpus, float_output=True)
         # https://stackoverflow.com/questions/62492797/get-bibliography-list-and-its-count-from-text-python
-        # if urlDat["fre_unbiased"]< urlDat["standard"] and urlDat["fre_unbiased"]>0:
-        #    urlDat["standard"] = urlDat["fre_unbiased"]
+        if urlDat["fre_unbiased"]< urlDat["standard"] and urlDat["fre_unbiased"]>0:
+            urlDat["standard"] = urlDat["fre_unbiased"]
         if (
             urlDat["standard_unbiased"] < urlDat["standard"]
             and urlDat["standard_unbiased"] > 0
         ):
             urlDat["standard"] = urlDat["standard_unbiased"]
-
-        # urlDat["concensus"] = np.mean(
-        #    [
-        #        np.mean(urlDat["fre"]),
-        #        np.mean(urlDat["ndc"]),
-        #        np.mean(urlDat["standard_unbiased"]),
-        #    ]
-        # )
+        if fre<urlDat["standard"] and fre>0:
+            urlDat["standard"] = fre
+        if urlDat["standard"] > 60 and ndc[0]>0 and ndc[0]<60:
+            urlDat["standard"] = ndc[0]
+
+        urlDat["concensus"] = np.mean(
+            [
+                np.mean(urlDat["fre"]),
+                np.mean(urlDat["ndc"]),
+                np.mean(urlDat["standard_unbiased"]),
+            ]
+        )
         tokens = [w.lower() for w in tokens if w.isalpha()]
         tokens = [w.lower() for w in tokens]  # make everything lower case
         urlDat["wcount"] = textstat.lexicon_count(str(tokens))
diff --git a/science_access/word_cloud_by_word_len.py b/science_access/word_cloud_by_word_len.py
index 65cf7ab727c94deeed14a0e8801f71753fe6378a..b25ea901b90b67de395ef004bfdfda0bfba0f42b 100644
--- a/science_access/word_cloud_by_word_len.py
+++ b/science_access/word_cloud_by_word_len.py
@@ -109,12 +109,12 @@ def generate_from_lengths(self, words, max_font_size=None, verbose=False):  # no
     frequencies = frequencies[:self.max_words]
     """
     # largest entry will be 1
-    self.max_words = 50
+    self.max_words = 100
     words = word_tokenize(words)
     wordss = list(set(words))
-    wordss = [word for word in wordss if len(word) < 20]
+    wordss = [word for word in wordss if len(word)]
 
-    sizes = [len(word) for word in wordss if len(word) < 20]
+    sizes = [len(word) for word in wordss if len(word)]
     if verbose:
         st.text(wordss)
@@ -123,12 +123,13 @@ def generate_from_lengths(self, words, max_font_size=None, verbose=False):  # no
     max_len = np.max(sizes)
     frequencies = [(word, word_len / max_len) for word, word_len in zip(words, sizes)]
 
-    frequencies = sorted(frequencies, key=lambda item: item[1], reverse=True)
+    real_frequencies = [w for w in frequencies if w is not None]
+
+    frequencies = sorted(frequencies, key=lambda item: item[1],reverse=True)
     max_frequency = float(frequencies[0][1])
 
-    real_frequencies = [(wrapper)(w) for w in frequencies]
-    real_frequencies = [w for w in real_frequencies if w is not None]
-    frequencies = sorted(real_frequencies, key=lambda item: item[1], reverse=True)
+    #real_frequencies = [(wrapper)(w) for w in frequencies]
+    #frequencies = sorted(real_frequencies, key=lambda item: item[1])#, reverse=True)
     self.word_counts_fz = None
     self.word_counts_fz = frequencies
@@ -167,12 +168,8 @@ def generate_from_lengths(self, words, max_font_size=None, verbose=False):  # no
             # we only have one word. We make it big!
             font_size = self.height
         else:
-            # font_size = self.height
-            # self.generate_from_frequencies(
-            #    dict(frequencies), max_font_size=self.height
-            # )
-            # find font sizes
-            sizes = [x[1] for x in self.layout_]
+            if hasattr(self,'layout_'):
+                sizes = [x[1] for x in self.layout_]
             try:
                 font_size = int(2 * sizes[0] * sizes[1] / (sizes[0] + sizes[1]))
                 # quick fix for if self.layout_ contains less than 2 values
diff --git a/setup.sh b/setup.sh
index 498e22fbb022ca49de6712088d5c271971061568..66de9372e076573f12211eb32635ecfc9c023dfd 100644
--- a/setup.sh
+++ b/setup.sh
@@ -3,25 +3,25 @@
 #
 # download and install latest geckodriver for linux or mac.
 # required for selenium to drive a firefox browser.
-sudo apt-get update
-sudo apt-get install jq wget chromium-chromedriver firefox
-sudo python3 -m pip install -r requirements.txt
-sudo python3 -m pip install seaborn
-sudo python3 -m pip install bs4
-sudo python3 -m pip install natsort dask plotly tabulate
-sudo python3 -m conda install -c pyviz holoviews bokeh
-sudo conda install -c pyviz holoviews bokeh
-sudo python3 -m pip install git+https://github.com/pyviz/holoviews.git
+apt-get update
+apt-get install jq wget chromium-chromedriver firefox
+python3 -m pip install -r requirements.txt
+python3 -m pip install seaborn
+python3 -m pip install bs4
+python3 -m pip install natsort dask plotly tabulate
+python3 -m conda install -c pyviz holoviews bokeh
+conda install -c pyviz holoviews bokeh
+python3 -m pip install git+https://github.com/pyviz/holoviews.git
 
 # hack package installs:
 git clone https://github.com/pyviz/holoviews.git
-cd holoviews; sudo pip install -e .; cd ..;
+cd holoviews; pip install -e .; cd ..;
 
 git clone https://github.com/kermitt2/grobid_client_python
-cd grobid_client_python; sudo pip install -e .; cp grobid_client.py ..;cd ..;
+cd grobid_client_python; pip install -e .; cp grobid_client.py ..;cd ..;
 
 git clone https://github.com/dissemin/dissemin
-cd dissemin; sudo pip install -e .;cd ..;
+cd dissemin; pip install -e .;cd ..;
 
 wget https://ftp.mozilla.org/pub/firefox/releases/45.0.2/linux-x86_64/en-GB/firefox-45.0.2.tar.bz2
@@ -33,10 +33,11 @@
 tar -xvzf geckodriver*
 chmod +x geckodriver
 
 git clone https://russelljjarvis@github.com/russelljjarvis/CoauthorNetVis.git
-cd CoauthorNetVis; sudo pip install -e .;cd ..;
-
+cd CoauthorNetVis; pip install -e .;cd ..;
+git clone https://github.com/russelljjarvis/readabilityinscience
+cd readabilityinscience; pip install -e .;cd ..;