From b15adc3f34be25e29b0d767ae85e385ef17ee706 Mon Sep 17 00:00:00 2001 From: Russell Jarvis <russelljarvis@protonmail.com> Date: Thu, 11 Mar 2021 14:49:13 +1100 Subject: [PATCH] update --- science_access/enter_author_name.py | 19 +- science_access/online_app_backend.py | 50 ++- science_access/t_analysis.py | 515 ++++++++++++----------- science_access/word_cloud_by_word_len.py | 19 +- 4 files changed, 313 insertions(+), 290 deletions(-) diff --git a/science_access/enter_author_name.py b/science_access/enter_author_name.py index 2cab4ff..9a9ea82 100644 --- a/science_access/enter_author_name.py +++ b/science_access/enter_author_name.py @@ -110,7 +110,7 @@ def zipf_plot(word_counts_fz): # @st.cache -def art_cloud_wl(acorpus:str=""): +def art_cloud_wl(acorpus: str = ""): WC = WordCloud(background_color="white") WC.generate_from_lengths = MethodType(generate_from_lengths, WC) fig = plt.figure() @@ -121,7 +121,7 @@ def art_cloud_wl(acorpus:str=""): if type(acorpus) is type(""): wordcloud = WC.generate_from_lengths(acorpus) - if not 'wordcloud' in locals(): + if not "wordcloud" in locals(): return None, None, None biggest_words = WC.biggest_words @@ -142,7 +142,7 @@ def zipf_wrapper(acorpus): # @st.cache -def art_cloud(acorpus:str=""): +def art_cloud(acorpus: str = ""): # Generate a word cloud image WC = WordCloud(background_color="white") @@ -158,11 +158,12 @@ def art_cloud(acorpus:str=""): return wordcloud, fig, plt -def fast_art_cloud(acorpus:str=""): +def fast_art_cloud(acorpus: str = ""): wordcloud, fig, plt = art_cloud(acorpus) st.pyplot(fig) return fig + def create_giant_strings(ar, not_want_list): sci_corpus = "" first_pass = [] @@ -327,11 +328,13 @@ def grand_distribution_plot(ar, scraped_labels, standard_sci, df0, author_name=" fig.update_layout(width=900, height=600) # , hovermode='x') return df1, fig -from typing import List,Any + +from typing import List, Any import pandas as pd -#import streamlit as st -#List -def push_frame_to_screen(contents:Any, readability_vector:List)->pd.DataFrame(): + +# import streamlit as st +# List +def push_frame_to_screen(contents: Any, readability_vector: List) -> pd.DataFrame(): if type(contents) is type(list()): df_links = pd.DataFrame() df_links["Web_Link"] = pd.Series(contents) diff --git a/science_access/online_app_backend.py b/science_access/online_app_backend.py index 0c6d6ff..1783f87 100644 --- a/science_access/online_app_backend.py +++ b/science_access/online_app_backend.py @@ -2,8 +2,9 @@ from typing import List import PyPDF2 from pathlib import Path import copy -#import matplotlib.pyplot as plt -#import seaborn as sns + +# import matplotlib.pyplot as plt +# import seaborn as sns import semanticscholar as sch import os.path @@ -11,7 +12,7 @@ import pdb import pickle from collections import OrderedDict -#import IPython.display as d +# import IPython.display as d import numpy as np import pandas as pd from bs4 import BeautifulSoup @@ -188,6 +189,7 @@ def author_to_urls(NAME): dois.append(li[1]) return dois, coauthors, titles, visit_urls + def visit_link(NAME, tns, more_links): """ inputs a URL that's full of publication orientated links, preferably the @@ -209,6 +211,7 @@ def visit_link(NAME, tns, more_links): return author_results, visit_urls + def visit_semantic_scholar_abstracts(NAME, tns, more_links): """ inputs a URL that's full of publication orientated links, preferably the @@ -219,18 +222,18 @@ def visit_semantic_scholar_abstracts(NAME, tns, more_links): aliases = None dois, coauthors, titles, visit_urls = author_to_urls(NAME) for d in 
dois: - paper = sch.paper(d, timeout=6) + paper = sch.paper(d, timeout=6) urlDat = {} - urlDat["link"] = paper['url'] + urlDat["link"] = paper["url"] urlDat["semantic"] = True if aliases is None: try: - aliases = get_aliases_and_papers(paper,NAME) + aliases = get_aliases_and_papers(paper, NAME) urlDat["aliases"] = aliases - print(urlDat["aliases"],'aliases') + print(urlDat["aliases"], "aliases") except: pass - urlDat = text_proc(str(paper['abstract']), urlDat) + urlDat = text_proc(str(paper["abstract"]), urlDat) author_results.append(urlDat) author_results = [ urlDat for urlDat in author_results if not isinstance(urlDat, type(None)) @@ -239,13 +242,15 @@ def visit_semantic_scholar_abstracts(NAME, tns, more_links): return author_results, visit_urls -def get_aliases_and_papers(paper,NAME): - if 'authors' in paper.keys(): - for author_ in paper['authors']: +def get_aliases_and_papers(paper, NAME): + if "authors" in paper.keys(): + for author_ in paper["authors"]: if NAME in author_: - if 'aliases' in author_.keys(): - aliases = author_['aliases'] + if "aliases" in author_.keys(): + aliases = author_["aliases"] return aliases + + def visit_link_unpaywall(NAME, tns, visit_urls): """ inputs a URL that's full of publication orientated links, preferably the @@ -303,7 +308,6 @@ def unpaywall_semantic_links(NAME, tns): r0 = str("https://api.semanticscholar.org/") + str(doi_) visit_more_urls.append(r0) - r = ( str("https://api.unpaywall.org/v2/") + str(doi_) @@ -327,7 +331,8 @@ def unpaywall_semantic_links(NAME, tns): visit_more_urls.append(res) return visit_more_urls -def convert_pdf_to_txt(content,verbose=False): + +def convert_pdf_to_txt(content, verbose=False): # https://github.com/allenai/science-parse/blob/master/server/README.md # os.subprocess(curl -v -H "Content-type: application/pdf" --data-binary @paper.pdf "http://scienceparse.allenai.org/v1") try: @@ -340,11 +345,11 @@ def convert_pdf_to_txt(content,verbose=False): write_text = "" for page in PDFPage.create_pages(document): interpreter.process_page(page) - write_text += " "+retstr.getvalue()+" " + write_text += " " + retstr.getvalue() + " " # Process all pages in the document text = str(write_text) - mean_word_len = np.mean([ len(t) for t in text ]) - if mean_word_len>33: + mean_word_len = np.mean([len(t) for t in text]) + if mean_word_len > 33: return str("") if verbose: @@ -410,7 +415,9 @@ def process(link, driver): # , REDIRECT=False): def update_web_form(NAME, tns): more_links = unpaywall_semantic_links(NAME, tns) - author_results_temp, visit_urls_temp = visit_semantic_scholar_abstracts(NAME, tns, more_links) + author_results_temp, visit_urls_temp = visit_semantic_scholar_abstracts( + NAME, tns, more_links + ) author_results, visit_urls = visit_link(NAME, tns, more_links) author_results.extend(author_results_temp) ar = copy.copy(author_results) @@ -431,7 +438,7 @@ def find_nearest(array, value): return idx -def ar_manipulation(ar:List=[]): +def ar_manipulation(ar: List = []): ar = [tl for tl in ar if tl is not None] ar = [tl for tl in ar if type(tl) is not type(str(""))] ar = [tl for tl in ar if "standard" in tl.keys()] @@ -443,11 +450,12 @@ def ar_manipulation(ar:List=[]): return (ar, trainingDats) -def call_from_front_end(NAME:str="", OPENACCESS:bool=True, tns:int=16): +def call_from_front_end(NAME: str = "", OPENACCESS: bool = True, tns: int = 16): df, datay, ar = update_web_form(NAME, tns) (ar, trainingDats) = ar_manipulation(ar) return ar + def metricss(rg): if isinstance(rg, list): pub_count = len(rg) diff --git 
a/science_access/t_analysis.py b/science_access/t_analysis.py index 54ae8ab..c3f3688 100644 --- a/science_access/t_analysis.py +++ b/science_access/t_analysis.py @@ -25,7 +25,8 @@ from nltk.sentiment import SentimentAnalyzer from nltk.tag.perceptron import PerceptronTagger import nltk from nltk.corpus import words as english_words -#from nltk.tokenize import word_tokenize + +# from nltk.tokenize import word_tokenize from nltk.tokenize import sent_tokenize, word_tokenize import numpy as np @@ -46,262 +47,274 @@ from science_access.readabilityFunctions import countWordsSentSyl, NDC, FRE tagger = PerceptronTagger(load=False) not_want_list = [ - "article", - "articlepubmedpubmed", - "et", - "al", - "text", - "crossref", - "isigoogle", - "cross", - "ref", - "google", - "scholar", - "article", - "pubmed", - "full", - "doi", - "org", - "http", - "copyright", - "org", - "figure", - "pubmed", - "accessshoping", - "articlepubmedpubmed", - "author", + "article", + "articlepubmedpubmed", + "et", + "al", + "text", + "crossref", + "isigoogle", + "cross", + "ref", + "google", + "scholar", + "article", + "pubmed", + "full", + "doi", + "org", + "http", + "copyright", + "org", + "figure", + "pubmed", + "accessshoping", + "articlepubmedpubmed", + "author", ] -not_want_list.extend(["link","librarian","issue","abstract","science","cookie","publication"]) +not_want_list.extend( + ["link", "librarian", "issue", "abstract", "science", "cookie", "publication"] +) def create_giant_strings(ar, not_want_list): - sci_corpus = "" - first_pass = [] - for t in ar: - if "tokens" in t.keys(): - for s in t["tokens"]: - if s not in not_want_list: - first_pass.append(s) - first_pass = set(first_pass) - for s in first_pass: - if "/" in s: - temp = s.split("/") # , " ") - sci_corpus += str(" ") + temp[0] - sci_corpus += str(" ") + temp[1] - if "." in s: - temp = s.split(".") # , " ") - sci_corpus += str(" ") + temp[0] - sci_corpus += str(" ") + temp[1] - if s not in set(not_want_list): - sci_corpus += str(" ") + s # +str(' ') - return sci_corpus - - -#ENGLISH_STOPWORDS = set(nltk.corpus.stopwords.words("english")) + sci_corpus = "" + first_pass = [] + for t in ar: + if "tokens" in t.keys(): + for s in t["tokens"]: + if s not in not_want_list: + first_pass.append(s) + first_pass = set(first_pass) + for s in first_pass: + if "/" in s: + temp = s.split("/") # , " ") + sci_corpus += str(" ") + temp[0] + sci_corpus += str(" ") + temp[1] + if "." 
in s: + temp = s.split(".") # , " ") + sci_corpus += str(" ") + temp[0] + sci_corpus += str(" ") + temp[1] + if s not in set(not_want_list): + sci_corpus += str(" ") + s # +str(' ') + return sci_corpus + + +# ENGLISH_STOPWORDS = set(nltk.corpus.stopwords.words("english")) from typing import Union -def complexityAlongtheText(text:str, chunk_length:int=5)->Union[float,float,str]: - words = sent_tokenize(text) - #words = #text.split() - cur = 0 - stds = [] - hardest_chunk_index = 0 - while cur < len(words): - sub = words[cur : cur + 5] - sub_text = " ".join(sub) - std = textstat.text_standard(sub_text, float_output=True) - cur += chunk_length - if std>hardest_chunk_index: - hardest_chunk_index = cur - stds.append(std) - hard_snippet = words[hardest_chunk_index : hardest_chunk_index + chunk_length] - hs="" - for h in hard_snippet: - hs+=h+str(" ") - #st.text(hs) - return np.mean(stds), textstat.text_standard(text, float_output=True), hs - -def freeAlongtheText(text:str, chunk_length:int=5)->float: - #words = text.split() - words = sent_tokenize(text) - - cur = 0 - stds = [] - fres = [] - while cur < len(words): - sub = words[cur : cur + chunk_length] - sub_text = " ".join(sub) - wc, sc, sylCount, remainingText, wordLen = countWordsSentSyl( - sub_text, ignoreSingleSentences=1 - ) - try: - fre = FRE(wc, sc, sylCount) - fres.append(fre) - except: - pass - cur += chunk_length - return np.mean(fres) - - -def get_ref(references:str): - for nubmer, line in enumerate(references, 1): # skip last element with page number - line = line.strip() - if line: # skip empty line - authors_and_year = re.match("((.*)\. (\d{4})\.)", line) - if type(authors_and_year) is not type(None): - text, authors, year = authors_and_year.groups() - names = re.split(",[ ]*and |,[ ]*| and ", authors) - names = [(name, name.split(" ")[-1]) for name in names] - -def text_proc(corpus, urlDat={}, WORD_LIM=40,verbose=False): - if type(corpus) is type(str()) and corpus not in str( - "Redirecting" - ): # and not str("privacy policy") in corpus: - - - if str("some error has occurred while processing your request") in corpus: - return {} - if str("We apologize for the inconvenience...") in corpus: - return {} - if np.mean([ len(w) for w in corpus ])>35: - return {} - - corpus = corpus.replace("/", " ") # remove characters that nltk can't read - corpus = corpus.lower() - corpus = corpus.replace(u"\xa0", u" ") - corpus = corpus.replace(u"\\", u" ") - - if "abstract" in corpus[0:250]: - posa = corpus.lower().find("abstract ") - corpus = corpus[posa:] - else: - posa = False - - if "references" in corpus: - posr = corpus.lower().find("references ") - corpus = corpus[:posr] - else: - posr = False - if "bibliography" in corpus: - posb = corpus.lower().find("bibliography ") - corpus = corpus[:posb] - else: - posb = False - if "significance" in corpus: - poss = corpus.lower().find("significance ") - corpus = corpus[poss:] - else: - poss = False; - if "purpose" in corpus[0:250]: - posp = corpus.lower().find("purpose") - corpus = corpus[:posp] - else: - posp = False - - if (posa and (posb or posr)) or poss and posp: - this_is_science = True - else: - # if its not science its probably a junk web page. 
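# A standalone sketch of the chunk-wise readability scan in complexityAlongtheText
# above: the version in the patch hard-codes a 5-sentence window inside the loop even
# though chunk_length is a parameter, and it appears to compare a readability score
# against a list index when looking for the hardest chunk. This sketch assumes textstat
# and nltk (with the "punkt" data) are installed; chunked_text_standard is an
# illustrative name, not part of the patch.
import numpy as np
import textstat
from nltk.tokenize import sent_tokenize


def chunked_text_standard(text: str, chunk_length: int = 5):
    """Return (mean chunk grade, whole-text grade, hardest chunk) for text."""
    sentences = sent_tokenize(text)
    scores = []
    hardest_score = float("-inf")
    hardest_chunk = ""
    for start in range(0, len(sentences), chunk_length):
        chunk = " ".join(sentences[start : start + chunk_length])
        score = textstat.text_standard(chunk, float_output=True)
        scores.append(score)
        if score > hardest_score:
            # Track the hardest window by its score, not by its position.
            hardest_score = score
            hardest_chunk = chunk
    whole_text_score = textstat.text_standard(text, float_output=True)
    mean_score = float(np.mean(scores)) if scores else whole_text_score
    return mean_score, whole_text_score, hardest_chunk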
- this_is_science = False - if "semantic" in urlDat.keys(): - if urlDat["semantic"]: - this_is_science = True - print(corpus) - print(this_is_science,'this_is_science') - urlDat["big_words"] = [word for word in corpus if len(word) > 40] - ignoreSingleSentences = 1 - - corpus = cleanup_pretagger_all(corpus) - if verbose: - st.text('pretagger all') - st.text(type(corpus)) - - tokens = word_tokenize(corpus) - if verbose: - st.text("token input") - st.text(tokens) - tokens = [ t for t in tokens if t not in not_want_list] - if np.mean([ len(t) for t in tokens ])>40: - return {} - tokens = [ t for t in tokens if len(t)<40 ] - if verbose: - st.text("token input") - st.text(tokens) - wc, sc, sylCount, remainingText, wordLen = countWordsSentSyl( - tokens, ignoreSingleSentences=1 - ) - - if len(tokens) < WORD_LIM: - return {} - if len(tokens) >= WORD_LIM: - - remainingText = " ".join(remainingText) - remainingText = remainingText.lower() - if wc > 0 and sc > 0: - meanv,total,hard_snippet = complexityAlongtheText(corpus, chunk_length=128) - urlDat["standard_unbiased"] = meanv - urlDat["standard"] = total - if this_is_science: - urlDat["hard_snippet"] = hard_snippet - else: - urlDat["hard_snippet"] = None - #urlDat["fre_unbiased"] = freeAlongtheText(corpus) - #fre = FRE(wc, sc, sylCount) - #ndc = NDC( - # remainingText, wc, sc - #) # calc NDC Index and Perctage Diff Words #calc NDC index - #urlDat["fre"] = fre # textstat.text_standard(corpus, float_output=True) - #urlDat["ndc"] = ndc[0] - # textstat.text_standard(corpus, float_output=True) - # https://stackoverflow.com/questions/62492797/get-bibliography-list-and-its-count-from-text-python - - #if urlDat["fre_unbiased"]< urlDat["standard"] and urlDat["fre_unbiased"]>0: - # urlDat["standard"] = urlDat["fre_unbiased"] - if urlDat["standard_unbiased"]< urlDat["standard"] and urlDat["standard_unbiased"]>0: - urlDat["standard"] = urlDat["standard_unbiased"] - - #urlDat["concensus"] = np.mean( - # [ - # np.mean(urlDat["fre"]), - # np.mean(urlDat["ndc"]), - # np.mean(urlDat["standard_unbiased"]), - # ] - #) - tokens = [w.lower() for w in tokens if w.isalpha()] - tokens = [w.lower() for w in tokens] # make everything lower case - urlDat["wcount"] = textstat.lexicon_count(str(tokens)) - word_lim = bool(urlDat["wcount"] > WORD_LIM) - urlDat["tokens"] = tokens - - if len(tokens): - lexicon = textstat.lexicon_count(corpus, True) - urlDat["uniqueness"] = len(set(tokens)) / float(len(tokens)) - urlDat["unique_words"] = len(set(tokens)) - - # It's harder to have a good unique ratio in a long document, as 'and', 'the' and 'a', will dominate. 
- # big deltas mean redudancy/sparse information/information/density - - testimonial = TextBlob(corpus) - urlDat["sp"] = testimonial.sentiment.polarity - urlDat["ss"] = testimonial.sentiment.subjectivity - urlDat["sp_norm"] = np.abs(testimonial.sentiment.polarity) - urlDat["ss_norm"] = np.abs(testimonial.sentiment.subjectivity) - urlDat["gf"] = textstat.gunning_fog(corpus) - if "standard" in urlDat.keys(): - if urlDat["standard"] == 0: - return None - - return urlDat + + +def complexityAlongtheText( + text: str, chunk_length: int = 5 +) -> Union[float, float, str]: + words = sent_tokenize(text) + # words = #text.split() + cur = 0 + stds = [] + hardest_chunk_index = 0 + while cur < len(words): + sub = words[cur : cur + 5] + sub_text = " ".join(sub) + std = textstat.text_standard(sub_text, float_output=True) + cur += chunk_length + if std > hardest_chunk_index: + hardest_chunk_index = cur + stds.append(std) + hard_snippet = words[hardest_chunk_index : hardest_chunk_index + chunk_length] + hs = "" + for h in hard_snippet: + hs += h + str(" ") + # st.text(hs) + return np.mean(stds), textstat.text_standard(text, float_output=True), hs + + +def freeAlongtheText(text: str, chunk_length: int = 5) -> float: + # words = text.split() + words = sent_tokenize(text) + + cur = 0 + stds = [] + fres = [] + while cur < len(words): + sub = words[cur : cur + chunk_length] + sub_text = " ".join(sub) + wc, sc, sylCount, remainingText, wordLen = countWordsSentSyl( + sub_text, ignoreSingleSentences=1 + ) + try: + fre = FRE(wc, sc, sylCount) + fres.append(fre) + except: + pass + cur += chunk_length + return np.mean(fres) + + +def get_ref(references: str): + for nubmer, line in enumerate(references, 1): # skip last element with page number + line = line.strip() + if line: # skip empty line + authors_and_year = re.match("((.*)\. (\d{4})\.)", line) + if type(authors_and_year) is not type(None): + text, authors, year = authors_and_year.groups() + names = re.split(",[ ]*and |,[ ]*| and ", authors) + names = [(name, name.split(" ")[-1]) for name in names] + + +def text_proc(corpus, urlDat={}, WORD_LIM=40, verbose=False): + if type(corpus) is type(str()) and corpus not in str( + "Redirecting" + ): # and not str("privacy policy") in corpus: + + if str("some error has occurred while processing your request") in corpus: + return {} + if str("We apologize for the inconvenience...") in corpus: + return {} + if np.mean([len(w) for w in corpus]) > 35: + return {} + + corpus = corpus.replace("/", " ") # remove characters that nltk can't read + corpus = corpus.lower() + corpus = corpus.replace(u"\xa0", u" ") + corpus = corpus.replace(u"\\", u" ") + + if "abstract" in corpus[0:250]: + posa = corpus.lower().find("abstract ") + corpus = corpus[posa:] + else: + posa = False + + if "references" in corpus: + posr = corpus.lower().find("references ") + corpus = corpus[:posr] + else: + posr = False + if "bibliography" in corpus: + posb = corpus.lower().find("bibliography ") + corpus = corpus[:posb] + else: + posb = False + if "significance" in corpus: + poss = corpus.lower().find("significance ") + corpus = corpus[poss:] + else: + poss = False + if "purpose" in corpus[0:250]: + posp = corpus.lower().find("purpose") + corpus = corpus[:posp] + else: + posp = False + + if (posa and (posb or posr)) or poss and posp: + this_is_science = True + else: + # if its not science its probably a junk web page. 
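# A compact sketch of the marker-based check that text_proc applies above: a page is
# treated as a paper when it opens with an abstract and carries references or a
# bibliography, or when it has both significance and purpose sections (results flagged
# urlDat["semantic"] are trusted regardless). looks_like_science is an illustrative
# name, not taken from the patch.
def looks_like_science(corpus: str) -> bool:
    text = corpus.lower()
    has_abstract = "abstract" in text[:250]
    has_back_matter = "references" in text or "bibliography" in text
    has_significance = "significance" in text
    has_purpose = "purpose" in text[:250]
    return (has_abstract and has_back_matter) or (has_significance and has_purpose)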
+ this_is_science = False + if "semantic" in urlDat.keys(): + if urlDat["semantic"]: + this_is_science = True + print(corpus) + print(this_is_science, "this_is_science") + urlDat["big_words"] = [word for word in corpus if len(word) > 40] + ignoreSingleSentences = 1 + + corpus = cleanup_pretagger_all(corpus) + if verbose: + st.text("pretagger all") + st.text(type(corpus)) + + tokens = word_tokenize(corpus) + if verbose: + st.text("token input") + st.text(tokens) + tokens = [t for t in tokens if t not in not_want_list] + if np.mean([len(t) for t in tokens]) > 40: + return {} + tokens = [t for t in tokens if len(t) < 40] + if verbose: + st.text("token input") + st.text(tokens) + wc, sc, sylCount, remainingText, wordLen = countWordsSentSyl( + tokens, ignoreSingleSentences=1 + ) + + if len(tokens) < WORD_LIM: + return {} + if len(tokens) >= WORD_LIM: + + remainingText = " ".join(remainingText) + remainingText = remainingText.lower() + if wc > 0 and sc > 0: + meanv, total, hard_snippet = complexityAlongtheText( + corpus, chunk_length=128 + ) + urlDat["standard_unbiased"] = meanv + urlDat["standard"] = total + if this_is_science: + urlDat["hard_snippet"] = hard_snippet + else: + urlDat["hard_snippet"] = None + # urlDat["fre_unbiased"] = freeAlongtheText(corpus) + # fre = FRE(wc, sc, sylCount) + # ndc = NDC( + # remainingText, wc, sc + # ) # calc NDC Index and Perctage Diff Words #calc NDC index + # urlDat["fre"] = fre # textstat.text_standard(corpus, float_output=True) + # urlDat["ndc"] = ndc[0] + # textstat.text_standard(corpus, float_output=True) + # https://stackoverflow.com/questions/62492797/get-bibliography-list-and-its-count-from-text-python + + # if urlDat["fre_unbiased"]< urlDat["standard"] and urlDat["fre_unbiased"]>0: + # urlDat["standard"] = urlDat["fre_unbiased"] + if ( + urlDat["standard_unbiased"] < urlDat["standard"] + and urlDat["standard_unbiased"] > 0 + ): + urlDat["standard"] = urlDat["standard_unbiased"] + + # urlDat["concensus"] = np.mean( + # [ + # np.mean(urlDat["fre"]), + # np.mean(urlDat["ndc"]), + # np.mean(urlDat["standard_unbiased"]), + # ] + # ) + tokens = [w.lower() for w in tokens if w.isalpha()] + tokens = [w.lower() for w in tokens] # make everything lower case + urlDat["wcount"] = textstat.lexicon_count(str(tokens)) + word_lim = bool(urlDat["wcount"] > WORD_LIM) + urlDat["tokens"] = tokens + + if len(tokens): + lexicon = textstat.lexicon_count(corpus, True) + urlDat["uniqueness"] = len(set(tokens)) / float(len(tokens)) + urlDat["unique_words"] = len(set(tokens)) + + # It's harder to have a good unique ratio in a long document, as 'and', 'the' and 'a', will dominate. + # big deltas mean redudancy/sparse information/information/density + + testimonial = TextBlob(corpus) + urlDat["sp"] = testimonial.sentiment.polarity + urlDat["ss"] = testimonial.sentiment.subjectivity + urlDat["sp_norm"] = np.abs(testimonial.sentiment.polarity) + urlDat["ss_norm"] = np.abs(testimonial.sentiment.subjectivity) + urlDat["gf"] = textstat.gunning_fog(corpus) + if "standard" in urlDat.keys(): + if urlDat["standard"] == 0: + return None + + return urlDat def process_dics(urlDats): - dfs = [] - for urlDat in tqdm(urlDats): - # pandas Data frames are best data container for maths/stats, but steep learning curve. - # Other exclusion criteria. Exclude reading levels above grade 100, - # as this is most likely a problem with the metric algorithm, and or rubbish data in. - # TODO: speed everything up, by performing exclusion criteri above not here. 
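# The TODO above flags process_dics as slow; the row-by-row pd.concat below grows the
# frame one record at a time. A sketch of the usual alternative, collecting the
# per-article dicts and building the frame in a single call (dicts_to_frame is an
# illustrative name, not used by the patch):
import pandas as pd


def dicts_to_frame(url_dats) -> pd.DataFrame:
    rows = [d for d in url_dats if isinstance(d, dict) and d]
    return pd.DataFrame(rows)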
- if len(dfs) == 0: - dfs = pd.DataFrame(pd.Series(urlDat)).T - dfs = pd.concat([dfs, pd.DataFrame(pd.Series(urlDat)).T]) - return dfs + dfs = [] + for urlDat in tqdm(urlDats): + # pandas Data frames are best data container for maths/stats, but steep learning curve. + # Other exclusion criteria. Exclude reading levels above grade 100, + # as this is most likely a problem with the metric algorithm, and or rubbish data in. + # TODO: speed everything up, by performing exclusion criteri above not here. + if len(dfs) == 0: + dfs = pd.DataFrame(pd.Series(urlDat)).T + dfs = pd.concat([dfs, pd.DataFrame(pd.Series(urlDat)).T]) + return dfs diff --git a/science_access/word_cloud_by_word_len.py b/science_access/word_cloud_by_word_len.py index 735b875..65cf7ab 100644 --- a/science_access/word_cloud_by_word_len.py +++ b/science_access/word_cloud_by_word_len.py @@ -86,7 +86,9 @@ import copy from nltk.tokenize import word_tokenize import streamlit as st -def generate_from_lengths(self, words, max_font_size=None,verbose=False): # noqa: C901 + + +def generate_from_lengths(self, words, max_font_size=None, verbose=False): # noqa: C901 """Create a word_cloud from words and frequencies. Parameters ---------- @@ -110,9 +112,9 @@ def generate_from_lengths(self, words, max_font_size=None,verbose=False): # noq self.max_words = 50 words = word_tokenize(words) wordss = list(set(words)) - wordss = [word for word in wordss if len(word)<20] + wordss = [word for word in wordss if len(word) < 20] - sizes = [len(word) for word in wordss if len(word)<20] + sizes = [len(word) for word in wordss if len(word) < 20] if verbose: st.text(wordss) @@ -120,10 +122,7 @@ def generate_from_lengths(self, words, max_font_size=None,verbose=False): # noq max_len = np.max(sizes) - frequencies = [ - (word, word_len / max_len) - for word, word_len in zip(words, sizes) - ] + frequencies = [(word, word_len / max_len) for word, word_len in zip(words, sizes)] frequencies = sorted(frequencies, key=lambda item: item[1], reverse=True) max_frequency = float(frequencies[0][1]) @@ -168,10 +167,10 @@ def generate_from_lengths(self, words, max_font_size=None,verbose=False): # noq # we only have one word. We make it big! font_size = self.height else: - #font_size = self.height - #self.generate_from_frequencies( + # font_size = self.height + # self.generate_from_frequencies( # dict(frequencies), max_font_size=self.height - #) + # ) # find font sizes sizes = [x[1] for x in self.layout_] try: -- GitLab
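# generate_from_lengths, monkey-patched onto WordCloud above, sizes words by their
# length rather than their corpus frequency. A minimal sketch of the same idea using
# only the public wordcloud API; cloud_by_word_length and max_len are illustrative
# names, and the wordcloud and nltk packages are assumed to be installed.
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud


def cloud_by_word_length(text: str, max_len: int = 20) -> WordCloud:
    words = {w for w in word_tokenize(text) if w.isalpha() and len(w) < max_len}
    if not words:
        raise ValueError("no usable words in the input text")
    longest = max(len(w) for w in words)
    # Longer words get proportionally larger pseudo-frequencies.
    weights = {w: len(w) / longest for w in words}
    wc = WordCloud(background_color="white", max_words=50)
    return wc.generate_from_frequencies(weights)


# The result can be rendered with matplotlib, e.g. plt.imshow(cloud_by_word_length(text)).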