diff --git a/.binder/postBuild b/.binder/postBuild new file mode 100644 index 0000000000000000000000000000000000000000..e53d6b0b895eba838307faf2f3ebb3e7fbb586a6 --- /dev/null +++ b/.binder/postBuild @@ -0,0 +1,7 @@ + +# enable nbserverproxy +jupyter serverextension enable --sys-prefix nbserverproxy +# streamlit launches at startup +mv .binder/streamlit_call.py ${NB_PYTHON_PREFIX}/lib/python*/site-packages/ +# enable streamlit extension +jupyter serverextension enable --sys-prefix streamlit_call \ No newline at end of file diff --git a/.binder/requirements.txt b/.binder/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..5932971dc8f774666ded5d32e426c3d4d619d093 --- /dev/null +++ b/.binder/requirements.txt @@ -0,0 +1,15 @@ +PyPDF2 +pycld2 +nltk +selenium +delver +pdfminer +pyvirtualdisplay +textstat +fsspec>=0.3.3 +textblob +twython +streamlit +streamlit==0.52.2 +jupyter-server-proxy==1.2.0 +nbserverproxy==0.8.8 \ No newline at end of file diff --git a/.binder/streamlit_call.py b/.binder/streamlit_call.py new file mode 100644 index 0000000000000000000000000000000000000000..81217c61fa200badab8a33111b5fcb57d9654030 --- /dev/null +++ b/.binder/streamlit_call.py @@ -0,0 +1,6 @@ + +from subprocess import Popen + +def load_jupyter_server_extension(nbapp): + """serve the streamlit app""" + Popen(["streamlit", "run", "../entry_point.py", "--browser.serverAddress=0.0.0.0", "--server.enableCORS=False"]) \ No newline at end of file diff --git a/README.md b/README.md index 66e4e01f22bb1517642340f5c02a5a736f68875a..f7c304041d31988c0f71c301156dd475d5ff50ef 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ sudo bash install.sh streamlit run entry_point.py ``` -**** +**** [](https://travis-ci.com/russelljjarvis/ScienceAccessibility) diff --git a/data/_author_specificDavid Grayden.p b/data/_author_specificDavid Grayden.p new file mode 100644 index 0000000000000000000000000000000000000000..aa98c2b47807f91863d3883b02d5d0f896d90ab6 Binary files /dev/null and b/data/_author_specificDavid Grayden.p differ diff --git a/data/_author_specificSayali Phatak.p b/data/_author_specificSayali Phatak.p new file mode 100644 index 0000000000000000000000000000000000000000..970a1fedf27d9677bba0533d1c1e7d1b8ac415b4 Binary files /dev/null and b/data/_author_specificSayali Phatak.p differ diff --git a/data/example_app.png b/data/example_app.png new file mode 100644 index 0000000000000000000000000000000000000000..be5f3cc1cefdc74832a328aa15a9d83e371c9d83 Binary files /dev/null and b/data/example_app.png differ diff --git a/data/more_authors_results.p b/data/more_authors_results.p new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/entry_point.py b/entry_point.py index ee6120073cd24d810c19cb3a0ec1e303866b5ab3..6b11e69d4a0bc5597c6ec296843bc4754e133afe 100644 --- a/entry_point.py +++ b/entry_point.py @@ -20,19 +20,17 @@ except: nltk.download('punkt') nltk.download('stopwords') -if not(os.path.exists('traingDats.p?dl=0') or os.path.exists('traingDats.p')): +if not(os.path.exists('traingDats.p?dl=0') or os.path.exists('data/traingDats.p')): os.system('wget https://www.dropbox.com/s/3h12l5y2pn49c80/traingDats.p?dl=0') - os.system('wget https://www.dropbox.com/s/crarli3772rf3lj/more_authors_results.p?dl=0') os.system('wget https://www.dropbox.com/s/x66zf52himmp5ox/benchmarks.p?dl=0') -if os.path.exists("traingDats.p?dl=0") and not os.path.exists("traingDats.p"): - os.system('mv traingDats.p?dl=0 traingDats.p') - os.system('mv benchmarks.p?dl=0 benchmarks.p') - os.system('mv more_authors_results.p?dl=0 more_authors_results.p') +if os.path.exists("traingDats.p?dl=0") and not os.path.exists("data/traingDats.p"): + os.system('mv traingDats.p?dl=0 data/traingDats.p') + os.system('mv benchmarks.p?dl=0 data/benchmarks.p') -trainingDats = pickle.load(open('traingDats.p','rb')) +trainingDats = pickle.load(open('data/traingDats.p','rb')) bio_chem = [ t['standard'] for t in trainingDats ] biochem_labels = [ x['file_name'] for x in trainingDats if 'file_name' in x.keys()] biochem_labels = [x.split("/")[-1] for x in biochem_labels ] @@ -44,8 +42,16 @@ df0 = pd.DataFrame(lods) theme = px.colors.diverging.Portland colors = [theme[0], theme[1]] -st.title('Search Reading Difficulty of Academic Author') +st.title('Search Reading Difficulty of Academic') author_name = st.text_input('Enter Author:') +def make_clickable(link): + # target _blank to open new window + # extract clickable text to display for your link + text = link#.split('=')[1] + return f'<a target="_blank" href="{link}">{text}</a>' + + + if author_name: ar = call_from_front_end(author_name) standard_sci = [ t['standard'] for t in ar ] @@ -63,9 +69,11 @@ if author_name: # marginal="rug",# marginal='violin',# or violin, rug # hover_data=df.columns) fig0 = px.histogram(df, x="Reading_Level", y="Web_Link", color="Origin", - marginal="violin", + marginal="box", opacity=0.7,# marginal='violin',# or violin, rug - hover_data=df.columns, color_discrete_sequence=colors) + hover_data=df.columns, + hover_name=df["Web_Link"], + color_discrete_sequence=colors) fig0.update_layout(title_text='Scholar scraped {0} Versus Art Corpus'.format(author_name),width=900, height=900)#, hovermode='x') @@ -73,10 +81,8 @@ if author_name: else: - try: - with open('_author_specificSayali Phatak.p','rb') as f: contents = pickle.load(f) - except: - with open('_author_specificDavid Grayden.p','rb') as f: contents = pickle.load(f) + with open('data/_author_specificSayali Phatak.p','rb') as f: + contents = pickle.load(f) (NAME,ar,df,datay,scholar_link) = contents (ar, trainingDats) = ar_manipulation(ar) standard_sci = [ t['standard'] for t in ar ] @@ -93,41 +99,72 @@ else: lods.append({'Reading_Level':i,'Origin':j,'Web_Link':k}) df1 = pd.DataFrame(lods) df = pd.concat([df1,df0]) + + + #df['Web_Link'] = df['Web_Link'].apply(make_clickable) + #df = df.to_html(escape=False) #colors = [colors[0], colors[1]] fig0 = px.histogram(df, x="Reading_Level", y="Web_Link", color="Origin", - marginal="rug", + marginal="box", opacity=0.7,# marginal='violin',# or violin, rug hover_data=df.columns, + hover_name=df["Web_Link"], color_discrete_sequence=colors) fig0.update_layout(title_text='Scholar S Phatak Versus Art Corpus',width=900, height=600)#, hovermode='x') st.write(fig0) ''' + ### Total number scraped documents: + ''' st.text(len(ar)) +if np.mean(standard_sci) < np.mean(bio_chem): + ''' + + + ### This author was easier to read as the average of ARTCORPUS: + A varied collection of biochemistry science papers + ''' + +if np.mean(standard_sci) >= np.mean(bio_chem): + ''' + ### This author was harder or just as hard to read as average of ARTCORPUS: + A varied collection of biochemistry science papers + ''' +df_links = pd.DataFrame() +df_links['Web_Link'] = pd.Series(scraped_labels) +df_links['Reading_Level'] = pd.Series(standard_sci) +#st.write(df) +# link is the column with hyperlinks +df_links['Web_Link'] = df_links['Web_Link'].apply(make_clickable) +df_links = df_links.to_html(escape=False) +st.write(df_links, unsafe_allow_html=True) + x1 = df0['Reading_Level']#np.random.randn(200) x2 = df1['Reading_Level']#np.random.randn(200) + 2 if author_name: group_labels = ['Comparison Data ', str(author_name)] else: - group_labels = ['Comparison Data ', str('search_author')] + group_labels = ['Comparison Data ', str('S Phatak')] # Create distplot with curve_type set to 'normal' colors = [theme[-1], theme[-2]] -rt=list(df['Web_Link']) +#rt=list(df['Web_Link']) +rt=list(pd.Series(scraped_labels)) + #st.text('number scraped documents: {0}'.format(rt)) fig = ff.create_distplot([x1, x2], group_labels, bin_size=2,colors=colors,rug_text=rt) @@ -139,11 +176,49 @@ fig.update_layout(width=900, height=600)#, hovermode='x') st.write(fig) -#print(group_labels) -#group_labels = ['Biochemistry Documents']#, 'Group 2', 'Group 3'] +list_df = pickle.load(open("data/benchmarks.p","rb")) +bm = pd.DataFrame(list_df) + +bm = bm.rename(columns={'link': 'Web_Link', 'standard': 'Reading_Level'}) +bm["Origin"] = pd.Series(["Benchmark" for i in range(0,len(bm))]) +#del bm.loc['nicholas'] +#del bm.loc['local_resource'] +#bm = bm.drop('nicholas', axis=0)) +bm = bm.drop(4, axis=0) -#colors = ['#393E46']#, '#2BCDC1', '#F66095'] +bm_temp = pd.DataFrame() +bm_temp["Origin"] = bm["Origin"] +bm_temp["Web_Link"] = bm["Web_Link"] +bm_temp["Reading_Level"] = bm["Reading_Level"] +import copy +bm = copy.copy(bm_temp) -#fig = ff.create_distplot([standard_sci], group_labels, colors=colors, -# bin_size=[0.3, 0.2, 0.1], show_curve=True) +bm_temp['Web_Link'] = bm_temp['Web_Link'].apply(make_clickable) +bm_temp = bm_temp.to_html(escape=False) +st.write(bm_temp, unsafe_allow_html=True) + +x1 = bm['Reading_Level'] +x2 = df1['Reading_Level'] + +x3 = df0['Reading_Level'] + + +rt=list(bm['Web_Link']) +rt.extend(list(df1['Web_Link'])) +rt.extend(list(df0['Web_Link'])) + +colors = [theme[0], theme[4],theme[2]] +if author_name: + group_labels = ['Ideal Bench Marks ', str(author_name), str('Comparison Data')] +else: + group_labels = ['Ideal Bench Marks ', str('S Phatak'), str('Comparison Data')] + +fig = ff.create_distplot([x1, x2, x3], group_labels, bin_size=1,colors=colors,rug_text=rt) + +hover_trace = [t for t in fig['data'] if 'text' in t] + +fig.update_layout(title_text='Benchmarks versus scraped Author') +fig.update_layout(width=900, height=600)#, hovermode='x') + +st.write(fig) diff --git a/install.sh b/install.sh index f69ebdeb9ce20e96484ff3acd9eed3ae84d2c7f1..8cf2558f067fa40f9fa6c03de619678a9923761d 100644 --- a/install.sh +++ b/install.sh @@ -3,20 +3,21 @@ #!/bin/bash # download and install latest geckodriver for linux or mac. # required for selenium to drive a firefox browser. +pip=$(sudo which pip) sudo /home/user/anaconda3/bin/pip install -r requirements.txt sudo apt-get install jq -sudo pip install PyPDF2 -sudo pip install pycld2 -sudo pip install nltk -sudo pip install selenium -sudo pip install delver -sudo pip install pdfminer -sudo pip install pyvirtualdisplay -sudo pip install textstat -sudo pip install fsspec>=0.3.3 -sudo pip install textblob -sudo pip install twython +sudo /home/user/anaconda3/bin/pip install PyPDF2 +sudo /home/user/anaconda3/bin/pip install pycld2 +sudo /home/user/anaconda3/bin/pip install nltk +sudo /home/user/anaconda3/bin/pip install selenium +sudo /home/user/anaconda3/bin/pip install delver +sudo /home/user/anaconda3/bin/pip install pdfminer +sudo /home/user/anaconda3/bin/pip install pyvirtualdisplay +sudo /home/user/anaconda3/bin/pip install textstat +sudo /home/user/anaconda3/bin/pip install fsspec>=0.3.3 +sudo /home/user/anaconda3/bin/pip install textblob +sudo /home/user/anaconda3/bin/pip install twython sudo python3 -c "import nltk; nltk.download('punkt')" sudo python3 -c "import nltk; nltk.download('stopwords')" sudo bash gecko_install.sh @@ -28,3 +29,4 @@ mv scholar.py .. wget https://www.dropbox.com/s/3h12l5y2pn49c80/traingDats.p?dl=0 wget https://www.dropbox.com/s/crarli3772rf3lj/more_authors_results.p?dl=0 wget https://www.dropbox.com/s/x66zf52himmp5ox/benchmarks.p?dl=0 + \ No newline at end of file diff --git a/online_app_backend.py b/online_app_backend.py index d0bcc1efb38c68b2e0dbbae7eb6b26ac0541a143..3e8a5b86e2097020ede240190157c1b64d81922a 100644 --- a/online_app_backend.py +++ b/online_app_backend.py @@ -172,7 +172,7 @@ def ar_manipulation(ar): #with open(str('more_authors_results.p'),'wb') as f: # pickle.dump([NAME,ar],f) - with open('traingDats.p','rb') as f: + with open('data/traingDats.p','rb') as f: trainingDats = pickle.load(f) trainingDats.extend(ar) diff --git a/requirements.txt b/requirements.txt index 829e47da3d38407dad777fd698a0ec47cb41b9e7..5932971dc8f774666ded5d32e426c3d4d619d093 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,3 +10,6 @@ fsspec>=0.3.3 textblob twython streamlit +streamlit==0.52.2 +jupyter-server-proxy==1.2.0 +nbserverproxy==0.8.8 \ No newline at end of file