From ce38aaefa082087ab23cdabbbed6d5755b33aa94 Mon Sep 17 00:00:00 2001 From: Russell Jarvis <rjjarvis@asu.edu> Date: Tue, 23 Jun 2020 15:46:05 +1000 Subject: [PATCH] contents --- Procfile | 2 +- app.py | 55 +++++++++++++++++++++++++------------------------------ scrape.py | 12 ++++++++---- 3 files changed, 34 insertions(+), 35 deletions(-) diff --git a/Procfile b/Procfile index 265de86..645733f 100644 --- a/Procfile +++ b/Procfile @@ -1 +1 @@ -web: sh setup.sh && streamlit run app.py +web: sh setup.sh && streamlit run app.py \ No newline at end of file diff --git a/app.py b/app.py index 8bfaa9a..e11bbb8 100644 --- a/app.py +++ b/app.py @@ -1,10 +1,23 @@ + + import streamlit as st import os -st.text(os.system('pwd')) -st.text(os.system('ls *')) -st.text(os.system('../ls *')) -st.text(os.system('../../ls *')) +from selenium import webdriver +import os +from selenium.webdriver.firefox.options import Options +from selenium.common.exceptions import NoSuchElementException + +options = Options() +options.headless = True +try: + #GECKODRIVER_PATH=str(os.getcwd())+str("/geckodriver") + driver = webdriver.Firefox(options=options)#,executable_path=GECKODRIVER_PATH) +except: + os.system("wget https://github.com/mozilla/geckodriver/releases/download/v0.26.0/geckodriver-v0.26.0-linux64.tar.gz") + os.system("tar -xf geckodriver-v0.26.0-linux64.tar.gz") + GECKODRIVER_PATH=str(os.getcwd())+str("/geckodriver") + driver = webdriver.Firefox(options=options,executable_path=GECKODRIVER_PATH) import matplotlib.pyplot as plt @@ -12,15 +25,16 @@ import seaborn as sns from wordcloud import WordCloud -#from online_app_backend import call_from_front_end -from online_app_backend import ar_manipulation import pandas as pd import pickle import numpy as np import plotly.figure_factory as ff import os import plotly.express as px -from plotly.subplots import make_subplots +from online_app_backend import call_from_front_end +from online_app_backend import ar_manipulation + +#from plotly.subplots import make_subplots import nltk try: @@ -61,10 +75,7 @@ def make_clickable(link): return f'<a target="_blank" href="{link}">{text}</a>' - if author_name: - print('waiting') - ''' ar = call_from_front_end(author_name) standard_sci = [ t['standard'] for t in ar ] group_labels = ['Author: '+str(author_name)]#, 'Group 2', 'Group 3'] @@ -77,9 +88,6 @@ if author_name: df1 = pd.DataFrame(lods) df = pd.concat([df1,df0]) - #fig0 = px.histogram(df, x="Reading_Level", y="Web_Link", color="Origin", - # marginal="rug",# marginal='violin',# or violin, rug - # hover_data=df.columns) fig0 = px.histogram(df, x="Reading_Level", y="Web_Link", color="Origin", marginal="box", opacity=0.7,# marginal='violin',# or violin, rug @@ -90,12 +98,9 @@ if author_name: fig0.update_layout(title_text='Scholar scraped {0} Versus Art Corpus'.format(author_name),width=900, height=900)#, hovermode='x') st.write(fig0) - ''' -else: - import os - +else: with open('data/_author_specificSayali Phatak.p','rb') as f: contents = pickle.load(f) @@ -105,11 +110,6 @@ else: scraped_labels = [ str(x['link']) for x in ar] group_labels = ['Author Scraped']#, 'Group 2', 'Group 3'] - #colors = ['#393E46', '#2BCDC1', '#F66095'] - - #fig = ff.create_distplot([standard_sci], group_labels, colors=colors, - # bin_size=[0.3, 0.2, 0.1], show_curve=True) - lods = [] for i,j,k in zip(standard_sci,[str('S Phatak') for i in range(0,len(ar))],scraped_labels): lods.append({'Reading_Level':i,'Origin':j,'Web_Link':k}) @@ -117,10 +117,6 @@ else: df = pd.concat([df1,df0]) - #df['Web_Link'] = df['Web_Link'].apply(make_clickable) - #df = df.to_html(escape=False) - - #colors = [colors[0], colors[1]] fig0 = px.histogram(df, x="Reading_Level", y="Web_Link", color="Origin", marginal="box", @@ -182,8 +178,7 @@ def art_cloud(acorpus): ### Here are some word clouds, that show the frequency of scraped texts -You can eye ball them to see if they fit your intuition -### For your searched author: +You can eye ball them to see if they fit your intuition about what your searched author writes about ''' fig = art_cloud(sci_corpus) @@ -239,8 +234,8 @@ bm_temp['Web_Link'] = bm_temp['Web_Link'].apply(make_clickable) bm_temp = bm_temp.to_html(escape=False) ''' -## In the table below there are benchmarks texts that are -# used to as a comparison to investigate some very easy to read scientific writing. +In the table below there are benchmarks texts that are +used as a comparison to investigate some very easy to read scientific writing. and some very cryptic and unreadable texts too. ''' diff --git a/scrape.py b/scrape.py index 527ff72..cbd5956 100644 --- a/scrape.py +++ b/scrape.py @@ -38,10 +38,9 @@ import io import selenium -from selenium import webdriver -from selenium.webdriver.firefox.options import Options +#from selenium.webdriver.firefox.options import Options import re from bs4 import BeautifulSoup @@ -49,21 +48,26 @@ import bs4 as bs import urllib.request from io import StringIO import io +from selenium import webdriver from selenium.webdriver.firefox.options import Options from selenium.common.exceptions import NoSuchElementException options = Options() options.headless = True +import os try: driver = webdriver.Firefox(options=options) except: - GECKODRIVER_PATH="/app/vendor/geckodriver/geckodriver" + #GECKODRIVER_PATH="/app/vendor/geckodriver/geckodriver" + #driver = webdriver.Firefox(options=options,executable_path=GECKODRIVER_PATH) + os.system("wget https://github.com/mozilla/geckodriver/releases/download/v0.26.0/geckodriver-v0.26.0-linux64.tar.gz") + os.system("tar -xf geckodriver-v0.26.0-linux64.tar.gz") + GECKODRIVER_PATH=str(os.getcwd())+str("/geckodriver") driver = webdriver.Firefox(options=options,executable_path=GECKODRIVER_PATH) - rsrcmgr = PDFResourceManager() retstr = StringIO() laparams = LAParams() -- GitLab