From 28eafdcef8a50112e5cf282ec016640a55a5191a Mon Sep 17 00:00:00 2001 From: Russell Jarvis <rjjarvis@asu.edu> Date: Fri, 26 Jun 2020 15:37:46 +1000 Subject: [PATCH] push content --- science_access/crawl.py | 5 ++++- science_access/online_app_backend.py | 8 ++++---- science_access/scrape.py | 3 ++- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/science_access/crawl.py b/science_access/crawl.py index 39f420b..f0c0667 100644 --- a/science_access/crawl.py +++ b/science_access/crawl.py @@ -204,5 +204,8 @@ def collect_pubs(url): check_out = link.get('href') #if '/citations?' in check_out: links.append(check_out) - + driver.close() + driver.quit() + driver = None + del driver return links diff --git a/science_access/online_app_backend.py b/science_access/online_app_backend.py index d1db3a1..c5f1b61 100644 --- a/science_access/online_app_backend.py +++ b/science_access/online_app_backend.py @@ -82,15 +82,15 @@ def take_url_from_gui(author_link_scholar_link_list): follow_links = collect_pubs(author_link_scholar_link_list)[0:12] for r in tqdm(follow_links,title='Progess of scraping'): - if heroku: - sleep(np.random.uniform(1,3)) + #if heroku: + # sleep(np.random.uniform(1,3)) try: urlDat = process(r) except: follow_more_links = collect_pubs(r) for r in tqdm(follow_more_links,title='Progess of scraping'): - if heroku: - sleep(np.random.uniform(1,3)) + #if heroku: + # sleep(np.random.uniform(1,3)) urlDat = process(r) if not isinstance(urlDat,type(None)): author_results.append(urlDat) diff --git a/science_access/scrape.py b/science_access/scrape.py index ce122bd..2eb0894 100644 --- a/science_access/scrape.py +++ b/science_access/scrape.py @@ -55,6 +55,7 @@ if 'DYNO' in os.environ: heroku = False else: heroku = True +''' def get_driver(): if 'DYNO' in os.environ: heroku = True @@ -98,7 +99,7 @@ def get_driver(): driver = get_driver() - +''' rsrcmgr = PDFResourceManager() retstr = StringIO() -- GitLab