diff --git a/science_access/get_bmark_corpus.py b/science_access/get_bmark_corpus.py
index f8e69333acc0d9874eefdba83d4b0965b420c81f..b0fb4f2d96e61366300c321a3e642cfb41fa6d14 100644
--- a/science_access/get_bmark_corpus.py
+++ b/science_access/get_bmark_corpus.py
@@ -11,16 +11,24 @@ from .crawl import collect_pubs, convert_pdf_to_txt#,process
 from .scrape import get_driver
 from .t_analysis import text_proc
 from .utils import black_string
-
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+if 'DYNO' in os.environ:
+    heroku = False
+else:
+    heroku = True
 def process(link):
     urlDat = {}
     urlDat['link'] = link
     urlDat['page_rank'] = 'benchmark'
+    if heroku:
+        wait = WebDriverWait(driver, 10)
+        wait.until(lambda driver: driver.current_url != link)
+        link = driver.current_url
     if str('pdf') not in link:
-        driver = get_driver()
-        driver.get(link)
+
         crude_html = driver.page_source
         soup = BeautifulSoup(crude_html, 'html.parser')