AliExpress Parts Python Scraper

veekm

Member
Joined
Mar 29, 2015
Messages
54
Points
8
Location
Bangalore
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.ElementClickInterceptedException: Message: Element <button class="next-btn next-medium next-btn-normal next-pagination-item next-next" type="button"> is not clickable at point (699,649) because another element <div class="next-overlay-backdrop"> obscures it


Anyone know how to fix - it only scrapes a single page: asked here, next-overlay-backdrop obscures it

## Reddit r/indianripoff import re, sys, time from selenium.common.exceptions import NoSuchElementException from selenium.common.exceptions import StaleElementReferenceException from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.common.keys import Keys from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.desired_capabilities import DesiredCapabilities xchg_rate = 72.6 browser_timeout = 100 url = 'https://www.aliexpress.com' #url = 'file:///tmp/1N4001%20-%20Buy%201N4001%20with%20free%20shipping%20on%20AliExpress.html' caps = DesiredCapabilities().FIREFOX; caps["pageLoadStrategy"] = 'eager' ignored_exceptions=(NoSuchElementException,StaleElementReferenceException,) def load_all_srch_listings(): try: WebDriverWait(driver, browser_timeout, ignored_exceptions=ignored_exceptions).until( EC.presence_of_element_located((By.XPATH, '//div[@class="product-container"]'))) finally: tmp = driver.find_element_by_xpath('//body') if not 'body' in tmp.tag_name: raise SystemExit('no srch_result <body>') tmp.send_keys(Keys.HOME) for i in range(1,10): time.sleep(1) tmp.send_keys(Keys.PAGE_DOWN) #scroll to the page bottom so all listings load def process_lots(e_l): if len(e_l) != 1: raise SystemExit('ambiguous lots' + str(e_l)) lot_e = e_l[0] if (lot_e is None) or (lot_e is 0): raise SystemExit('error in lot value' + str(e_l)) span_e = lot_e.find_element_by_tag_name('span') try: txt = span_e.get_attribute('innerHTML') return txt.replace('pieces / lot', '') except Exception as e: raise def convert(m_obj): if len(m_obj.group()): money = m_obj.group() return str(round(float(money) * xchg_rate, 3)) else: raise SystemExit('ambiguous money ' + str(len(m_obj))+ str(m_obj.group())) import re def process_money(e_l): if len(e_l) > 1: raise SystemExit('ambiguous price' + str(e_l)) if len(e_l) == 0: return str(0) txt = e_l[0].text if txt is None: return str(0) if 'incl' in txt: return str(0) # Shipping incl. if '$' in txt: #clean up our txt txt = txt.replace('+', ''); txt = txt.replace('$', '') txt = txt.replace('US', ''); txt = txt.replace('Shipping:', '') r = re.sub(r'\s*([0-9]+\.[0-9]+)', convert, txt) #convert $ to rupee else: raise SystemExit('unknown value' + txt) return r def process_shop(e_l): if len(e_l) > 1: raise SystemExit('ambiguous price' + str(e_l)) if len(e_l) == 0: return str(0) txt = e_l[0].text if txt is None: return 'error' else: return txt def process_title(e_l): if len(e_l) > 1: raise SystemExit('ambiguous price' + str(e_l)) if len(e_l) == 0: return str(0) txt = e_l[0].get_attribute('title') if txt is None: return 'error' else: return txt def process_container(listings): for c, div in enumerate(listings): log_fh.write('\n<tr>\n') #start row and look for tmp = div.find_elements_by_class_name('price-current') price = process_money(tmp) tmp = div.find_elements_by_class_name('shipping-value') ship_cost = process_money(tmp) tmp = div.find_elements_by_xpath('.//div[@class="item-price-row packaging-sale"]') lot_count = process_lots(tmp) tmp = div.find_elements_by_class_name('store-name') shop_name = process_shop(tmp) tmp = div.find_elements_by_class_name('item-title') title = process_title(tmp) for item in [price, ship_cost, lot_count, shop_name, title]: try: log_fh.write('<td>' + item + '</td>') except Exception as e: print price, ship_cost, lot_count, shop_name, title print item raise log_fh.write('\n</tr>') #end row print 'count of items', c def locate_page_listings_container(): element = \ WebDriverWait(driver, browser_timeout, ignored_exceptions=ignored_exceptions).until( EC.presence_of_element_located((By.XPATH, '//div[@class="product-container"]')) ) EC.presence_of_element_located((By.XPATH, '//div[contains(@class, " product-list")]')) if not 'div' in element.tag_name: SystemExit('no product-container') divs = element.find_elements_by_xpath('.//li[@class="list-item packaging_sale"]') if len(divs): return divs else: raise SystemExit('no listings found') import re def total_page_hits(): global page_hits tmp = driver.find_element_by_class_name('total-page') if tmp is not None: txt = tmp.text r_l = re.findall(r'([0-9]+)', txt) if len(r_l): page_hits = r_l[0] def page_next(): tmp = driver.find_element_by_xpath('//button[contains(@class, " next-next")]') tmp = driver.find_element_by_xpath('.//div[@class="next-overlay-backdrop")]') # e = WebDriverWait(driver, 200).until(EC.element_to_be_clickable( # (By.XPATH, "//button[contains(@class, ' next-next']"))).click() # e.click() if tmp: tmp.click() else: raise SystemExit('no next') def ali_search(url, txt): global page_hits driver.get(url) assert 'AliExpress' in driver.title WebDriverWait(driver, browser_timeout, ignored_exceptions=ignored_exceptions).until( EC.presence_of_element_located((By.XPATH, '//div[@class="search-key-box"]')) ) tmp = driver.find_element_by_id('search-key') if not 'input' in tmp.tag_name: raise SystemExit('failed to locate search element') for c in list(txt): time.sleep(1) tmp.send_keys(c) tmp.send_keys(Keys.RETURN) #issue the search load_all_srch_listings() listings = locate_page_listings_container() process_container(listings) total_page_hits() for x in page_hits: page_next() def close_log(): log_fh.write('\n</table></body>') log_fh.close(); time.sleep(3600) def create_log(fname='/tmp/ali.log'): log_hdr = ''' <!doctype html> <title>parts</title> <body> <table> ''' global log_fh log_fh = open('/tmp/log.html', 'w') log_fh.write(log_hdr) def getpart_and_init(): global driver gecko = '/mnt/sdb1/root/geckodriver'; ffox = '/mnt/sdb1/firefox/firefox-bin' with webdriver.Firefox(executable_path=gecko, firefox_binary=ffox, capabilities=caps) as driver: create_log() if len(sys.argv) == 2: ali_search(url, sys.argv[1]) else: print 'Usage: ' + sys.argv[0] + part close_log() #------------------------------- if __name__ == '__main__': getpart_and_init()
 
Order your Rega Turntables & Amplifiers from HiFiMART.com - India's reputed online dealer.
Top