AliExpress Parts Python Scraper

veekm · May 30, 2021

raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.ElementClickInterceptedException: Message: Element <button class="next-btn next-medium next-btn-normal next-pagination-item next-next" type="button"> is not clickable at point (699,649) because another element <div class="next-overlay-backdrop"> obscures it

Anyone know how to fix - it only scrapes a single page: asked here, next-overlay-backdrop obscures it


## Reddit r/indianripoff
import re, sys, time

from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

xchg_rate = 72.6
browser_timeout = 100
url = 'https://www.aliexpress.com'
#url = 'file:///tmp/1N4001%20-%20Buy%201N4001%20with%20free%20shipping%20on%20AliExpress.html'

caps = DesiredCapabilities().FIREFOX; caps["pageLoadStrategy"] = 'eager' 
ignored_exceptions=(NoSuchElementException,StaleElementReferenceException,)




def load_all_srch_listings():
    try:
        WebDriverWait(driver, browser_timeout, ignored_exceptions=ignored_exceptions).until( EC.presence_of_element_located((By.XPATH, '//div[@class="product-container"]')))
    finally:
        tmp = driver.find_element_by_xpath('//body')
        if not 'body' in tmp.tag_name: raise SystemExit('no srch_result <body>')
        tmp.send_keys(Keys.HOME) 
        for i in range(1,10):
            time.sleep(1)
            tmp.send_keys(Keys.PAGE_DOWN) #scroll to the page bottom so all listings load

def process_lots(e_l):
    if len(e_l) != 1: raise SystemExit('ambiguous lots' + str(e_l))
    lot_e = e_l[0]
    if (lot_e is None) or (lot_e is 0):
         raise SystemExit('error in lot value' + str(e_l))

    span_e = lot_e.find_element_by_tag_name('span')
    try:
        txt = span_e.get_attribute('innerHTML')
        return txt.replace('pieces / lot', '')
    except Exception as e:
        raise
    

def convert(m_obj):
    if len(m_obj.group()):
        money = m_obj.group()
        return str(round(float(money) * xchg_rate, 3))
    else:
        raise SystemExit('ambiguous money ' + str(len(m_obj))+ str(m_obj.group()))

import re
def process_money(e_l):
    if len(e_l) > 1: raise SystemExit('ambiguous price' + str(e_l))
    if len(e_l) == 0: return str(0)

    txt = e_l[0].text
    if txt is None: return str(0)
    if 'incl' in txt: return str(0) # Shipping incl.

    if '$' in txt: #clean up our txt
        txt = txt.replace('+', ''); txt = txt.replace('$', '')
        txt = txt.replace('US', ''); txt = txt.replace('Shipping:', '')
        
        r = re.sub(r'\s*([0-9]+\.[0-9]+)', convert, txt) #convert $ to rupee
    else:
        raise SystemExit('unknown value' + txt)
    return r

def process_shop(e_l):
    if len(e_l) > 1: raise SystemExit('ambiguous price' + str(e_l))
    if len(e_l) == 0: return str(0)

    txt = e_l[0].text
    if txt is None: return 'error'
    else: return txt

def process_title(e_l):
    if len(e_l) > 1: raise SystemExit('ambiguous price' + str(e_l))
    if len(e_l) == 0: return str(0)

    txt = e_l[0].get_attribute('title')
    if txt is None: return 'error'
    else: return txt
    
def process_container(listings):
    for c, div in enumerate(listings):
        log_fh.write('\n<tr>\n') #start row and look for
        tmp = div.find_elements_by_class_name('price-current')
        price = process_money(tmp)
        
        tmp = div.find_elements_by_class_name('shipping-value')
        ship_cost = process_money(tmp)
        
        tmp = div.find_elements_by_xpath('.//div[@class="item-price-row packaging-sale"]')
        lot_count = process_lots(tmp)
        
        tmp = div.find_elements_by_class_name('store-name')
        shop_name = process_shop(tmp)
        
        tmp = div.find_elements_by_class_name('item-title')
        title = process_title(tmp)
        
        for item in [price, ship_cost, lot_count, shop_name, title]:
            try:
                log_fh.write('<td>' + item + '</td>')
            except Exception as e:
                print price, ship_cost, lot_count, shop_name, title
                print item
                raise
        log_fh.write('\n</tr>') #end row
    print 'count of items', c

def locate_page_listings_container():
    element = \
        WebDriverWait(driver, browser_timeout, ignored_exceptions=ignored_exceptions).until( EC.presence_of_element_located((By.XPATH, '//div[@class="product-container"]')) )
    EC.presence_of_element_located((By.XPATH, '//div[contains(@class, " product-list")]'))
    
    if not 'div' in element.tag_name: SystemExit('no product-container')
    divs = element.find_elements_by_xpath('.//li[@class="list-item packaging_sale"]')
    if len(divs):
        return divs
    else:
        raise SystemExit('no listings found')

import re
def total_page_hits():
    global page_hits
    tmp = driver.find_element_by_class_name('total-page')
    if tmp is not None:
        txt = tmp.text
        r_l = re.findall(r'([0-9]+)', txt)
        if len(r_l): page_hits = r_l[0] 

def page_next():
    tmp = driver.find_element_by_xpath('//button[contains(@class, " next-next")]')
    tmp = driver.find_element_by_xpath('.//div[@class="next-overlay-backdrop")]')
#    e = WebDriverWait(driver, 200).until(EC.element_to_be_clickable(
#        (By.XPATH, "//button[contains(@class, ' next-next']"))).click()
#    e.click()
    
    if tmp: tmp.click()
    else: raise SystemExit('no next')

def ali_search(url, txt):
    global page_hits
    driver.get(url)
    assert 'AliExpress' in driver.title

    WebDriverWait(driver, browser_timeout, ignored_exceptions=ignored_exceptions).until( EC.presence_of_element_located((By.XPATH, '//div[@class="search-key-box"]')) ) 
    
    tmp = driver.find_element_by_id('search-key')
    if not 'input' in tmp.tag_name: raise SystemExit('failed to locate search element')

    for c in list(txt):
        time.sleep(1)
        tmp.send_keys(c)
    tmp.send_keys(Keys.RETURN) #issue the search
    
    load_all_srch_listings()
    listings = locate_page_listings_container()
    process_container(listings)
    total_page_hits()
    for x in page_hits:
        page_next()
    
def close_log():
    log_fh.write('\n</table></body>')
    log_fh.close(); time.sleep(3600)

def create_log(fname='/tmp/ali.log'):
    log_hdr = '''
    <!doctype html>
       <title>parts</title>
       <body>
            <table>
    '''
    global log_fh
    log_fh = open('/tmp/log.html', 'w')
    log_fh.write(log_hdr)

def getpart_and_init():
    global driver
    gecko = '/mnt/sdb1/root/geckodriver'; ffox = '/mnt/sdb1/firefox/firefox-bin'
    with webdriver.Firefox(executable_path=gecko, firefox_binary=ffox, capabilities=caps) as driver:
        create_log()
        
        if len(sys.argv) == 2:
            ali_search(url, sys.argv[1])
        else:
            print 'Usage: ' + sys.argv[0] + part
            
        close_log()
#-------------------------------
if __name__ == '__main__':
    getpart_and_init()

veekm · May 30, 2021

bugs fixed - unicode utf-8

debian Pastezone

AliExpress Parts Python Scraper

veekm

Member

veekm

Member