Anonymous · Posted January 12, 2023

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
from pyquery import PyQuery as pq
import pandas as pd
from selenium.webdriver.chrome.options import Options
from time import sleep

chrome_options = Options()
chrome_options.add_argument('--headless')
# Pass the options to the driver (the original created Chrome() without them,
# so --headless was never actually applied). Note that headless mode can make
# Taobao's login checks harder to pass; drop it if you need to log in.
browser = webdriver.Chrome(options=chrome_options)
wait = WebDriverWait(browser, 10)
data = pd.DataFrame()
browser.set_window_size(1400, 900)


def search():
    print('Searching...')
    try:
        browser.get('https://www.taobao.com')
        search_input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
        )
        submit = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
        search_input.send_keys('美食')  # the keyword to search for
        submit.click()
        zh()
        infor()
        total = wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total'))
        )
        return total.text
    except TimeoutException:
        # return the retry's result, otherwise the caller gets None
        return search()


def zh():
    # My account (or IP) seems to have been flagged as a crawler, so every
    # search redirects to the login page. If yours has not been flagged,
    # searching works without logging in.
    print('Logging in...')
    try:
        input_zh = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#fm-login-id'))
        )
        input_key_words = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#fm-login-password'))
        )
        submit = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#login-form > div.fm-btn > button')))
        input_zh.send_keys('xxxx')         # replace xxxx with your account name
        input_key_words.send_keys('xxxx')  # replace xxxx with your password
        submit.click()
    except TimeoutException:
        zh()


def next_page(page_num):
    print('Turning the page, now on page {}'.format(page_num))
    sleep(5)  # wait 5 seconds between page turns to avoid Taobao's anti-crawling checks
    try:
        input_page_num = wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input'))
        )
        submit = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
        input_page_num.clear()
        input_page_num.send_keys(page_num)
        submit.click()
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
            str(page_num)))
        infor()
    except TimeoutException:
        next_page(page_num)


def infor():
    wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
    html = browser.page_source
    doc = pq(html)
    items = doc('#mainsrp-itemlist .items .item').items()
    global data
    for item in items:
        try:
            img = item.find('.pic .img').attr('src')   # image URL
        except Exception:
            img = '无'
        try:
            price = item.find('.price').text()         # price
        except Exception:
            price = '无'
        try:
            deal = item.find('.deal-cnt').text()[:-3]  # units sold (strip the "人付款" suffix)
        except Exception:
            deal = '无'
        try:
            goods = item.find('.title').text()         # product name
        except Exception:
            goods = '无'
        try:
            shop = item.find('.shop').text()           # shop name
        except Exception:
            shop = '无'
        try:
            location = item.find('.location').text()   # region
        except Exception:
            location = '无'
        information = {
            "img": img,
            "price": price,
            "deal": deal,
            "goods": goods,
            "shop": shop,
            "location": location
        }
        # DataFrame.append() was removed in pandas 2.0; pd.concat() does the
        # same job and also handles the empty-DataFrame case.
        data = pd.concat([data, pd.DataFrame([information])], ignore_index=True)


def main():
    browser.get("http://httpbin.org/ip")  # quick check of the outgoing IP
    print(browser.page_source)
    total = search()
    total = int(re.compile(r'(\d+)').search(total).group(1))
    # total is the number of pages to crawl; here I read the maximum from the
    # pager automatically, but you can set it yourself. Page 1 was already
    # scraped in search(), so start from page 2 (the original range(2, total)
    # skipped the last page).
    for i in range(2, total + 1):
        next_page(i)
    data.to_csv(r'D:\python work\taobao_spider\infor\meishi1.csv', encoding='utf-8')  # change the save path to your own
    browser.quit()
    print("Crawl finished")


if __name__ == '__main__':
    main()
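Two fragile points in the script are worth hardening: the fixed 5-second sleep turns pages on a perfectly regular cadence, which is easy for anti-bot systems to spot, and the recursive retries on TimeoutException have no depth limit, so a persistently broken page recurses forever. The sketch below is my own addition, not part of the original post; the names jitter_sleep and retry and the chosen limits are illustrative assumptions.

import random
import time
from functools import wraps
from selenium.common.exceptions import TimeoutException


def jitter_sleep(base=4.0, spread=3.0):
    """Sleep a random 4-7 s so page turns do not land on a fixed cadence."""
    time.sleep(base + random.uniform(0, spread))


def retry(max_attempts=3):
    """Retry a Selenium step a bounded number of times instead of recursing."""
    def decorator(fn):
        @wraps(fn)
        def wrapper(*args, **kwargs):
            for attempt in range(1, max_attempts + 1):
                try:
                    return fn(*args, **kwargs)
                except TimeoutException:
                    if attempt == max_attempts:
                        raise  # give up loudly after the last attempt
                    jitter_sleep()
        return wrapper
    return decorator

# Usage: decorate the page-turning step, then call it exactly as before.
# @retry(max_attempts=3)
# def next_page(page_num):
#     ...

With this in place, next_page() fails with a clear TimeoutException after three attempts instead of recursing indefinitely, and the delay between pages no longer looks machine-regular.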