欢迎来到15资源

nsfw(现http://picxxxx.top/)网站信息

日期: 2023-01-19 22:44:10

python爬取并存储nsfwpics网页源码(现网址http://picxxxx.top/),存储图片链接和图片来源网站并存入mysql。

还没有开始爬图,目前存储了334*6页的网页源码,和这些网页上的图片信息。

为什么以前的一些图片不能访问了?有没有hxd以前爬过的?我想要2020-5-18之后更新的图片。

使用mysql 8.0.23 社区版,python3.8.5。

mysqldump备份出来的库:https://pan.baidu.com/s/1m0z1V6t-bRk5nXHj_Wyydw FULI

源码,写的很乱(需要自行修改):

import requests

from requests.adapters import HTTPAdapter

from bs4 import BeautifulSoup

import time

import pymysql

import logging

import re

import os

import traceback

# Crawl configuration.
root_url = 'http://picxxxx.top/'

# Inclusive range of index pages to process on this run.
start_page, end_page = 1, 10

# Detail-page URLs look like http://picxxxx.top/<slug>.html
# NOTE(review): the posted source lost its backslashes; the dots are
# re-escaped here per the obvious intent of the pattern.
url_p = re.compile(r'^http://picxxxx\.top/.+\.html', flags=re.I)

# Local folders where downloaded media are saved.
img_se_path = r'D:\spider-picxxxx\pics'
video_se_path = r'D:\spider-picxxxx\videos'

# Directory page:
# Link: unnamed page
#
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36 Edg/88.0.705.63'
}

def get_logger():
    """Build a DEBUG-level logger writing to a dated file; return None on failure.

    One log file per day (mode='w' truncates it on every run).
    """
    try:
        formatter = logging.Formatter(
            '%(lineno)d : %(asctime)s : %(levelname)s : %(funcName)s : %(message)s')
        # NOTE(review): the posted source lost the path backslashes;
        # restored to match the other D:\spider-picxxxx paths in this file.
        fileHandler = logging.FileHandler(
            'D:\\spider-picxxxx\\运行记录{}.txt'.format(time.strftime('-%Y-%m-%d')),
            mode='w', encoding='utf-8')
        fileHandler.setFormatter(formatter)
        log = logging.getLogger('logger')
        log.setLevel(logging.DEBUG)
        log.addHandler(fileHandler)
        return log
    except Exception:
        # Was a bare `except:` — narrowed so Ctrl-C is not swallowed.
        print('logger初始化出错')
        return None

# --- run bookkeeping -------------------------------------------------------
start_time = time.process_time()
log = get_logger()

start_msg = '开始时间:{}'.format(time.strftime('%Y-%m-%d %H:%M:%S'))
print(start_msg)
log.info(start_msg)

# --- MySQL connection used by every stage of the pipeline ------------------
db_params = dict(host='127.0.0.1',
                 user='user',
                 passwd='passwd',
                 db='nsfwpic',
                 charset='utf8')
conn = pymysql.connect(**db_params)
cur = conn.cursor()

# --- HTTP session with automatic retries -----------------------------------
# Counting the initial attempt, each request is tried at most 4 times.
session = requests.Session()
retry_adapter = HTTPAdapter(max_retries=3)
session.mount('http://', retry_adapter)
session.mount('https://', retry_adapter)

def make_soup(url):
    """GET *url* through the retrying session and return a parsed BeautifulSoup tree."""
    resp = session.get(url, timeout=60, headers=headers)
    # Bug fix: the detected encoding must be applied BEFORE reading .text;
    # the original assigned it afterwards, so the fix never took effect
    # (mojibake on non-default-encoded pages).
    resp.encoding = resp.apparent_encoding
    return BeautifulSoup(resp.text, 'lxml')

def get_last_page(root_url):
    """Return the site's total page count, read from a pagination link on *root_url*."""
    bsObj = make_soup(root_url)
    # Pagination links look like http://picxxxx.top/page/NNN/ .
    # Generalized from the original's hard-coded \d{3} (exactly three digits)
    # to \d+ with a capture group, so page counts below 100 or above 999 work.
    page_p = re.compile(r'^http://picxxxx\.top/page/(\d+)/', flags=re.I)
    last_page_tag = bsObj.find('a', {'href': page_p})
    last_page = int(page_p.search(last_page_tag['href']).group(1))
    msg = '现总页数为{}'.format(last_page)
    log.info(msg)
    print(msg)
    return last_page

def get_new_pics_pages(root_url):
    """Scan index pages start_page..end_page and record unseen detail-page URLs.

    New URLs are INSERTed into picxxxx_pages; already-known ones are only logged.
    """
    # Logged for information only; the result is not needed here.
    get_last_page(root_url)
    msg = '本次从{}页到{}页'.format(start_page, end_page)
    log.info(msg)
    print(msg)
    index_pages = ['http://picxxxx.top/page/{}/'.format(i)
                   for i in range(start_page, end_page + 1)]
    for index_page in index_pages:
        bsObj = make_soup(index_page)
        urls = [tag['href'] for tag in bsObj.find_all('a', {'href': url_p})]
        for url in urls:
            # Parameterized queries: the original interpolated the URL with %,
            # which breaks on quotes and is an SQL-injection vector.
            cur.execute('SELECT no FROM picxxxx_pages WHERE url=%s;', (url,))
            if cur.fetchall():
                log.info('旧页面{}'.format(url))
            else:
                cur.execute('INSERT INTO picxxxx_pages (url) VALUES (%s);', (url,))
                conn.commit()
                log.info('发现新页面:{}'.format(url))
        # Be polite to the server between index pages.
        time.sleep(1)

def se_new_pics_pages_content(root_url):
    """Fetch and store raw HTML for every recorded page whose content is still NULL."""
    cur.execute('SELECT url FROM picxxxx_pages WHERE content is NULL')
    new_pics_pages = cur.fetchall()
    for (url,) in new_pics_pages:
        try:
            content = str(make_soup(url))
        except requests.RequestException:
            # Mark unfetchable pages as "unknown" so they are not retried forever.
            # (The original had this intent on an unreachable branch that
            # INSERTed a duplicate row instead of updating the existing one.)
            cur.execute('UPDATE picxxxx_pages SET content=%s WHERE url=%s;',
                        ('unknown', url))
            conn.commit()
            log.info('失败抓取页面:{}'.format(url))
            time.sleep(1)
            continue
        # Escape backslashes and quotes before storing, matching the format of
        # the rows already present in the published database dump.
        # (The posted source lost these backslashes and did not even parse.)
        content = content.replace('\\', '\\\\')
        content = content.replace('"', '\\"')
        content = content.replace("'", "\\'")
        cur.execute('UPDATE picxxxx_pages SET content=%s WHERE url=%s;',
                    (content, url))
        conn.commit()
        log.info('抓取页面:{}'.format(url))
        time.sleep(1)

def update_img_urls():
    """Parse stored page HTML, extract <img> URLs and insert new ones into picxxxx_pics.

    Each processed page is then flagged checked=1 so it is not parsed again.
    """
    cur.execute(
        'SELECT url,content FROM picxxxx_pages WHERE checked=0 AND content!="unknown" AND content IS NOT NULL;')
    to_do_list = cur.fetchall()
    for url, content in to_do_list:
        # Undo the escaping applied when the content was stored.
        # Order matters: collapse doubled backslashes first.
        content = content.replace('\\\\', '\\')
        content = content.replace('\\"', '"')
        content = content.replace("\\'", "'")
        bsObj = BeautifulSoup(content, 'lxml')
        # A set deduplicates directly (the original built a list, converted
        # it to a set, then back to a list).
        img_urls = set()
        for img_tag in bsObj.find_all('img'):
            for attr in ('src', 'data-src'):
                img_url = img_tag.get(attr)
                if img_url is not None:
                    img_urls.add(img_url)
        for img_url in sorted(img_urls):
            # Parameterized queries instead of %-interpolation (injection-safe).
            cur.execute('SELECT no FROM picxxxx_pics WHERE img_url=%s;', (img_url,))
            if cur.fetchall():
                continue
            cur.execute('INSERT INTO picxxxx_pics (url,img_url) VALUES (%s,%s);',
                        (url, img_url))
            conn.commit()
            log.info('图片入库{}'.format(img_url))
        cur.execute('UPDATE picxxxx_pages SET checked=1 WHERE url=%s;', (url,))
        conn.commit()

def download_img():
    """Download every picture flagged downloaded=0 and record its local path."""
    cur.execute('''SELECT url,img_url FROM picxxxx_pics WHERE downloaded=0;''')
    items = cur.fetchall()
    total_num = len(items)
    # enumerate instead of items.index(item): the latter is an O(n) scan per
    # iteration (O(n^2) overall) and miscounts on duplicate rows.
    for num, (url, img_url) in enumerate(items, start=1):
        headers['Referer'] = url  # 加上来源网站,规避网站的反盗链
        img = session.get(img_url, timeout=30, headers=headers).content
        # File name: strip any #fragment, take the last path component,
        # and prefix a timestamp to avoid collisions.
        img_name = img_url.split('#')[0].split('/')[-1]
        img_name = time.strftime('%Y-%m-%d-%H%M%S') + img_name
        place = os.path.join(img_se_path, img_name)
        with open(place, 'wb') as f:
            f.write(img)  # `with` closes the file; the explicit close() was redundant
        msg = '图片下载成功{}/{},{:>100}'.format(num, total_num, img_url)
        print(' '+msg, end='')
        log.info(msg)
        # Parameterized query — no manual backslash/quote escaping of the
        # Windows path is needed (the original's escape lines did not parse).
        cur.execute('UPDATE picxxxx_pics SET place=%s,downloaded=1 where img_url=%s;',
                    (place, img_url))
        conn.commit()
        time.sleep(0.5)

# Example img_url with a #fragment carrying display hints:
# https://search.pstatic.net/common?src=https://i.imgur.com/DbxbIr4.jpg#vwid=853&vhei=1280
#
# Video handling (update_video_urls / download_video) — not implemented yet.
#

def main():
    """Pipeline entry point.

    Stages run in order: discover pages, fetch their HTML, extract image
    URLs, download images. All stages except update_img_urls() are
    currently disabled by the author for this run.
    """
    # get_new_pics_pages(root_url)
    # se_new_pics_pages_content(root_url)
    update_img_urls()
    # download_img()
    # update_video_urls()
    # download_video()

if __name__ == '__main__':
    try:
        # Retry loop: on any failure, log the traceback, rebuild the MySQL
        # connection (it may have timed out), wait 15 minutes and start over.
        # The original retried exactly once and crashed uncaught if the
        # retry itself failed.
        while True:
            try:
                main()
                break
            except Exception:
                traceback.print_exc()
                log.error(traceback.format_exc())
                conn = pymysql.connect(host='127.0.0.1',
                                       user='user',
                                       passwd='passwd',
                                       db='nsfwpic',
                                       charset='utf8')
                cur = conn.cursor()
                log.error('休息15分钟,重新出发')
                time.sleep(900)
    finally:
        # Always release the DB handles, even on Ctrl-C.
        cur.close()
        conn.close()

# nsfwpic database — reference DDL for the two tables this script uses.
# NOTE(review): the Chinese annotations inside these strings make them
# invalid SQL as-is; they document the column meanings only. Kept verbatim.

'''CREATE TABLE picxxxx_pages(从目录页上爬下来的图片页

no INT UNSIGNED NOT NULL AUTO_INCREMENT,编号

url VARCHAR(255) NOT NULL DEFAULT "unknown",域名

checked TINYINT NOT NULL DEFAULT 0,页面上的图片是否已入库

content MEDIUMTEXT,页面内容(原html格式)

created TIMESTAMP DEFAULT CURRENT_TIMESTAMP,建档时间

PRIMARY KEY(no));

'''

'''CREATE TABLE picxxxx_pics(图片信息库

no INT UNSIGNED NOT NULL AUTO_INCREMENT,编号

url VARCHAR(255) NOT NULL DEFAULT "unknown",来自哪个网页

img_url VARCHAR(255) NOT NULL DEFAULT "unknown",图片域名

place VARCHAR(255) NOT NULL DEFAULT "unknown",存放位置

downloaded TINYINT NOT NULL DEFAULT 0,是否下载过

created TIMESTAMP DEFAULT CURRENT_TIMESTAMP,建档时间

PRIMARY KEY(no));

'''

复制代码

网站 信息 nsfw


上一篇:免费3天优酷正式会员!!

下一篇:艾尔登法环

  • 在线客服

    官方微信

    仅处理投诉、举报及平台使用问题;
    商品问题请咨询商家客服!

浏览记录