Scraping data from the web with XPath and storing it in Django models


I was helping a friend build a website and needed some product data. Since the site resells another company's products, I scraped the product data directly from that company's site.

1. Designing the database

from django.db import models
from uuslug import slugify
import uuid
import os


def products_directory_path(instance, filename):
    ext = filename.split('.')[-1]
    filename = '{}.{}'.format(uuid.uuid4().hex[:8], ext)
    # return the whole path to the file
    return os.path.join('images', "products", instance.title, filename)


def product_relatedimage_directory_path(instance, filename):
    ext = filename.split('.')[-1]
    filename = '{}.{}'.format(uuid.uuid4().hex[:8], ext)
    # return the whole path to the file
    return os.path.join('images', "product_relatedimage", instance.product.title, filename)


class ProductsCategory(models.Model):
    """产品分类"""
    name = models.CharField('产品分类名', max_length=80, unique=True)
    description = models.TextField('产品分类描述', blank=True, null=True)
    slug = models.SlugField('slug', max_length=80, blank=True, null=True)
    parent_category = models.ForeignKey('self', verbose_name="父级分类", blank=True, null=True, on_delete=models.CASCADE)

    def save(self, *args, **kwargs):
        if not self.id or not self.slug:
            self.slug = slugify(self.name)
        super().save(*args, **kwargs)

    def __str__(self):
        return self.name

    class Meta:
        ordering = ['name']
        verbose_name = "产品分类"
        verbose_name_plural = verbose_name


class ProductsTag(models.Model):
    """产品标签"""
    name = models.CharField('产品标签名', max_length=30, unique=True)
    slug = models.SlugField('slug', max_length=40)

    def __str__(self):
        return self.name

    def save(self, *args, **kwargs):
        if not self.id or not self.slug:
            self.slug = slugify(self.name)
        super().save(*args, **kwargs)

    class Meta:
        ordering = ['name']
        verbose_name = "产品标签"
        verbose_name_plural = verbose_name


class Product(models.Model):
    title = models.CharField('标题', max_length=255, unique=True)
    slug = models.SlugField('slug', max_length=255, blank=True, null=True)
    jscs = models.TextField('技术参数', blank=True, null=True)
    image = models.ImageField(upload_to=products_directory_path, verbose_name="产品图片")
    views = models.PositiveIntegerField('浏览量', default=0)
    category = models.ForeignKey('ProductsCategory', verbose_name='分类', on_delete=models.CASCADE, blank=True, null=True)
    tags = models.ManyToManyField('ProductsTag', verbose_name='标签集合', blank=True)

    def save(self, *args, **kwargs):
        if not self.id or not self.slug:
            self.slug = slugify(self.title)
        super().save(*args, **kwargs)

    def update_views(self):
        self.views += 1
        self.save(update_fields=['views'])

    def get_pre(self):
        return Product.objects.filter(id__lt=self.id).order_by('-id').first()

    def get_next(self):
        return Product.objects.filter(id__gt=self.id).order_by('id').first()

    def __str__(self):
        return self.title

    class Meta:
        verbose_name = "产品"
        verbose_name_plural = verbose_name


class ProductAdvantage(models.Model):
    content = models.TextField('产品优势', blank=True, null=True)
    product = models.ForeignKey(Product, on_delete=models.CASCADE, blank=True, null=True)

    def __str__(self):
        return self.content

    class Meta:
        verbose_name = "产品优势"
        verbose_name_plural = verbose_name


class ProductBody(models.Model):
    body = models.CharField('产品内容', max_length=256, blank=True, null=True)
    product = models.ForeignKey(Product, on_delete=models.CASCADE, blank=True, null=True)

    def __str__(self):
        return self.product.title

    class Meta:
        verbose_name = "产品内容"
        verbose_name_plural = verbose_name
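
With the models in place, they can be sanity-checked in the Django shell. This is a minimal sketch: it assumes the app is installed as products (as in the imports used in section 3), that migrations have been applied, and the category names are made-up examples.

from products.models import ProductsCategory

# slug is filled in automatically by the overridden save()
parent = ProductsCategory.objects.create(name="搅拌站")
child = ProductsCategory.objects.create(name="混凝土搅拌站", parent_category=parent)
print(child.slug, child.parent_category.name)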

2. Writing the scraper

2.1 A function that fetches a page's HTML

import requests
from lxml.html import etree  # used by the parsing snippets in the next sections


def get_one_page(url):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
        res = requests.get(url=url, headers=headers)
        res.encoding = 'utf-8'
        if res.status_code == 200:
            return res.text
        else:
            return None
    except Exception:
        return None
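
A quick smoke test of the helper, using the base product-listing page that the later sections start from:

html = get_one_page('http://www.kexinjianji.com/product/hzshntjbz_1/')
print(html is not None)  # True when the request succeeded with HTTP 200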

2.2 Getting every product-category page link from the base page

if __name__ == '__main__':
    content = get_one_page(url)
    tree = etree.HTML(content)
    # product-category page URLs
    catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
    # turn each relative URL into an absolute one
    for url in catgory_urls:
        url = 'http://www.kexinjianji.com' + url
        print(url)

2.3 Getting every product link from each category page

if __name__ == '__main__':
    content = get_one_page(url)
    tree = etree.HTML(content)
    # category name
    catgory = tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
    print("产品分类:" + catgory[0])
    # product URLs under this category
    urls = tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
    # turn each relative URL into an absolute one
    for url in urls:
        url = 'http://www.kexinjianji.com' + url
        print(url)
    print("=====================================================")

Combining the two prints out every product link:

if __name__ == '__main__':
    content = get_one_page(url)
    tree = etree.HTML(content)
    # product-category page URLs
    catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
    # visit each category page
    for url in catgory_urls:
        url = 'http://www.kexinjianji.com' + url
        content = get_one_page(url)
        tree = etree.HTML(content)
        # category name
        catgory = tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
        print("产品分类:" + catgory[0])
        # product URLs under this category
        urls = tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
        # print each product URL
        for url in urls:
            url = 'http://www.kexinjianji.com' + url
            print(url)
        print("=====================================================")

2.4 Parsing each product page with XPath

if __name__ == '__main__':
    content = get_one_page(url)
    tree = etree.HTML(content)
    # product name
    title = tree.xpath('//*[@id="wrap"]//h1/text()')
    images = tree.xpath('//div[@class="sol_tj_left"]/a/img/@src')
    # product image
    images_url = 'http://www.kexinjianji.com' + images[0]
    # performance features
    xntd = tree.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
    # technical parameters (keep the whole <table> as an HTML string)
    jscs = tree.xpath('//table')[0]
    jscs_str = etree.tostring(jscs, encoding='utf-8').decode('utf-8')
    # product description paragraphs
    cpnr = tree.xpath('//div[@class="describe"]/p')
    print('产品名称:' + title[0])
    print('产品图片:' + images_url)
    for td in xntd:
        print('性能特点:' + td)
    print('技术参数:' + jscs_str)
    for cp in cpnr:
        # string(.) returns all the text under the current element
        cp = cp.xpath('string(.)')
        print('产品内容:' + cp)
    print('============================================')

Putting all three together retrieves every product's information:

if __name__ == '__main__':
    content = get_one_page(url)
    tree = etree.HTML(content)
    # product-category page URLs
    catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
    # visit each category page
    for url in catgory_urls:
        url = 'http://www.kexinjianji.com' + url
        content = get_one_page(url)
        tree = etree.HTML(content)
        # category name
        catgory = tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
        # product URLs under this category
        urls = tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
        # visit each product page
        for url in urls:
            url = 'http://www.kexinjianji.com' + url
            content = get_one_page(url)
            try:
                tree = etree.HTML(content)
                # product name
                title = tree.xpath('//*[@id="wrap"]//h1/text()')
                images = tree.xpath('//div[@class="sol_tj_left"]/a/img/@src')
                # product image
                images_url = 'http://www.kexinjianji.com' + images[0]
                # performance features
                xntd = tree.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
                # technical parameters (keep the whole <table> as an HTML string)
                jscs = tree.xpath('//table')[0]
                jscs_str = etree.tostring(jscs, encoding='utf-8').decode('utf-8')
                # product description paragraphs
                cpnr = tree.xpath('//div[@class="describe"]/p')
                print("产品分类:" + catgory[0])
                print('产品链接:' + url)
                print('产品名称:' + title[0])
                print('产品图片:' + images_url)
                for td in xntd:
                    print('性能特点:' + td.strip())
                # print('技术参数:' + jscs_str)
                for cp in cpnr:
                    # string(.) returns all the text under the current element
                    cp = cp.xpath('string(.)')
                    print('产品内容:' + cp)
                print('============================================')
            except Exception as e:
                print(e)
                print('出错url:' + url)
                pass

3. Saving the data into the Django models

import requests
from lxml.html import etree
import os
import django
import uuid
from django.core.files.base import ContentFile

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "jiaobanzhan.settings")
django.setup()

from products.models import ProductBody, ProductsCategory, Product, ProductAdvantage

url = 'http://www.kexinjianji.com/product/hzshntjbz_1/'


def get_one_page(url):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
        res = requests.get(url=url, headers=headers, timeout=10)
        res.encoding = 'utf-8'
        if res.status_code == 200:
            return res.text
        else:
            return None
    except Exception:
        return None


if __name__ == '__main__':
    content = get_one_page(url)
    tree = etree.HTML(content)
    # product-category page URLs
    catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
    # visit each category page
    for url in catgory_urls:
        url = 'http://www.kexinjianji.com' + url
        content = get_one_page(url)
        tree = etree.HTML(content)
        # category name
        p_catgory = tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
        # product URLs under this category
        urls = tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
        # visit each product page
        for url in urls:
            url = 'http://www.kexinjianji.com' + url
            content = get_one_page(url)
            try:
                tree = etree.HTML(content)
                # product name
                title = tree.xpath('//*[@id="wrap"]//h1/text()')
                images = tree.xpath('//div[@class="sol_tj_left"]/a/img/@src')
                # product image
                images_url = 'http://www.kexinjianji.com' + images[0]
                # performance features
                xntd = tree.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
                # technical parameters (keep the whole <table> as an HTML string)
                jscs = tree.xpath('//table')[0]
                jscs_str = etree.tostring(jscs, encoding='utf-8').decode('utf-8')
                # product description paragraphs
                cpnr = tree.xpath('//div[@class="describe"]/p')
                # reuse the category if it already exists, otherwise create it
                catgory = p_catgory[0]
                products_catgory = ProductsCategory.objects.filter(name=catgory).exists()
                if products_catgory:
                    products_catgory = ProductsCategory.objects.get(name=catgory)
                else:
                    products_catgory = ProductsCategory(name=catgory)
                    products_catgory.save()
                print(products_catgory)

                # download the product image and save it
                image_content = requests.get(url=images_url)
                ext = images_url.split('.')[-1]  # file extension
                filename = '{}.{}'.format(uuid.uuid4().hex[:8], ext)  # random file name
                upload_image_file = ContentFile(image_content.content, name=filename)  # wrap the bytes as a Django file
                product = Product(title=title[0], jscs=jscs_str, image=upload_image_file, category=products_catgory)
                product.save()
                for td in xntd:
                    product_advantage = ProductAdvantage()
                    product_advantage.content = td
                    product_advantage.product = product
                    product_advantage.save()
                for cp in cpnr:
                    cp = cp.xpath('string(.)')
                    product_body = ProductBody()
                    product_body.body = cp
                    product_body.product = product
                    product_body.save()
            except Exception as e:
                print(e)
                print('出错url:' + url)
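
As an aside, the exists()/get() pair used for the category lookup could be collapsed into a single call with Django's get_or_create, which returns the object along with a flag saying whether it was just created. This is an optional refactor, not what the script above does:

products_catgory, created = ProductsCategory.objects.get_or_create(name=catgory)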

Finally, I fixed the failed URLs by hand (on those pages the technical parameters could not be scraped because they are published as an image rather than a table).
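
If you would rather not patch those pages by hand, one option is to guard the table lookup so the script keeps going; the empty-string fallback below is my own choice, not part of the original script:

tables = tree.xpath('//table')
if tables:
    jscs_str = etree.tostring(tables[0], encoding='utf-8').decode('utf-8')
else:
    jscs_str = ''  # no <table> on the page (the specs are an image); fill it in by hand later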

4. Summary

1. When extracting element content with XPath: the p tags here have span tags nested inside them, with source HTML like this:

<div class="describe" style="position: relative;"> 
      <p><span>板    宽:</span>1500mm</p> 
      <p><span>板    厚:</span>4.5 mm</p> 
      <p><span>出料口:</span>6口</p> 
      <p><span>重    量:</span>6000 kg</p>
</div>

Using XPath to get the p-tag content, the result I wanted looks like this:
板 宽:1500mm
板 厚:4.5 mm
出料口:6口
重 量:6000 kg
The following XPath only returns the pieces separately, which is not what I wanted:

//div[@class="describe"]/p/span/text()|//div[@class="describe"]/p/text()

The fix I found after searching on Baidu is to use xpath('string(.)').
1. First select all the p tags

cpnr = tree.xpath('//div[@class="describe"]/p')

2. Then call string(.) on each one to get all of its text content

cp = cp.xpath('string(.)')

Looping over every p tag this way gives one clean line per tag; see the sketch below.
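
Putting the two steps together on the HTML snippet above gives the output I was after (a self-contained check, runnable outside the scraper):

from lxml import etree

html = '''<div class="describe" style="position: relative;">
      <p><span>板    宽:</span>1500mm</p>
      <p><span>板    厚:</span>4.5 mm</p>
      <p><span>出料口:</span>6口</p>
      <p><span>重    量:</span>6000 kg</p>
</div>'''

tree = etree.HTML(html)
for p in tree.xpath('//div[@class="describe"]/p'):
    # string(.) joins the <span> text and its tail into one line, e.g. 板    宽:1500mm
    print(p.xpath('string(.)'))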

Original post: https://blog.csdn.net/cll_869241/article/details/114005783