From 5063429ab840bea76b14c08e5ae470d1f128177e Mon Sep 17 00:00:00 2001 From: "stephen.yu" Date: Wed, 13 Mar 2024 10:02:23 +0800 Subject: [PATCH] first commit --- .idea/.gitignore | 8 + .idea/RecipeCrawler.iml | 14 + .../inspectionProfiles/profiles_settings.xml | 6 + .idea/misc.xml | 7 + .idea/vcs.xml | 6 + README.md | 0 Recipe/__init__.py | 0 Recipe/admin.py | 3 + Recipe/apps.py | 6 + Recipe/migrations/0001_initial.py | 57 +++ .../migrations/0002_dish_insoluble_fiber.py | 18 + ...er_dish_ca_alter_dish_calories_and_more.py | 158 ++++++++ Recipe/migrations/__init__.py | 0 Recipe/models.py | 45 +++ Recipe/tests.py | 3 + Recipe/views.py | 336 ++++++++++++++++++ RecipeCrawler/__init__.py | 0 RecipeCrawler/asgi.py | 16 + RecipeCrawler/settings.py | 125 +++++++ RecipeCrawler/urls.py | 24 ++ RecipeCrawler/wsgi.py | 16 + manage.py | 23 ++ recipe.py | 24 ++ 23 files changed, 895 insertions(+) create mode 100644 .idea/.gitignore create mode 100644 .idea/RecipeCrawler.iml create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/vcs.xml create mode 100644 README.md create mode 100644 Recipe/__init__.py create mode 100644 Recipe/admin.py create mode 100644 Recipe/apps.py create mode 100644 Recipe/migrations/0001_initial.py create mode 100644 Recipe/migrations/0002_dish_insoluble_fiber.py create mode 100644 Recipe/migrations/0003_alter_dish_ca_alter_dish_calories_and_more.py create mode 100644 Recipe/migrations/__init__.py create mode 100644 Recipe/models.py create mode 100644 Recipe/tests.py create mode 100644 Recipe/views.py create mode 100644 RecipeCrawler/__init__.py create mode 100644 RecipeCrawler/asgi.py create mode 100644 RecipeCrawler/settings.py create mode 100644 RecipeCrawler/urls.py create mode 100644 RecipeCrawler/wsgi.py create mode 100644 manage.py create mode 100644 recipe.py diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ 
b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/RecipeCrawler.iml b/.idea/RecipeCrawler.iml new file mode 100644 index 0000000..e2e8179 --- /dev/null +++ b/.idea/RecipeCrawler.iml @@ -0,0 +1,14 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..6d2518d --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..e69de29 diff --git a/Recipe/__init__.py b/Recipe/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Recipe/admin.py b/Recipe/admin.py new file mode 100644 index 0000000..8c38f3f --- /dev/null +++ b/Recipe/admin.py @@ -0,0 +1,3 @@ +from django.contrib import admin + +# Register your models here. 
diff --git a/Recipe/apps.py b/Recipe/apps.py new file mode 100644 index 0000000..1d52408 --- /dev/null +++ b/Recipe/apps.py @@ -0,0 +1,6 @@ +from django.apps import AppConfig + + +class RecipeConfig(AppConfig): + default_auto_field = 'django.db.models.BigAutoField' + name = 'Recipe' diff --git a/Recipe/migrations/0001_initial.py b/Recipe/migrations/0001_initial.py new file mode 100644 index 0000000..b458ae6 --- /dev/null +++ b/Recipe/migrations/0001_initial.py @@ -0,0 +1,57 @@ +# Generated by Django 5.0.2 on 2024-03-05 08:14 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ] + + operations = [ + migrations.CreateModel( + name='Dish', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('name', models.CharField(max_length=255, unique=True)), + ('image', models.TextField()), + ('likes', models.IntegerField(default=0)), + ('tags', models.TextField()), + ('indications', models.TextField()), + ('Calories', models.CharField(max_length=255, unique=True)), + ('Salt', models.CharField(max_length=255, unique=True)), + ('Protein', models.CharField(max_length=255, unique=True)), + ('Total_fat', models.CharField(max_length=255, unique=True)), + ('Total_Carbohydrate', models.CharField(max_length=255, unique=True)), + ('Total_sugar', models.CharField(max_length=255, unique=True)), + ('Dietary_fiber', models.CharField(max_length=255, unique=True)), + ('Soluble_fiber', models.CharField(max_length=255, unique=True)), + ('K', models.CharField(max_length=255, unique=True)), + ('Ca', models.CharField(max_length=255, unique=True)), + ('Mg', models.CharField(max_length=255, unique=True)), + ('P', models.CharField(max_length=255, unique=True)), + ('Fe', models.CharField(max_length=255, unique=True)), + ('Zn', models.CharField(max_length=255, unique=True)), + ('I', models.CharField(max_length=255, unique=True)), + ('Cholesterol', 
models.CharField(max_length=255, unique=True)), + ('Vitamin_B1', models.CharField(max_length=255, unique=True)), + ('Vitamin_B2', models.CharField(max_length=255, unique=True)), + ('Vitamin_C', models.CharField(max_length=255, unique=True)), + ('Vitamin_B6', models.CharField(max_length=255, unique=True)), + ('Vitamin_B12', models.CharField(max_length=255, unique=True)), + ('Folate', models.CharField(max_length=255, unique=True)), + ('Vitamin_A', models.CharField(max_length=255, unique=True)), + ('Vitamin_D', models.CharField(max_length=255, unique=True)), + ('Vitamin_K', models.CharField(max_length=255, unique=True)), + ('Vitamin_E', models.CharField(max_length=255, unique=True)), + ('Saturated_fatty_acid', models.CharField(max_length=255, unique=True)), + ('Monounsaturated_fatty_acid', models.CharField(max_length=255, unique=True)), + ('Polyunsaturated_fatty_acid', models.CharField(max_length=255, unique=True)), + ('Ingredients', models.TextField()), + ('Steps', models.JSONField()), + ('Step_images_Base64', models.JSONField()), + ], + ), + ] diff --git a/Recipe/migrations/0002_dish_insoluble_fiber.py b/Recipe/migrations/0002_dish_insoluble_fiber.py new file mode 100644 index 0000000..9b19949 --- /dev/null +++ b/Recipe/migrations/0002_dish_insoluble_fiber.py @@ -0,0 +1,18 @@ +# Generated by Django 5.0.2 on 2024-03-06 06:06 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('Recipe', '0001_initial'), + ] + + operations = [ + migrations.AddField( + model_name='dish', + name='Insoluble_fiber', + field=models.CharField(blank=True, max_length=255, null=True), + ), + ] diff --git a/Recipe/migrations/0003_alter_dish_ca_alter_dish_calories_and_more.py b/Recipe/migrations/0003_alter_dish_ca_alter_dish_calories_and_more.py new file mode 100644 index 0000000..17e1df6 --- /dev/null +++ b/Recipe/migrations/0003_alter_dish_ca_alter_dish_calories_and_more.py @@ -0,0 +1,158 @@ +# Generated by Django 5.0.2 on 2024-03-06 
06:23 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('Recipe', '0002_dish_insoluble_fiber'), + ] + + operations = [ + migrations.AlterField( + model_name='dish', + name='Ca', + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name='dish', + name='Calories', + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name='dish', + name='Cholesterol', + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name='dish', + name='Dietary_fiber', + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name='dish', + name='Fe', + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name='dish', + name='Folate', + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name='dish', + name='I', + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name='dish', + name='K', + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name='dish', + name='Mg', + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name='dish', + name='Monounsaturated_fatty_acid', + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name='dish', + name='P', + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name='dish', + name='Polyunsaturated_fatty_acid', + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name='dish', + name='Protein', + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name='dish', + name='Salt', 
+ field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name='dish', + name='Saturated_fatty_acid', + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name='dish', + name='Soluble_fiber', + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name='dish', + name='Total_Carbohydrate', + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name='dish', + name='Total_fat', + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name='dish', + name='Total_sugar', + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name='dish', + name='Vitamin_A', + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name='dish', + name='Vitamin_B1', + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name='dish', + name='Vitamin_B12', + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name='dish', + name='Vitamin_B2', + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name='dish', + name='Vitamin_B6', + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name='dish', + name='Vitamin_C', + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name='dish', + name='Vitamin_D', + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name='dish', + name='Vitamin_E', + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AlterField( + model_name='dish', + name='Vitamin_K', + field=models.CharField(blank=True, max_length=255, null=True), 
+        ),
+        migrations.AlterField(
+            model_name='dish',
+            name='Zn',
+            field=models.CharField(blank=True, max_length=255, null=True),
+        ),
+    ]
diff --git a/Recipe/migrations/__init__.py b/Recipe/migrations/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/Recipe/models.py b/Recipe/models.py
new file mode 100644
index 0000000..ecc0288
--- /dev/null
+++ b/Recipe/models.py
@@ -0,0 +1,45 @@
+from django.db import models
+
+
+class Dish(models.Model):
+    """A recipe stored by the crawler in Recipe/views.py, including its nutrition values."""
+
+    name = models.CharField(max_length=255, unique=True)  # each dish name is assumed to be unique
+    image = models.TextField()  # Base64-encoded dish photo
+    likes = models.IntegerField(default=0)
+    tags = models.TextField()  # list of tags kept as one comma-separated string
+    indications = models.TextField()
+    # Nutrition values keep the text scraped from the page, hence CharField; all are optional.
+    Calories = models.CharField(max_length=255, unique=False, blank=True, null=True)
+    Salt = models.CharField(max_length=255, unique=False, blank=True, null=True)
+    Protein = models.CharField(max_length=255, unique=False, blank=True, null=True)
+    Total_fat = models.CharField(max_length=255, unique=False, blank=True, null=True)
+    Total_Carbohydrate = models.CharField(max_length=255, unique=False, blank=True, null=True)
+    Total_sugar = models.CharField(max_length=255, unique=False, blank=True, null=True)
+    Dietary_fiber = models.CharField(max_length=255, unique=False, blank=True, null=True)
+    Soluble_fiber = models.CharField(max_length=255, unique=False, blank=True, null=True)
+    Insoluble_fiber = models.CharField(max_length=255, unique=False, blank=True, null=True)
+    K = models.CharField(max_length=255, unique=False, blank=True, null=True)
+    Ca = models.CharField(max_length=255, unique=False, blank=True, null=True)
+    Mg = models.CharField(max_length=255, unique=False, blank=True, null=True)
+    P = models.CharField(max_length=255, unique=False, blank=True, null=True)
+    Fe = models.CharField(max_length=255, unique=False, blank=True, null=True)
+    Zn = models.CharField(max_length=255, unique=False, blank=True, null=True)
+    I = models.CharField(max_length=255, unique=False, blank=True, null=True)
+    Cholesterol = models.CharField(max_length=255, unique=False, blank=True, null=True)
+    Vitamin_B1 = models.CharField(max_length=255, unique=False, blank=True, null=True)
+    Vitamin_B2 = models.CharField(max_length=255, unique=False, blank=True, null=True)
+    Vitamin_C = models.CharField(max_length=255, unique=False, blank=True, null=True)
+    Vitamin_B6 = models.CharField(max_length=255, unique=False, blank=True, null=True)
+    Vitamin_B12 = models.CharField(max_length=255, unique=False, blank=True, null=True)
+    Folate = models.CharField(max_length=255, unique=False, blank=True, null=True)
+    Vitamin_A = models.CharField(max_length=255, unique=False, blank=True, null=True)
+    Vitamin_D = models.CharField(max_length=255, unique=False, blank=True, null=True)
+    Vitamin_K = models.CharField(max_length=255, unique=False, blank=True, null=True)
+    Vitamin_E = models.CharField(max_length=255, unique=False, blank=True, null=True)
+    Saturated_fatty_acid = models.CharField(max_length=255, unique=False, blank=True, null=True)
+    Monounsaturated_fatty_acid = models.CharField(max_length=255, unique=False, blank=True, null=True)
+    Polyunsaturated_fatty_acid = models.CharField(max_length=255, unique=False, blank=True, null=True)
+    Ingredients = models.TextField()
+    Steps = models.JSONField()
+    Step_images_Base64 = models.JSONField()
diff --git a/Recipe/tests.py b/Recipe/tests.py
new file mode 100644
index 0000000..7ce503c
--- /dev/null
+++ b/Recipe/tests.py
@@ -0,0 +1,3 @@
+from django.test import TestCase
+
+# Create your tests here.
diff --git a/Recipe/views.py b/Recipe/views.py
new file mode 100644
index 0000000..78b27a5
--- /dev/null
+++ b/Recipe/views.py
@@ -0,0 +1,259 @@
+"""Django view and Playwright crawler that store oishi-kenko.com recipes as Dish rows."""
+import asyncio
+import base64
+import os
+import re
+import pandas as pd
+from pandas import DataFrame
+from playwright.async_api import Playwright, async_playwright, Error
+from asgiref.sync import sync_to_async
+import time
+from django.http import HttpResponseBadRequest, HttpResponse
+from django.views import View
+from Recipe.models import Dish
+
+RECIPE_LIST_URL = "https://oishi-kenko.com/recipes"
+RECIPE_LINK_SELECTOR = 'a.p-recipe-list-item__title-link'
+NEXT_PAGE_SELECTOR = 'span.next a[rel="next"]'
+MAX_RETRIES = 3
+
+# Login credentials are read from the environment so they never live in the repository.
+EMAIL_ENV_VAR = "OISHI_KENKO_EMAIL"
+PASSWORD_ENV_VAR = "OISHI_KENKO_PASSWORD"
+
+# One entry per nutrition table column: (cell selector, {Japanese label: Dish field}).
+NUTRITION_COLUMNS = (
+    ('.c-nutrition-table__cell--1', {
+        'エネルギー': 'Calories',
+        '食塩相当量': 'Salt',
+        'たんぱく質': 'Protein',
+        '脂質': 'Total_fat',
+        '炭水化物': 'Total_Carbohydrate',
+        '糖質': 'Total_sugar',
+        '食物繊維': 'Dietary_fiber',
+        '水溶性食物繊維': 'Soluble_fiber',
+        '不溶性食物繊維': 'Insoluble_fiber',
+        'カリウム': 'K',
+    }),
+    ('.c-nutrition-table__cell--2', {
+        'カルシウム': 'Ca',
+        'マグネシウム': 'Mg',
+        'リン': 'P',
+        '鉄': 'Fe',
+        '亜鉛': 'Zn',
+        'ヨウ素': 'I',
+        'コレステロール': 'Cholesterol',
+        'ビタミンB1': 'Vitamin_B1',
+        'ビタミンB2': 'Vitamin_B2',
+        'ビタミンC': 'Vitamin_C',
+    }),
+    ('.c-nutrition-table__cell--3', {
+        'ビタミンB6': 'Vitamin_B6',
+        'ビタミンB12': 'Vitamin_B12',
+        '葉酸': 'Folate',
+        'ビタミンA': 'Vitamin_A',
+        'ビタミンD': 'Vitamin_D',
+        'ビタミンK': 'Vitamin_K',
+        'ビタミンE': 'Vitamin_E',
+        '飽和脂肪酸': 'Saturated_fatty_acid',
+        '一価不飽和脂肪酸': 'Monounsaturated_fatty_acid',
+        '多価不飽和脂肪酸': 'Polyunsaturated_fatty_acid',
+    }),
+)
+
+
+class RecipeCrawlerView(View):
+    """Start the crawler and answer once it has finished."""
+
+    def get(self, request):
+        # The crawl runs inside this request, so the response is only sent
+        # after every list page has been processed.
+        sync_main()
+        return HttpResponse("Success!", content_type="text/plain")
+
+
+def sync_main():
+    """Run the asynchronous crawler from synchronous (view) code."""
+    # asyncio.run() drives main() on a fresh event loop until it completes.
+    asyncio.run(main())
+
+
+async def save_to_db(dish_data):
+    """Create or update the Dish whose name is ``dish_data['name']``."""
+    # The Django ORM is synchronous; sync_to_async makes the call awaitable.
+    dish, created = await sync_to_async(Dish.objects.update_or_create)(
+        name=dish_data['name'],
+        defaults=dish_data
+    )
+    action = "added" if created else "updated"
+    print(f"Dish '{dish.name}' was {action}.")
+
+
+async def fetch_image_as_base64(page, image_url):
+    """Download ``image_url`` and return it Base64-encoded, or None on failure."""
+    print("Fetching image from URL:", image_url)
+    try:
+        response = await page.request.get(image_url)
+        if response.ok:
+            image_data = await response.body()
+            return base64.b64encode(image_data).decode()
+    except Exception as e:
+        # Best effort: a missing picture must not abort the whole recipe.
+        print(f"Error fetching image: {e}")
+    return None
+
+
+def _parse_likes(text):
+    """Return the digits found in ``text`` as an int, or 0 when there are none."""
+    digits = re.sub(r'\D', '', text or '')
+    return int(digits) if digits else 0
+
+
+def _parse_nutrition_cells(cell_texts, label_to_field):
+    """Return ``{Dish field: value text}`` for the cells that carry a known label.
+
+    When several labels occur in one cell the longest one wins, so that
+    '水溶性食物繊維' is not mistaken for '食物繊維' (and likewise for the
+    fatty acid and vitamin B1/B12 labels).
+    """
+    values = {}
+    for text in cell_texts:
+        labels = [label for label in label_to_field if label in text]
+        if not labels:
+            continue
+        label = max(labels, key=len)
+        values[label_to_field[label]] = text.split(label, 1)[1].strip().replace('\n', '')
+    return values
+
+
+def _get_credentials():
+    """Return ``(email, password)`` from the environment.
+
+    Raises:
+        RuntimeError: if either environment variable is unset or empty.
+    """
+    email = os.environ.get(EMAIL_ENV_VAR)
+    password = os.environ.get(PASSWORD_ENV_VAR)
+    if not email or not password:
+        raise RuntimeError(
+            f"Set {EMAIL_ENV_VAR} and {PASSWORD_ENV_VAR} before starting the crawler."
+        )
+    return email, password
+
+
+async def _login(page):
+    """Sign in with the e-mail form and end on the recipe list page."""
+    email, password = _get_credentials()
+    await page.goto(RECIPE_LIST_URL)
+    await page.get_by_role("link", name="ログイン").click()
+    await page.get_by_role("link", name="メールアドレス でログイン").click()
+    await page.locator("#secure_account_credential_email").click()
+    await page.locator("#secure_account_credential_email").fill(email)
+    await page.locator("#secure_account_credential_email").press("Tab")
+    await page.locator("#secure_account_credential_password").fill(password)
+    await page.get_by_role("button", name="ログイン").click()
+    await page.goto(RECIPE_LIST_URL)
+
+
+async def _scrape_dish(page):
+    """Read the recipe detail page that is currently open into Dish field values."""
+    print("------菜名-----")
+    dishname = await page.text_content('.p-recipe-detail__title')
+    print(dishname)
+
+    image_url = await page.locator('.p-recipe-detail__photo-image--pc-only').get_attribute('src')
+    dish_image = await fetch_image_as_base64(page, image_url) if image_url else None
+
+    like_text = await page.text_content('.c-button-circle__top-text')
+    tags = await page.locator('.c-button-round-tag__link').all_text_contents()
+    indications = await page.locator('.c-recipes-relevant-dietary-concerns__text').all_text_contents()
+
+    # Nutrients that are missing from the page are stored as empty strings.
+    nutrition = {field: '' for _, labels in NUTRITION_COLUMNS for field in labels.values()}
+    for selector, label_to_field in NUTRITION_COLUMNS:
+        cell_texts = await page.locator(selector).all_text_contents()
+        nutrition.update(_parse_nutrition_cells(cell_texts, label_to_field))
+
+    ingredients = await page.locator('.p-recipe-ingredient-list__item').all_text_contents()
+
+    print('------作法步驟-----')
+    steps = await page.locator('.p-recipe-step__item').all_text_contents()
+    step_image_urls = await page.locator('.p-recipe-step__item-image').evaluate_all(
+        "elements => elements.map(e => e.getAttribute('src'))"
+    )
+    step_images = []
+    for step_image_url in step_image_urls:
+        encoded = await fetch_image_as_base64(page, step_image_url)
+        if encoded:
+            step_images.append(encoded)
+
+    return {
+        'name': dishname.strip().replace('\n', ''),
+        # Dish.image is a non-null TextField, so fall back to an empty string.
+        'image': dish_image or '',
+        # Dish.likes is an IntegerField; the page provides text.
+        'likes': _parse_likes(like_text),
+        'tags': ", ".join(tag.strip() for tag in tags),
+        'indications': ", ".join(indication.strip() for indication in indications),
+        **nutrition,
+        'Ingredients': ", ".join(item.strip().replace('\n', '') for item in ingredients),
+        'Steps': [step.strip().replace('\n', '') for step in steps],
+        'Step_images_Base64': step_images,
+    }
+
+
+async def _crawl_recipe(page, list_url, index):
+    """Save the ``index``-th recipe of the list page; return True on success.
+
+    Every attempt ends by reloading ``list_url`` so the next click starts from
+    the list page no matter where the failed attempt stopped.
+    """
+    for _ in range(MAX_RETRIES):
+        saved = False
+        try:
+            await page.locator(RECIPE_LINK_SELECTOR).nth(index).click()
+            await page.wait_for_load_state('networkidle')
+            await save_to_db(await _scrape_dish(page))
+            saved = True
+        except Exception as e:
+            print(f"遇到错误:{e},尝试返回并重试")
+        await page.goto(list_url)
+        if saved:
+            return True
+    print("重试次数超限,跳过当前链接")
+    return False
+
+
+async def run(playwright: Playwright):
+    """Log in, walk through every recipe list page and store each recipe."""
+    browser = await playwright.chromium.launch(headless=False)
+    try:
+        context = await browser.new_context()
+        page = await context.new_page()
+        await _login(page)
+
+        while True:
+            list_url = page.url
+            link_count = await page.locator(RECIPE_LINK_SELECTOR).count()
+            for index in range(link_count):
+                await _crawl_recipe(page, list_url, index)
+
+            next_link = page.locator(NEXT_PAGE_SELECTOR)
+            if await next_link.count() == 0:
+                # Last page: there is no "next" link to follow.
+                break
+            try:
+                await next_link.click()
+                await page.wait_for_load_state('networkidle')
+            except Error:
+                break
+        await context.close()
+    finally:
+        # Release the browser even when the crawl aborts with an exception.
+        await browser.close()
+
+
+async def main() -> None:
+    """Start Playwright and run the crawler."""
+    async with async_playwright() as playwright:
+        await run(playwright)
diff --git a/RecipeCrawler/__init__.py b/RecipeCrawler/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/RecipeCrawler/asgi.py b/RecipeCrawler/asgi.py
new file mode 100644
index 0000000..a01e473
--- /dev/null
+++ b/RecipeCrawler/asgi.py
@@ -0,0 +1,16 @@
+"""
+ASGI config for RecipeCrawler project.
+
+It exposes the ASGI callable as a module-level variable named ``application``.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/5.0/howto/deployment/asgi/
+"""
+
+import os
+
+from django.core.asgi import get_asgi_application
+
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'RecipeCrawler.settings')
+
+application = get_asgi_application()
diff --git a/RecipeCrawler/settings.py b/RecipeCrawler/settings.py
new file mode 100644
index 0000000..5096d19
--- /dev/null
+++ b/RecipeCrawler/settings.py
@@ -0,0 +1,125 @@
+"""
+Django settings for RecipeCrawler project.
+
+Generated by 'django-admin startproject' using Django 5.0.2.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/5.0/topics/settings/
+
+For the full list of settings and their values, see
+https://docs.djangoproject.com/en/5.0/ref/settings/
+"""
+
+from pathlib import Path
+
+# Build paths inside the project like this: BASE_DIR / 'subdir'.
+BASE_DIR = Path(__file__).resolve().parent.parent + + +# Quick-start development settings - unsuitable for production +# See https://docs.djangoproject.com/en/5.0/howto/deployment/checklist/ + +# SECURITY WARNING: keep the secret key used in production secret! +SECRET_KEY = 'django-insecure-89j!e^jyf!ak#t!2oxzwbk^%fmhljxi%w*epobnrz^k-*&+!wr' + +# SECURITY WARNING: don't run with debug turned on in production! +DEBUG = True + +ALLOWED_HOSTS = [] + + +# Application definition + +INSTALLED_APPS = [ + 'django.contrib.admin', + 'django.contrib.auth', + 'django.contrib.contenttypes', + 'django.contrib.sessions', + 'django.contrib.messages', + 'django.contrib.staticfiles', + 'Recipe', +] + +MIDDLEWARE = [ + 'django.middleware.security.SecurityMiddleware', + 'django.contrib.sessions.middleware.SessionMiddleware', + 'django.middleware.common.CommonMiddleware', + 'django.middleware.csrf.CsrfViewMiddleware', + 'django.contrib.auth.middleware.AuthenticationMiddleware', + 'django.contrib.messages.middleware.MessageMiddleware', + 'django.middleware.clickjacking.XFrameOptionsMiddleware', +] + +ROOT_URLCONF = 'RecipeCrawler.urls' + +TEMPLATES = [ + { + 'BACKEND': 'django.template.backends.django.DjangoTemplates', + 'DIRS': [BASE_DIR / 'templates'] + , + 'APP_DIRS': True, + 'OPTIONS': { + 'context_processors': [ + 'django.template.context_processors.debug', + 'django.template.context_processors.request', + 'django.contrib.auth.context_processors.auth', + 'django.contrib.messages.context_processors.messages', + ], + }, + }, +] + +WSGI_APPLICATION = 'RecipeCrawler.wsgi.application' + + +# Database +# https://docs.djangoproject.com/en/5.0/ref/settings/#databases + +DATABASES = { + 'default': { + 'ENGINE': 'django.db.backends.sqlite3', + 'NAME': BASE_DIR / 'recipe_db.sqlite3', + } +} + + +# Password validation +# https://docs.djangoproject.com/en/5.0/ref/settings/#auth-password-validators + +AUTH_PASSWORD_VALIDATORS = [ + { + 'NAME': 
'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', + }, + { + 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', + }, + { + 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator', + }, + { + 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', + }, +] + + +# Internationalization +# https://docs.djangoproject.com/en/5.0/topics/i18n/ + +LANGUAGE_CODE = 'en-us' + +TIME_ZONE = 'UTC' + +USE_I18N = True + +USE_TZ = True + + +# Static files (CSS, JavaScript, Images) +# https://docs.djangoproject.com/en/5.0/howto/static-files/ + +STATIC_URL = 'static/' + +# Default primary key field type +# https://docs.djangoproject.com/en/5.0/ref/settings/#default-auto-field + +DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField' diff --git a/RecipeCrawler/urls.py b/RecipeCrawler/urls.py new file mode 100644 index 0000000..aa18681 --- /dev/null +++ b/RecipeCrawler/urls.py @@ -0,0 +1,24 @@ +""" +URL configuration for RecipeCrawler project. + +The `urlpatterns` list routes URLs to views. For more information please see: + https://docs.djangoproject.com/en/5.0/topics/http/urls/ +Examples: +Function views + 1. Add an import: from my_app import views + 2. Add a URL to urlpatterns: path('', views.home, name='home') +Class-based views + 1. Add an import: from other_app.views import Home + 2. Add a URL to urlpatterns: path('', Home.as_view(), name='home') +Including another URLconf + 1. Import the include() function: from django.urls import include, path + 2. 
Add a URL to urlpatterns: path('blog/', include('blog.urls')) +""" +from django.contrib import admin +from django.urls import path +from Recipe import views as recipe_views + +urlpatterns = [ + path('start/', recipe_views.RecipeCrawlerView.as_view()), + path('admin/', admin.site.urls), +] diff --git a/RecipeCrawler/wsgi.py b/RecipeCrawler/wsgi.py new file mode 100644 index 0000000..802c9a2 --- /dev/null +++ b/RecipeCrawler/wsgi.py @@ -0,0 +1,16 @@ +""" +WSGI config for RecipeCrawler project. + +It exposes the WSGI callable as a module-level variable named ``application``. + +For more information on this file, see +https://docs.djangoproject.com/en/5.0/howto/deployment/wsgi/ +""" + +import os + +from django.core.wsgi import get_wsgi_application + +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'RecipeCrawler.settings') + +application = get_wsgi_application() diff --git a/manage.py b/manage.py new file mode 100644 index 0000000..3dfed2c --- /dev/null +++ b/manage.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python +"""Django's command-line utility for administrative tasks.""" +import os +import sys +import django + + +def main(): + """Run administrative tasks.""" + os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'RecipeCrawler.settings') + try: + from django.core.management import execute_from_command_line + except ImportError as exc: + raise ImportError( + "Couldn't import Django. Are you sure it's installed and " + "available on your PYTHONPATH environment variable? Did you " + "forget to activate a virtual environment?" 
+        ) from exc
+    execute_from_command_line(sys.argv)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/recipe.py b/recipe.py
new file mode 100644
index 0000000..88afebc
--- /dev/null
+++ b/recipe.py
@@ -0,0 +1,28 @@
+"""Playwright script that opens one recipe on oishi-kenko.com and checks its text."""
+from playwright.sync_api import Playwright, sync_playwright, expect
+
+
+def run(playwright: Playwright) -> None:
+    """Open the recipe list, visit one recipe and assert on its title and tag text."""
+    browser = playwright.chromium.launch(headless=False)
+    try:
+        context = browser.new_context()
+        page = context.new_page()
+        page.goto("https://oishi-kenko.com/recipes")
+        page.get_by_text("ねぎたっぷり 塩牛丼").click()
+        page.get_by_role("heading", name="ねぎたっぷり 塩牛丼").click()
+        page.get_by_role("heading", name="ねぎたっぷり 塩牛丼").click(button="right")
+        expect(page.get_by_role("article")).to_contain_text("ねぎたっぷり 塩牛丼")
+        page.get_by_role("article").locator("li").filter(has_text="糖尿病").locator("div").click()
+        expect(page.get_by_role("article")).to_contain_text("糖尿病")
+        page.get_by_role("link", name="高血圧", exact=True).click()
+        page.close()
+        context.close()
+    finally:
+        # Close the browser even when a step or assertion above fails.
+        browser.close()
+
+
+if __name__ == "__main__":
+    with sync_playwright() as playwright:
+        run(playwright)