Notes - What is notes.io?

Notes brand slogan

Notes - notes.io

import os
import time
import openpyxl

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait


# =====================================
# Avoid file name collisions
# =====================================

def get_unique_filename(base_name):
    """Return a path derived from ``base_name`` that does not exist yet.

    If ``base_name`` itself is free it is returned unchanged. Otherwise a
    numeric suffix (``_1``, ``_2``, ...) is inserted before the extension and
    the first candidate that does not exist on disk is returned.

    Args:
        base_name: Desired file path, e.g. ``"products.xlsx"``.

    Returns:
        ``base_name`` or the first free ``<stem>_<n><ext>`` variant.
    """
    if not os.path.exists(base_name):
        return base_name

    name, ext = os.path.splitext(base_name)

    num = 1
    while True:
        new_name = f"{name}_{num}{ext}"

        if not os.path.exists(new_name):
            return new_name

        num += 1


# =====================================
# Detail page information extraction
# =====================================

# Section markers looked for in the description text. Each marker is matched
# as a case-insensitive substring of a line.
_MAIN_MARKERS = (
    "[본품]",
    "[내장]",  # "built-in" items are collected together with the main product
)

_GIFT_MARKERS = (
    "[추가증정]",
    "[추가구성]",
    "[증정]",
    "[사은품]",
    "[gift]",
)

_END_MARKERS = (
    "[상품상세]",
    "[상세정보]",
    "[상품정보]",
    "상품정보 제공고시",
    "※ 제품 이미지는",
    "※ 구성품은",
    "※ 사은품은",
    "교환 및 반품",
    "배송안내",
)


def _first_text(driver, by, selector):
    """Return the stripped text of the first matching element, or ``""``."""
    try:
        return driver.find_element(by, selector).text.strip()
    except Exception:
        # Best effort: a missing or unreadable element just yields no value.
        return ""


def _contains_any(lowered_line, markers):
    """Return True if any marker occurs (case-insensitively) in the line."""
    return any(marker.lower() in lowered_line for marker in markers)


def _parse_description(text):
    """Split description text into ``(main_items, gift_items)`` lists.

    The text is scanned line by line. A line containing a main/gift marker
    switches the collection mode and is itself discarded; a line containing
    an end marker stops the scan. While a mode is active, every non-blank
    line longer than one character is collected. Duplicates are removed
    while keeping first-seen order.
    """
    main_items = []
    gift_items = []
    mode = None

    # splitlines() splits on real line breaks; the previous split("n") cut
    # the text at every letter "n" instead.
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        lowered = line.lower()

        if _contains_any(lowered, _MAIN_MARKERS):
            mode = "main"
            continue

        if _contains_any(lowered, _GIFT_MARKERS):
            mode = "gift"
            continue

        if _contains_any(lowered, _END_MARKERS):
            break

        if len(line) <= 1:
            continue

        if mode == "main":
            main_items.append(line)
        elif mode == "gift":
            gift_items.append(line)

    # dict.fromkeys keeps insertion order, so this de-duplicates stably.
    return list(dict.fromkeys(main_items)), list(dict.fromkeys(gift_items))


def extract_ssg_info(driver):
    """Extract price, coupon and bundle information from the loaded page.

    Reads the sale price and coupon text from the top-level document, then
    enters the ``_ifr_html`` iframe and parses the ``descContents`` element
    (falling back to ``<body>``) for main-product and gift lines.

    Args:
        driver: Selenium WebDriver positioned on a product detail page.

    Returns:
        A 4-tuple ``(sale_price, coupon_info, main_product, gift_product)``
        of strings. Any value that cannot be read is ``""``; the item
        strings are joined with ``" + "``. The function does not raise for
        ordinary errors: they are printed and empty item strings returned.
    """
    sale_price = ""
    coupon_info = ""

    try:
        driver.switch_to.default_content()

        # Sale price
        sale_price = _first_text(
            driver, By.CSS_SELECTOR, ".cdtl_new_price em.ssg_price"
        )

        # Coupon
        coupon_info = _first_text(
            driver, By.CSS_SELECTOR, ".cdtl_benefit_coupon p.txt"
        )

        # Enter the description iframe; without it there is nothing to parse.
        try:
            iframe = driver.find_element(By.ID, "_ifr_html")
            driver.switch_to.frame(iframe)
        except Exception:
            return (sale_price, coupon_info, "", "")

        # Description text: prefer #descContents, fall back to the whole body.
        try:
            text = driver.find_element(By.ID, "descContents").text
        except Exception:
            try:
                text = driver.find_element(By.TAG_NAME, "body").text
            except Exception:
                text = ""

        main_items, gift_items = _parse_description(text)

        # Leave the iframe so the caller continues in the top-level document.
        driver.switch_to.default_content()

        return (
            sale_price,
            coupon_info,
            " + ".join(main_items),
            " + ".join(gift_items),
        )

    except Exception as e:
        print("상세 추출 실패:", e)

        try:
            driver.switch_to.default_content()
        except Exception:
            pass

        return (sale_price, coupon_info, "", "")

# =====================================
# URL input
# =====================================

# Ask for the listing page to scrape.
url = input(
    "목록 페이지 URL 입력 : "
).strip()

# Prepend a scheme only when none is present. The check is case-insensitive
# and requires the full "http://" / "https://" prefix, so a host name that
# merely starts with "http" (e.g. "httpbin.org") still gets a scheme.
if not url.lower().startswith(("http://", "https://")):
    url = "https://" + url


# =====================================
# Launch Chrome
# =====================================

# Build the Chrome options: disable the "AutomationControlled" Blink feature,
# drop the "enable-automation" switch and turn off the automation extension.
options = webdriver.ChromeOptions()
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option("useAutomationExtension", False)

# The driver binary is looked up by this relative path.
chrome_service = Service("chromedriver.exe")
driver = webdriver.Chrome(service=chrome_service, options=options)

driver.get(url)

# Let the user adjust sorting/filters in the browser, then press Enter.
separator = "=" * 60
print()
print(separator)
print("정렬 / 필터 설정 후 엔터")
print(separator)

input()

# Short pause after the user confirms, before the page is read.
time.sleep(2)


# =====================================
# Product collection (generic version)
# =====================================

# (href, product name) pairs found on the listing page; may contain duplicates.
products = []

print()
print("=" * 50)
print("상품명 CSS 선택자 입력")
print("예시")
print(".css-1r5pe50")
print(".mnsditem_goods_tit")
print(".ssgitem_tit_name")
print("=" * 50)

product_selector = input(
    "상품명 선택자 : "
).strip()

# Fall back to the default selector when the user just presses Enter.
if not product_selector:
    product_selector = ".css-1r5pe50"

links = driver.find_elements(By.TAG_NAME, "a")

# First pass: every anchor that contains an element matching the selector.
for link in links:
    try:
        href = link.get_attribute("href")

        # Skip anchors without a target and "javascript:" pseudo-links.
        if not href or href.startswith("javascript"):
            continue

        names = link.find_elements(By.CSS_SELECTOR, product_selector)
        if not names:
            continue

        product_name = names[0].text.strip()
        if not product_name:
            continue

        products.append((href, product_name))

    except Exception:
        # Best effort: an anchor that cannot be inspected is skipped.
        continue

# =====================================
# Extra collection (for when the product name itself is outside the <a>)
# =====================================

# Second pass: start from every element matching the selector and walk up
# the DOM looking for an ancestor that exposes an "href" attribute.
try:
    name_elements = driver.find_elements(By.CSS_SELECTOR, product_selector)
except Exception:
    # Best effort: if the lookup itself fails, this pass contributes nothing.
    name_elements = []

for elem in name_elements:
    try:
        product_name = elem.text.strip()
        if not product_name:
            continue

        parent = elem
        href = ""

        # Check the element itself, then up to nine ancestors.
        for _ in range(10):
            try:
                href = parent.get_attribute("href")
                if href:
                    break
            except Exception:
                pass

            try:
                parent = parent.find_element(By.XPATH, "..")
            except Exception:
                # No further parent could be resolved; stop climbing.
                break

        # Skip names without a link target and "javascript:" pseudo-links.
        if not href or href.startswith("javascript"):
            continue

        products.append((href, product_name))

    except Exception:
        # Best effort: an element that cannot be inspected is skipped.
        continue

# =====================================
# Duplicate removal
# =====================================

# Both collection passes can report the same product, so keep only the first
# occurrence of each (href, name) pair. Comparison uses whitespace-stripped
# values; the stored pair is kept as collected.
seen = set()
unique_products = []

for href, name in products:
    key = (href.strip(), name.strip())

    if key in seen:
        continue

    seen.add(key)
    unique_products.append((href, name))

print()
print(
    f"총 {len(unique_products)}개 발견"
)
print()
# =====================================
# Detail collection
# =====================================

# One row per product: [url, name, sale price, coupon, main product, gift].
results = []

# Remember the listing tab so cleanup can never close it by mistake, even
# when an error happens before the detail tab was opened or focused.
main_window = driver.current_window_handle

for idx, (product_url, product_name) in enumerate(unique_products, start=1):
    try:
        print(f"[{idx}/{len(unique_products)}] {product_name}")

        # Open the detail page in a new tab and focus it.
        driver.execute_script("window.open(arguments[0]);", product_url)
        driver.switch_to.window(driver.window_handles[-1])

        # Anti-bot pause before touching the page.
        time.sleep(5)

        WebDriverWait(driver, 20).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )

        sale_price = ""
        coupon_info = ""
        main_product = ""
        gift_product = ""

        # Retry for up to ~10 seconds until at least one field is populated.
        for _ in range(10):
            (
                sale_price,
                coupon_info,
                main_product,
                gift_product,
            ) = extract_ssg_info(driver)

            if sale_price or coupon_info or main_product or gift_product:
                break

            time.sleep(1)

        print("판매가 :", sale_price)

        if coupon_info:
            print("쿠폰 :", coupon_info[:60])

        if main_product:
            print("본품 :", main_product[:60])

        if gift_product:
            print("추가증정 :", gift_product[:60])

        results.append(
            [
                product_url,
                product_name,
                sale_price,
                coupon_info,
                main_product,
                gift_product,
            ]
        )

    except Exception as e:
        print("오류:", e)

    finally:
        # Close every tab except the listing tab, then return to it. This
        # runs after success and failure alike.
        try:
            for handle in driver.window_handles:
                if handle != main_window:
                    driver.switch_to.window(handle)
                    driver.close()
            driver.switch_to.window(main_window)
        except Exception as cleanup_error:
            print("오류:", cleanup_error)


# =====================================
# Excel export
# =====================================

wb = openpyxl.Workbook()
ws = wb.active
ws.title = "Products"

headers = [
    "URL",
    "상품명",
    "판매가",
    "쿠폰정보",
    "본품",
    "추가증정",
]

# On a fresh sheet append() writes the first free row, so the header lands
# in row 1 and each result row follows in order.
ws.append(headers)

for item in results:
    ws.append(item)

# Column widths per field.
column_widths = {
    "A": 90,
    "B": 50,
    "C": 15,
    "D": 80,
    "E": 80,
    "F": 120,
}

for column_letter, width in column_widths.items():
    ws.column_dimensions[column_letter].width = width

# Never overwrite an earlier export.
filename = get_unique_filename(
    "products.xlsx"
)

try:
    wb.save(filename)

    print()
    print("=" * 60)
    print("저장 완료")
    print(filename)
    print("=" * 60)
finally:
    # Shut the browser down even if saving failed.
    driver.quit()

# Keep the console window open until the user confirms. The leading "\n"
# restores the line break that had degraded to a literal "n".
input(
    "\n엔터 누르면 종료..."
)
     
 
what is notes.io
 

Notes is a web-based application for taking notes online. You can take notes and share them with other people. If you like taking long notes, notes.io is designed for you. To date, over 8,000,000,000 notes have been created, and the number keeps growing...

With notes.io;

  • * You can take a note from anywhere and any device with internet connection.
  • * You can share the notes in social platforms (YouTube, Facebook, Twitter, instagram etc.).
  • * You can quickly share your contents without website, blog and e-mail.
  • * You don't need to create an account to share a note. You can send quick, easy, shortened note links via SMS, websites, e-mail, or messaging services (WhatsApp, iMessage, Telegram, Signal).
  • * Notes.io has fabulous infrastructure design for a short link and allows you to share the note as an easy and understandable link.

Fast: Notes.io is built for speed and performance. You can take notes quickly and browse your archive.

Easy: Notes.io doesn’t require installation. Just write and share note!

Short: A Notes.io URL is just 8 characters long. You'll get a shortened link to your note when you want to share it. (Ex: notes.io/q )

Free: Notes.io has been running for 14 years and has been free since the day it started.


You immediately create your first note and start sharing with the ones you wish. If you want to contact us, you can use the following communication channels;


Email: [email protected]

Twitter: http://twitter.com/notesio

Instagram: http://instagram.com/notes.io

Facebook: http://facebook.com/notesio



Regards;
Notes.io Team

     
 
Shortened Note Link
 
 
Loading Image
 
     
 
Long File
 
 

Notes larger than 18KB cannot be shortened.

To get below 18KB, please trim your note, or sign in.