Notes
Notes - notes.io |
import os
import time

import openpyxl
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
# =====================================
# 파일명 중복 방지
# =====================================
def get_unique_filename(base_name: str) -> str:
    """Return a file path that does not collide with an existing file.

    Args:
        base_name: Desired file path, e.g. ``"products.xlsx"``.

    Returns:
        ``base_name`` unchanged when nothing exists at that path; otherwise
        the first ``{stem}_{n}{ext}`` variant (``n`` counting up from 1) for
        which ``os.path.exists`` reports no existing entry.
    """
    if not os.path.exists(base_name):
        return base_name

    stem, ext = os.path.splitext(base_name)
    num = 1
    candidate = f"{stem}_{num}{ext}"
    # Keep bumping the numeric suffix until an unused name is found.
    while os.path.exists(candidate):
        num += 1
        candidate = f"{stem}_{num}{ext}"
    return candidate
# =====================================
# 상세페이지 정보 추출
# =====================================
# Section markers searched for (as substrings) in each lower-cased line of the
# description text. They must therefore be written in lower case here.
_MAIN_MARKERS = (
    "[본품]",
    "[내장]",  # "[내장]" lines are treated as the start of a main-product section too
)
_GIFT_MARKERS = (
    "[추가증정]",
    "[추가구성]",
    "[증정]",
    "[사은품]",
    "[gift]",
)
_END_MARKERS = (
    "[상품상세]",
    "[상세정보]",
    "[상품정보]",
    "상품정보 제공고시",
    "※ 제품 이미지는",
    "※ 구성품은",
    "※ 사은품은",
    "교환 및 반품",
    "배송안내",
)


def _contains_any(lowered_line, markers):
    """Return True when any marker occurs as a substring of the line."""
    return any(marker in lowered_line for marker in markers)


def _element_text(driver, by, value):
    """Return the ``.text`` of the first matching element, or None on failure."""
    try:
        return driver.find_element(by, value).text
    except Exception:
        # Best effort: a missing or unreadable element is an expected case.
        return None


def _parse_ssg_description(text):
    """Split description text into (main_items, gift_items) lists.

    Lines are stripped and blank lines skipped. A line containing a main or
    gift marker switches the collecting section (the marker line itself is
    not collected); a line containing an end marker stops parsing. Collected
    lines must be longer than one character; duplicates are dropped while
    keeping first-seen order.
    """
    main_items = []
    gift_items = []
    target = None  # list currently being filled, or None outside any section

    # Split on real line breaks (the previous code split on the letter "n").
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        lowered = line.lower()

        # Marker priority is main, then gift, then end.
        if _contains_any(lowered, _MAIN_MARKERS):
            target = main_items
        elif _contains_any(lowered, _GIFT_MARKERS):
            target = gift_items
        elif _contains_any(lowered, _END_MARKERS):
            break
        elif target is not None and len(line) > 1:
            target.append(line)

    return list(dict.fromkeys(main_items)), list(dict.fromkeys(gift_items))


def extract_ssg_info(driver):
    """Extract price, coupon and product-composition text from a detail page.

    Reads the sale price and coupon text from the top-level document, then
    enters the ``_ifr_html`` iframe and parses the text of ``#descContents``
    (falling back to ``<body>`` when that element cannot be found) into
    main-product and gift sections.

    Args:
        driver: Selenium WebDriver currently showing the product detail page.

    Returns:
        A 4-tuple of strings ``(sale_price, coupon_info, main_product,
        gift_product)``. Any value that could not be extracted is ``""``.
        Main and gift items are joined with ``" + "``. The driver is switched
        back to the top-level document before returning (best effort).
    """
    sale_price = ""
    coupon_info = ""
    main_product = ""
    gift_product = ""

    try:
        driver.switch_to.default_content()

        # Sale price
        sale_price = (
            _element_text(driver, By.CSS_SELECTOR, ".cdtl_new_price em.ssg_price")
            or ""
        ).strip()

        # Coupon
        coupon_info = (
            _element_text(driver, By.CSS_SELECTOR, ".cdtl_benefit_coupon p.txt")
            or ""
        ).strip()

        # Enter the description iframe; without it there is nothing to parse.
        try:
            driver.switch_to.frame(driver.find_element(By.ID, "_ifr_html"))
        except Exception:
            return (sale_price, coupon_info, "", "")

        # Description text: prefer #descContents, fall back to the whole body.
        text = _element_text(driver, By.ID, "descContents")
        if text is None:
            text = _element_text(driver, By.TAG_NAME, "body") or ""

        main_items, gift_items = _parse_ssg_description(text)
        main_product = " + ".join(main_items)
        gift_product = " + ".join(gift_items)
    except Exception as e:
        print("상세 추출 실패:", e)
    finally:
        # Always leave the driver on the top-level document, even on failure.
        try:
            driver.switch_to.default_content()
        except Exception:
            pass  # window may already be gone; nothing more to do here

    return (sale_price, coupon_info, main_product, gift_product)
# =====================================
# URL 입력
# =====================================
# Ask for the listing-page URL; prepend "https://" when the typed value does
# not already start with "http".
url = input("목록 페이지 URL 입력 : ").strip()
url = url if url.startswith("http") else "https://" + url
# =====================================
# 크롬 실행
# =====================================
# Configure Chrome. The flags below switch off automation-related features
# (presumably so the session looks less like an automated one).
options = webdriver.ChromeOptions()
options.add_argument("--disable-blink-features=AutomationControlled")
for option_name, option_value in (
    ("excludeSwitches", ["enable-automation"]),
    ("useAutomationExtension", False),
):
    options.add_experimental_option(option_name, option_value)

# chromedriver.exe is resolved relative to the current working directory.
driver = webdriver.Chrome(
    service=Service("chromedriver.exe"),
    options=options,
)
driver.get(url)

# Pause until the operator has set sorting / filters by hand and pressed Enter.
banner = "=" * 60
print()
print(banner)
print("정렬 / 필터 설정 후 엔터")
print(banner)
input()
time.sleep(2)
# =====================================
# 상품 수집 (범용 버전)
# =====================================
# Collected (href, product_name) pairs; may contain duplicates at this stage.
products = []

print()
print("=" * 50)
print("상품명 CSS 선택자 입력")
print("예시")
print(".css-1r5pe50")
print(".mnsditem_goods_tit")
print(".ssgitem_tit_name")
print("=" * 50)

# An empty answer falls back to the default selector.
product_selector = input("상품명 선택자 : ").strip() or ".css-1r5pe50"

# First pass: anchors that contain a product-name element inside them.
for link in driver.find_elements(By.TAG_NAME, "a"):
    try:
        href = link.get_attribute("href")
        if not href or href.startswith("javascript"):
            continue

        names = link.find_elements(By.CSS_SELECTOR, product_selector)
        if not names:
            continue

        product_name = names[0].text.strip()
        if product_name:
            products.append((href, product_name))
    except Exception:
        # Best effort: skip anchors that cannot be read (e.g. the element went
        # stale). Catching Exception rather than using a bare except keeps
        # Ctrl+C working during the scan.
        continue
# =====================================
# 추가 수집 (상품명 자체가 a 밖에 있는 경우)
# =====================================
# Second pass: product-name elements that are not inside the anchor itself.
# For each one, walk up to 10 levels of ancestors looking for an element
# that exposes an "href" attribute.
try:
    name_elements = driver.find_elements(By.CSS_SELECTOR, product_selector)
except Exception as e:
    # Surface the failure (e.g. an invalid selector) instead of hiding it.
    print("오류:", e)
    name_elements = []

for elem in name_elements:
    try:
        product_name = elem.text.strip()
        if not product_name:
            continue

        node = elem
        href = ""
        for _ in range(10):
            try:
                href = node.get_attribute("href")
            except Exception:
                href = ""
            if href:
                break
            try:
                node = node.find_element(By.XPATH, "..")
            except Exception:
                # No parent to climb to.
                break

        if not href or href.startswith("javascript"):
            continue

        products.append((href, product_name))
    except Exception:
        # Best effort: skip elements that cannot be read; Ctrl+C still works.
        continue
# =====================================
# 중복 제거
# =====================================
# Drop duplicate (href, name) pairs. Pairs are compared after stripping
# whitespace, the first occurrence wins, and the original order is kept.
first_seen = {}
for href, name in products:
    first_seen.setdefault((href.strip(), name.strip()), (href, name))
unique_products = list(first_seen.values())

print()
print(f"총 {len(unique_products)}개 발견")
print()
# =====================================
# 상세 수집
# =====================================
# One row per successfully scraped product:
# [url, name, sale_price, coupon_info, main_product, gift_product]
results = []

# Handle of the listing window. Each product opens in a new tab, and only
# that tab may be closed afterwards -- never the listing window itself.
main_window = driver.current_window_handle
total = len(unique_products)

for idx, (product_url, product_name) in enumerate(unique_products, start=1):
    try:
        print(f"[{idx}/{total}] {product_name}")

        driver.execute_script("window.open(arguments[0]);", product_url)
        driver.switch_to.window(driver.window_handles[-1])

        # Fixed pause to avoid tripping bot detection.
        time.sleep(5)
        WebDriverWait(driver, 20).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )

        sale_price = ""
        coupon_info = ""
        main_product = ""
        gift_product = ""

        # Retry up to 10 times, one second apart, until any field is non-empty.
        for _ in range(10):
            (
                sale_price,
                coupon_info,
                main_product,
                gift_product,
            ) = extract_ssg_info(driver)
            if sale_price or coupon_info or main_product or gift_product:
                break
            time.sleep(1)

        print("판매가 :", sale_price)
        if coupon_info:
            print("쿠폰 :", coupon_info[:60])
        if main_product:
            print("본품 :", main_product[:60])
        if gift_product:
            print("추가증정 :", gift_product[:60])

        results.append(
            [
                product_url,
                product_name,
                sale_price,
                coupon_info,
                main_product,
                gift_product,
            ]
        )
    except Exception as e:
        # A failed product is reported and skipped; the run continues.
        print("오류:", e)
    finally:
        # Close the product tab only if one is actually current. If the
        # failure happened before the tab was opened, the listing window is
        # still current and must stay open.
        try:
            if driver.current_window_handle != main_window:
                driver.close()
        except Exception:
            pass  # tab already gone
        try:
            driver.switch_to.window(main_window)
        except Exception as e:
            print("오류:", e)
# =====================================
# 엑셀 저장
# =====================================
# Write the collected rows to a new workbook: header in row 1, data below.
wb = openpyxl.Workbook()
ws = wb.active
ws.title = "Products"

headers = [
    "URL",
    "상품명",
    "판매가",
    "쿠폰정보",
    "본품",
    "추가증정",
]
ws.append(headers)
for item in results:
    ws.append(item)

# Fixed column widths for columns A-F.
for column_letter, width in (
    ("A", 90),
    ("B", 50),
    ("C", 15),
    ("D", 80),
    ("E", 80),
    ("F", 120),
):
    ws.column_dimensions[column_letter].width = width

# Never overwrite an earlier export: pick products.xlsx, products_1.xlsx, ...
filename = get_unique_filename("products.xlsx")

try:
    wb.save(filename)

    print()
    print("=" * 60)
    print("저장 완료")
    print(filename)
    print("=" * 60)
finally:
    # Shut the browser down even when saving fails.
    driver.quit()

# Keep the console window open until the operator presses Enter.
input("\n엔터 누르면 종료...")
![]() |
Notes.io is a web-based application for taking notes online. You can take notes and share them with other people. If you like taking long notes, notes.io is designed for you. To date, over 8,000,000,000 notes have been created, and the number keeps growing...
With notes.io;
- * You can take a note from anywhere and any device with internet connection.
- * You can share the notes in social platforms (YouTube, Facebook, Twitter, instagram etc.).
- * You can quickly share your contents without website, blog and e-mail.
- * You don't need to create an account to share a note. You can share quick, easy, shortened notes however you wish: by SMS, websites, e-mail, or messaging services (WhatsApp, iMessage, Telegram, Signal).
- * Notes.io has fabulous infrastructure design for a short link and allows you to share the note as an easy and understandable link.
Fast: Notes.io is built for speed and performance. You can take notes quickly and browse your archive.
Easy: Notes.io doesn’t require installation. Just write and share note!
Short: A Notes.io URL is just 8 characters long. You’ll get a shortened link to your note when you want to share it. (Ex: notes.io/q )
Free: Notes.io has been running for 14 years and has been free since the day it started.
You can create your first note immediately and start sharing it with anyone you wish. If you want to contact us, you can use the following communication channels:
Email: [email protected]
Twitter: http://twitter.com/notesio
Instagram: http://instagram.com/notes.io
Facebook: http://facebook.com/notesio
Regards;
Notes.io Team
