# Maker : L.T.
# Date : 2019.02.20.
# Description : Search Naver News for given keywords → extract the press name, article title, date, and URL
# import : requests, bs4
# How to install the imported packages in PyCharm : [File] - [Settings] - [Project: (projectname)] - [Project Interpreter] - "+"
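# Note: outside PyCharm, the same dependencies can typically be installed from a shell with
#   pip install requests beautifulsoup4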
### Module imports
import requests
from bs4 import BeautifulSoup
from time import localtime, strftime
import time
from datetime import date
### Date setup
# date_today
date_today = date.today()
date_today_year = str(date_today.year)
date_today_month = str(date_today.month)
date_today_day = str(date_today.day)
date_today = str(date_today)
# date_yesterday
date_yesterday = date.fromtimestamp(time.time() - 60 * 60 * 24)
date_yesterday_year = str(date_yesterday.year)
date_yesterday_month = str(date_yesterday.month)
date_yesterday_day = str(date_yesterday.day)
date_yesterday = str(date_yesterday)
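# Note: str() on a datetime.date yields the ISO form, e.g. "2019-02-20"; these strings are what the
# publication-date filter in the scraping loop compares against. A sketch of an equivalent (and
# arguably clearer) way to get yesterday, not used above:
#   from datetime import timedelta
#   date_yesterday = date.today() - timedelta(days=1)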
### URLs - Naver News search results (RSS)
newsurl = [
"http://newssearch.naver.com/search.naver?where=rss&query=%EC%A0%9C3%EC%9D%B8%ED%84%B0%EB%84%B7%EC%A0%84%EB%AC%B8%EC%9D%80%ED%96%89&field=0&nx_search_query=&nx_and_query=&nx_sub_query=&nx_search_hlquery=",
"http://newssearch.naver.com/search.naver?where=rss&query=%EC%95%84%EB%A7%88%EC%A1%B4&field=0&nx_search_query=&nx_and_query=&nx_sub_query=&nx_search_hlquery="
]
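# Each entry points at the Naver News RSS search endpoint; the "query" parameter carries the
# URL-encoded search keyword (index 0: 제3인터넷전문은행, index 1: 아마존, matching the labels printed
# in the loop below). For reference, the encoding can be reproduced with the standard library:
#   from urllib.parse import quote
#   quote("아마존")  # -> '%EC%95%84%EB%A7%88%EC%A1%B4', the value used in newsurl[1]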
### Request header (agent) configuration
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"accept-charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
"accept-encoding": "gzip, deflate, br",
"accept-language": "en-US,en;q=0.5",
}
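# A browser-like User-Agent (plus typical Accept headers) is sent because some endpoints respond
# differently, or not at all, to the default python-requests agent; nothing below depends on the
# exact header values.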
### Output file location and name
dir_path = ""
filetime = strftime("%Y-%m-%d_%H-%M-%S", localtime())
filename = "NEWS_" + filetime
savefile = open(dir_path + filename, mode="w", encoding="utf-8")
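# With the strftime pattern above, the output file ends up named like "NEWS_2019-02-20_14-05-30"
# (no extension) and, since dir_path is empty, is created in the current working directory.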
print("* * * * * * * * * * * * * * * * * ", filename, " * * * * * * * * * * * * * * * * *\n")
### Scraping
print("☆ NEWS")
savefile.write("☆ NEWS\n")
news_rep_arr = []
news_rep_cnt = 0
newscnt = 0
try:
    while 0 <= newscnt < len(newsurl):
        # send url + header
        r = requests.get(newsurl[newscnt], headers=headers)
        if r.status_code != 200:
            # skip this feed on a non-200 response; advancing newscnt avoids an endless retry loop
            newscnt += 1
            continue
        r.encoding = "utf-8"
        if newscnt == 0:
            news_title = " [NAVER news - 제3 인터넷 전문 은행]"
        elif newscnt == 1:
            news_title = " [NAVER news - 아마존]"
        else:
            break
        print(news_title, end="")
        savefile.write("\n" + news_title + "\n")
        # analysis
        soup = BeautifulSoup(r.text, "html.parser")
        ### Extract press name, article title, date, and URL
        # separate tags: one <item> element per article
        contenthtml = soup.find_all("item")
        incnt = 0
        totalcnt = 0
        while incnt < len(contenthtml):
            contentstr = str(contenthtml[incnt])
            # html.parser treats <link> as a void tag, so the article URL ends up as loose text after
            # "<link/>"; rewrite that span into "<a>...URL...</a>" so it can be read back further down.
            contentstr = contentstr.replace("link/", "a")
            contentstr = contentstr.replace(" <description>", "</a> <description>")
            contentstr = BeautifulSoup(contentstr, "html.parser")
            incnt += 1
            ### Extract the press name
            authorstr = contentstr.find_all("author")
            author = authorstr[0].get_text()
            author = "[" + author + "]"
            ### Extract the article title
            titlestr = contentstr.find_all("title")
            title = titlestr[0].get_text()
            # unescape HTML entities that may remain in the title text
            title = title.replace("&quot;", "\"")
            title = title.replace("&apos;", "'")
            ### Extract the publication date and reformat it
            # pubDate arrives in RFC 822 form, e.g. "Wed, 20 Feb 2019 09:00:00 +0900", and is
            # rebuilt as "2019-02-20 09:00:00" so it can be compared with date_today / date_yesterday
            datestr = contentstr.find_all("pubdate")
            pubdate = datestr[0].get_text()
            pubdate = pubdate.replace(",", "")
            pubdate = pubdate.replace("Jan", "01")
            pubdate = pubdate.replace("Feb", "02")
            pubdate = pubdate.replace("Mar", "03")
            pubdate = pubdate.replace("Apr", "04")
            pubdate = pubdate.replace("May", "05")
            pubdate = pubdate.replace("Jun", "06")
            pubdate = pubdate.replace("Jul", "07")
            pubdate = pubdate.replace("Aug", "08")
            pubdate = pubdate.replace("Sep", "09")
            pubdate = pubdate.replace("Oct", "10")
            pubdate = pubdate.replace("Nov", "11")
            pubdate = pubdate.replace("Dec", "12")
            date_split = pubdate.split(" ")
            date_temp = date_split[3], date_split[2], date_split[1]
            pubdate = "-".join(date_temp)
            pubdate = pubdate + " " + date_split[4]
            # keep only articles published today or yesterday
            if not (date_today in pubdate) and not (date_yesterday in pubdate) and (pubdate != ""):
                continue
            ### Extract the article URL (from the <a> element rewritten above)
            urlstr = contentstr.find_all("a")
            url = urlstr[0].get_text()
            ### Reassemble the extracted fields into one output record
            temp = " ", author, title, "\t|", pubdate, "\n\t\t\t\t", url
            output = " ".join(temp)
            ### Write the record to the file, skipping any record already written (duplicate filter)
            for i in range(0, news_rep_cnt + 1):
                if news_rep_cnt == 0:
                    # first record overall: always write it
                    news_rep_arr.append(output)
                    savefile.write(output + "\n")
                    news_rep_cnt += 1
                    totalcnt += 1
                elif news_rep_arr[i] == output:
                    # duplicate of an earlier record: skip it
                    break
                elif (news_rep_arr[i] != output) and (i + 1 == news_rep_cnt):
                    # compared against every stored record without a match: write it
                    news_rep_arr.append(output)
                    savefile.write(output + "\n")
                    news_rep_cnt += 1
                    totalcnt += 1
                    break
print(" : ", totalcnt)
newscnt += 1
except:
print("ERROR")
print(("\n**************************************************************************************************"))
savefile.close()
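# For reference, each record written to the output file looks roughly like this
# (press name, title, date, and URL are illustrative placeholders only):
#     [PressName] Article title  <TAB>| 2019-02-20 09:00:00
#     <TAB x4> http://news.example.com/...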