728x90
반응형
데이터 과학 기반의 파이썬 빅데이터 분석
Chapter 05 파이썬 크롤링 - API 이용
01 네이버 API를 이용한 크롤링 실습
본문에 나온 코드(p138~140)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
|
import os
import sys
import urllib.request
import datetime
import time
import json
# Naver Open API credentials, sent as request headers by getRequestUrl().
# NOTE(review): the credentials are hard-coded in the published source;
# consider keeping real values out of the listing.
client_id = "as2X5XDSBGGmznFdHdwQ"
client_secret = "Ex0cYZ1Wkt"
#code1
# Request `url` with the X-Naver-Client-Id / X-Naver-Client-Secret headers and
# return the response body decoded as UTF-8 when the status code is 200.
# If the request raises, the exception and the URL are printed and None is
# returned.
# NOTE: the block indentation of this listing was lost when it was pasted;
# the statements are kept exactly as they appear.
def getRequestUrl(url):
req = urllib.request.Request(url)
req.add_header("X-Naver-Client-Id", client_id)
req.add_header("X-Naver-Client-Secret", client_secret)
try:
response = urllib.request.urlopen(req)
if response.getcode() == 200:
print("[%s] Url Request Success" % datetime.datetime.now())
return response.read().decode('utf-8')
# Any failure is reported on stdout instead of being re-raised.
except Exception as e:
print(e)
print("[%s] Error for URL : %s" % (datetime.datetime.now(), url))
return None
#code2
# Build the search URL for `node` (main() passes 'news'), fetch it with
# getRequestUrl() and return the parsed JSON, or None when the fetch
# returned None.
def getNaverSearch(node, srcText, start, display):
base = "https://openapi.naver.com/v1/search"
node = "/%s.json" % node
# NOTE: the spaces around '=' become part of the URL. The error log shown
# after this listing ("URL can't contain control characters ... found at
# least ' '") is caused by them.
parameters = "?query = %s&start = %s&display = %s" % (urllib.parse.quote(srcText), start, display)
url = base + node + parameters
responseDecode = getRequestUrl(url)
if (responseDecode == None):
return None
else:
return json.loads(responseDecode)
#code3
# Copy selected fields of one search result (`post`) into a new dict, tagged
# with the running number `cnt`, and append that dict to `jsonResult`.
def getPostData(post, jsonResult, cnt):
title = post['title']
description = post['description']
org_link = post['originallink']
link = post['link']
# The format string expects '+0900' directly after the seconds, with no
# space in between.
pDate = datetime.datetime.strptime(post['pubDate'], '%a, %d %b %Y %H:%M:%S+0900')
# Re-format the parsed date as 'YYYY-MM-DD HH:MM:SS'.
pDate = pDate.strftime('%Y-%m-%d %H:%M:%S')
# NOTE(review): the 'link' key is filled with org_link; the local `link`
# read above is never used - confirm the intent.
jsonResult.append({'cnt':cnt, 'title':title, 'description':description, 'org_link':org_link, 'link':org_link, 'pDate':pDate})
return
#code0
# Entry point: read a search word from the user, request Naver news results
# 100 at a time, collect them through getPostData() and write the list to
# '<search word>_naver_news.json'.
def main():
node = 'news'
srcText = input('검색어를 입력하세요 : ')
cnt = 0
jsonResult = []
jsonResponse = getNaverSearch(node, srcText, 1, 100)
# NOTE: jsonResponse is subscripted here, before the None check in the loop
# condition below. When the first request fails this line raises the
# TypeError shown in the traceback after this listing.
total = jsonResponse['total']
# Keep requesting until a request fails (None) or a page reports display == 0.
while ((jsonResponse != None) and (jsonResponse['display'] != 0)):
for post in jsonResponse['items']:
cnt += 1
getPostData(post, jsonResult, cnt)
# The next page starts right after the items just received.
start = jsonResponse['start'] + jsonResponse['display']
jsonResponse = getNaverSearch(node, srcText, start, 100)
print('전체 검색 : %d 건' %total)
# ensure_ascii = False writes non-ASCII text as-is instead of \u escapes.
with open('%s_naver_%s.json' % (srcText, node), 'w', encoding='utf8') as outfile:
jsonFile = json.dumps(jsonResult, indent=4, sort_keys = True, ensure_ascii = False)
outfile.write(jsonFile)
print("가져온 데이터 : %d 건" %(cnt))
print('%s_naver_%s.json SAVED' % (srcText, node))
# Run main() only when the file is executed as a script.
if __name__ == '__main__':
main()
|
cs |
하지만 실행해 보니 아래와 같이 URL에 공백이 포함되어 있다는 오류가 발생했습니다.
1
2
3
4
5
6
7
8
9
10
|
검색어를 입력하세요 : 월드컵
URL can't contain control characters. '/v1/search/news.json?query = %EC%9B%94%EB%93%9C%EC%BB%B5&start = 1&display = 100' (found at least ' ')
[2021-03-28 21:01:44.281394] Error for URL : https://openapi.naver.com/v1/search/news.json?query = %EC%9B%94%EB%93%9C%EC%BB%B5&start = 1&display = 100
Traceback (most recent call last):
File "C:/My_Python/nvCrawler.py", line 81, in <module>
main()
File "C:/My_Python/nvCrawler.py", line 60, in main
total = jsonResponse['total']
TypeError: 'NoneType' object is not subscriptable
>>>
|
cs |
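그래서 ericnjennifer.github.io/python_crawling/2018/01/21/PythonCrawling_Chapt9.html 를 참고하여 코드를 수정한 후 해결했습니다. 핵심 수정 사항은 쿼리 문자열에서 `=` 앞뒤의 공백을 없앤 것(`?query=%s&start=%s&display=%s`)과 pubDate 형식 문자열에서 `+0900` 앞에 공백을 넣은 것(`%H:%M:%S +0900`)입니다.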
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
|
import os
import sys
import urllib.request
import datetime
import time
import json
from config import *
# Naver Open API credentials, sent as request headers by get_request_url().
# NOTE(review): these assignments come after `from config import *`, so they
# rebind any client_id / client_secret that config may define - confirm which
# source is intended. The values are also hard-coded in the published source.
client_id = "as2X5XDSBGGmznFdHdwQ"
client_secret = "Ex0cYZ1Wkt"
#[CODE 1]
def get_request_url(url):
    """Fetch *url* from the Naver Open API and return the body as text.

    The module-level ``client_id`` and ``client_secret`` are sent as the
    ``X-Naver-Client-Id`` and ``X-Naver-Client-Secret`` request headers.

    Args:
        url: Fully built request URL.

    Returns:
        The response body decoded as UTF-8 when the status code is 200,
        otherwise ``None``. If the request raises, the exception and the
        URL are printed and ``None`` is returned.
    """
    req = urllib.request.Request(url)
    req.add_header("X-Naver-Client-Id", client_id)
    req.add_header("X-Naver-Client-Secret", client_secret)
    try:
        # The context manager closes the connection on every path, including
        # when read()/decode() fails; the original never closed the response.
        with urllib.request.urlopen(req) as response:
            if response.getcode() == 200:
                print("[%s] Url Request Success" % datetime.datetime.now())
                return response.read().decode('utf-8')
    except Exception as e:
        # Best-effort by design: report the failure and let the caller stop.
        print(e)
        print("[%s] Error for URL : %s" % (datetime.datetime.now(), url))
        return None
    # Reached only for a non-200 status that did not raise.
    return None
#[CODE 2]
def getNaverSearchResult(sNode, search_text, page_start, display):
    """Query one page of the Naver search API and return the parsed JSON.

    Args:
        sNode: Search category placed in the URL path (main() uses 'news').
        search_text: Search word; it is percent-encoded before use.
        page_start: Value sent as the ``start`` query parameter.
        display: Value sent as the ``display`` query parameter.

    Returns:
        The decoded JSON document, or ``None`` when get_request_url()
        returned ``None``.
    """
    # Import the submodule explicitly instead of relying on urllib.request
    # having loaded it as a side effect.
    import urllib.parse

    base = "https://openapi.naver.com/v1/search"
    node = "/%s.json" % sNode
    # No spaces around '=': urllib rejects URLs that contain spaces.
    parameters = "?query=%s&start=%s&display=%s" % (
        urllib.parse.quote(search_text), page_start, display)
    url = base + node + parameters

    retData = get_request_url(url)
    if retData is None:
        return None
    return json.loads(retData)
#[CODE 3]
def getPostData(post, jsonResult, cnt):
    """Append a normalized copy of one search result to *jsonResult*.

    Args:
        post: One item of the API response. It must provide the keys
            'title', 'description', 'originallink', 'link' and 'pubDate'.
        jsonResult: List that receives the new dict (mutated in place).
        cnt: Running sequence number stored under the 'cnt' key.

    Returns:
        None.

    Raises:
        KeyError: If *post* lacks one of the required keys.
        ValueError: If 'pubDate' does not match the expected date format.
    """
    title = post['title']
    description = post['description']
    org_link = post['originallink']
    link = post['link']

    # Example input: "Tue, 14 Feb 2017 18:46:00 +0900".
    # %z accepts any numeric UTC offset instead of only the literal "+0900".
    # The wall-clock time is kept as written; no timezone conversion is done.
    pDate = datetime.datetime.strptime(post['pubDate'], '%a, %d %b %Y %H:%M:%S %z')
    pDate = pDate.strftime('%Y-%m-%d %H:%M:%S')

    # 'link' now carries the item's own link; the original stored org_link
    # under both keys and never used `link`.
    jsonResult.append({
        'cnt': cnt,
        'title': title,
        'description': description,
        'org_link': org_link,
        'link': link,
        'pDate': pDate,
    })
    return
def main():
    """Collect Naver news search results for a user-supplied word.

    Reads the search word from stdin, requests result pages of 100 items
    until a request fails or a page reports ``display == 0``, and writes
    the collected items to '<search word>_naver_news.json'. The total
    reported by the API and the number of items collected are printed.
    """
    jsonResult = []
    # 'news', 'blog', 'cafearticle'
    sNode = 'news'
    search_text = input('검색어를 입력하세요 : ')
    cnt = 0
    display_count = 100

    jsonSearch = getNaverSearchResult(sNode, search_text, 1, display_count)
    if jsonSearch is None:
        # The first request failed; subscripting None here used to raise
        # "TypeError: 'NoneType' object is not subscriptable".
        print('검색 결과를 가져오지 못했습니다.')
        return
    total = jsonSearch['total']

    while jsonSearch is not None and jsonSearch['display'] != 0:
        for post in jsonSearch['items']:
            cnt += 1
            getPostData(post, jsonResult, cnt)
        # The next page starts right after the items just received.
        nStart = jsonSearch['start'] + jsonSearch['display']
        jsonSearch = getNaverSearchResult(sNode, search_text, nStart, display_count)

    with open('%s_naver_%s.json' % (search_text, sNode), 'w', encoding='utf8') as outfile:
        # ensure_ascii=False writes non-ASCII text as-is instead of \u escapes.
        retJson = json.dumps(jsonResult,
                             indent=4, sort_keys=True,
                             ensure_ascii=False)
        outfile.write(retJson)

    print('전체 검색 : %d 건' % total)
    print("가져온 데이터 : %d 건" % (cnt))
    print('%s_naver_%s.json SAVED' % (search_text, sNode))


if __name__ == '__main__':
    main()
|
cs |
아래와 같이 실행이 잘 되었습니다. 마지막에 보이는 HTTP Error 400은 start=1001 요청이 거부된 것으로, 1000건까지 가져온 뒤 반복이 종료되었습니다.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
|
검색어를 입력하세요 : 자전거
[2021-03-28 21:17:30.828224] Url Request Success
[2021-03-28 21:17:31.107964] Url Request Success
[2021-03-28 21:17:31.363961] Url Request Success
[2021-03-28 21:17:31.571144] Url Request Success
[2021-03-28 21:17:31.850184] Url Request Success
[2021-03-28 21:17:32.098026] Url Request Success
[2021-03-28 21:17:32.353766] Url Request Success
[2021-03-28 21:17:32.594701] Url Request Success
[2021-03-28 21:17:32.838343] Url Request Success
[2021-03-28 21:17:33.074711] Url Request Success
HTTP Error 400: Bad Request
[2021-03-28 21:17:33.206152] Error for URL : https://openapi.naver.com/v1/search/news.json?query=%EC%9E%90%EC%A0%84%EA%B1%B0&start=1001&display=100
전체 검색 : 726532 건
가져온 데이터 : 1000 건
자전거_naver_news.json SAVED
|
cs |
JSON 파일로 잘 저장된 것까지 확인했습니다.
728x90
반응형
'Language > Python' 카테고리의 다른 글
[Python] for in 반복문, range, enumerate (0) | 2022.01.27 |
---|---|
[Python] sys.stdin.readline() (0) | 2022.01.08 |
예제) 사각형 넓이 구하기 (0) | 2021.04.02 |
댓글