JSON¶

In [329]:

d = {"name": "혼자 공부하는 데이터 분석"}
print(d['name'])

혼자 공부하는 데이터 분석

In [331]:

import json

In [333]:

d_str = json.dumps(d, ensure_ascii=False)
print(d_str)

{"name": "혼자 공부하는 데이터 분석"}

In [335]:

print(type(d_str))

<class 'str'>

In [337]:

d2 = json.loads(d_str)
print(d2['name'])

혼자 공부하는 데이터 분석

In [339]:

print(type(d2))

<class 'dict'>

In [341]:

d3 = json.loads('{"name": "혼자 공부하는 데이터 분석", "author": "박해선", "year": 2022}')
print(d3['name'])
print(d3['author'])
print(d3['year'])

혼자 공부하는 데이터 분석
박해선
2022

In [343]:

d3 = json.loads('{"name": "혼자 공부하는 데이터 분석", "author": ["박해선", "홍길동"], "year": 2022}')
print(d3['author'][1])

홍길동

In [345]:

d4_str = """
[
    {"name": "혼자 공부하는 데이터 분석", "author": "박해선", "year": 2022},
    {"name": "혼자 공부하는 머신러닝+딥러닝", "author": "박해선", "year": 2020}
]
"""
d4 = json.loads(d4_str)
print(d4[1]['name'])

혼자 공부하는 머신러닝+딥러닝

In [347]:

import pandas as pd
pd.read_json(d4_str)

/var/folders/3w/0y55k0y53pg1dvg3p_cqn81m0000gn/T/ipykernel_45399/1308304786.py:2: FutureWarning: Passing literal json to 'read_json' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.
  pd.read_json(d4_str)

Out[347]:

	name	author	year
0	혼자 공부하는 데이터 분석	박해선	2022
1	혼자 공부하는 머신러닝+딥러닝	박해선	2020

In [349]:

pd.DataFrame(d4)

Out[349]:

	name	author	year
0	혼자 공부하는 데이터 분석	박해선	2022
1	혼자 공부하는 머신러닝+딥러닝	박해선	2020

XML¶

In [352]:

x_str = """
<book>
    <name>혼자 공부하는 데이터 분석</name>
    <author>박해선</author>
    <year>2022</year>
</book>
"""

In [354]:

import xml.etree.ElementTree as et
book = et.fromstring(x_str)

In [356]:

print(type(book))

<class 'xml.etree.ElementTree.Element'>

In [358]:

print(book.tag)

book

In [360]:

book_childs = list(book)
print(book_childs)

[<Element 'name' at 0x14cc44310>, <Element 'author' at 0x14cc44f90>, <Element 'year' at 0x14cc456c0>]

In [362]:

name, author, year = book_childs
print(name.text)
print(author.text)
print(year.text)

혼자 공부하는 데이터 분석
박해선
2022

In [364]:

name = book.findtext('name')
author = book.findtext('author')
year = book.findtext('year')
print(name)
print(author)
print(year)

혼자 공부하는 데이터 분석
박해선
2022

In [366]:

x2_str = """
<books>
    <book>
        <name>혼자 공부하는 데이터 분석</name>
        <author>박해선</author>
        <year>2022</year>
    </book>
    <book>
        <name>혼자 공부하는 머신러닝+딥러닝</name>
        <author>박해선</author>
        <year>2020</year>
    </book>
</books>
"""

In [368]:

books = et.fromstring(x2_str)
print(books.tag)

books

In [370]:

for book in books.findall('book'):
    name = book.findtext('name')
    author = book.findtext('author')
    year = book.findtext('year')
    print(name)
    print(author)
    print(year)
    print()

혼자 공부하는 데이터 분석
박해선
2022

혼자 공부하는 머신러닝+딥러닝
박해선
2020

In [372]:

pd.read_xml(x2_str)

/var/folders/3w/0y55k0y53pg1dvg3p_cqn81m0000gn/T/ipykernel_45399/212699965.py:1: FutureWarning: Passing literal xml to 'read_xml' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.
  pd.read_xml(x2_str)

Out[372]:

	name	author	year
0	혼자 공부하는 데이터 분석	박해선	2022
1	혼자 공부하는 머신러닝+딥러닝	박해선	2020

In [374]:

import requests

In [376]:

url = "http://data4library.kr/api/loanItemSrch?authKey=bfc41484459a5d9a46fde73d6fccedc237fa137085a62954ed1287e50fd6130b&format=json&startDt=2021-04-01&endDt=2021-04-30&age=20"

In [378]:

r = requests.get(url)

In [379]:

data = r.json()
print(data)

{'response': {'error': 'API 활성화 상태가아닙니다.'}}

In [382]:

r.status_code

Out[382]:

HTML¶

In [386]:

import pandas as pd
books_df = pd.read_json('20s_best_book.json')
books_df.head()

Out[386]:

	no	ranking	bookname	authors	publisher	publication_year	isbn13	addition_symbol	class_no	class_nm	loan_count	bookImageURL
0	1	1	우리가 빛의 속도로 갈 수 없다면 :김초엽 소설	지은이: 김초엽	허블	2019	9791190090018	03810	813.7	문학 > 한국문학 > 소설	461	https://image.aladin.co.kr/product/19359/16/co...
1	2	2	달러구트 꿈 백화점.이미예 장편소설	지은이: 이미예	팩토리나인	2020	9791165341909	03810	813.7	문학 > 한국문학 > 소설	387	https://image.aladin.co.kr/product/24512/70/co...
2	3	3	지구에서 한아뿐 :정세랑 장편소설	지은이: 정세랑	난다	2019	9791188862290	03810	813.7	문학 > 한국문학 > 소설	383	https://image.aladin.co.kr/product/19804/82/co...
3	4	4	시선으로부터, :정세랑 장편소설	지은이: 정세랑	문학동네	2020	9788954672214	03810	813.7	문학 > 한국문학 > 소설	370	https://image.aladin.co.kr/product/24131/37/co...
4	5	5	아몬드 :손원평 장편소설	지은이: 손원평	창비	2017	9788936434267	03810	813.7	문학 > 한국문학 > 소설	365	http://image.aladin.co.kr/product/16839/4/cove...

In [388]:

books = books_df[['no','ranking','bookname','authors','publisher','publication_year','isbn13']]
books.head()

Out[388]:

	no	ranking	bookname	authors	publisher	publication_year	isbn13
0	1	1	우리가 빛의 속도로 갈 수 없다면 :김초엽 소설	지은이: 김초엽	허블	2019	9791190090018
1	2	2	달러구트 꿈 백화점.이미예 장편소설	지은이: 이미예	팩토리나인	2020	9791165341909
2	3	3	지구에서 한아뿐 :정세랑 장편소설	지은이: 정세랑	난다	2019	9791188862290
3	4	4	시선으로부터, :정세랑 장편소설	지은이: 정세랑	문학동네	2020	9788954672214
4	5	5	아몬드 :손원평 장편소설	지은이: 손원평	창비	2017	9788936434267

In [390]:

books_df.loc[[0,1],['bookname','authors']]

Out[390]:

	bookname	authors
0	우리가 빛의 속도로 갈 수 없다면 :김초엽 소설	지은이: 김초엽
1	달러구트 꿈 백화점.이미예 장편소설	지은이: 이미예

In [392]:

books_df.loc[0:1, 'bookname':'authors']

Out[392]:

	bookname	authors
0	우리가 빛의 속도로 갈 수 없다면 :김초엽 소설	지은이: 김초엽
1	달러구트 꿈 백화점.이미예 장편소설	지은이: 이미예

In [394]:

books = books_df.loc[:, 'no':'isbn13']
books.head()

Out[394]:

	no	ranking	bookname	authors	publisher	publication_year	isbn13
0	1	1	우리가 빛의 속도로 갈 수 없다면 :김초엽 소설	지은이: 김초엽	허블	2019	9791190090018
1	2	2	달러구트 꿈 백화점.이미예 장편소설	지은이: 이미예	팩토리나인	2020	9791165341909
2	3	3	지구에서 한아뿐 :정세랑 장편소설	지은이: 정세랑	난다	2019	9791188862290
3	4	4	시선으로부터, :정세랑 장편소설	지은이: 정세랑	문학동네	2020	9788954672214
4	5	5	아몬드 :손원평 장편소설	지은이: 손원평	창비	2017	9788936434267

In [396]:

books_df.loc[::2, 'no':'isbn13'].head()

Out[396]:

	no	ranking	bookname	authors	publisher	publication_year	isbn13
0	1	1	우리가 빛의 속도로 갈 수 없다면 :김초엽 소설	지은이: 김초엽	허블	2019	9791190090018
2	3	3	지구에서 한아뿐 :정세랑 장편소설	지은이: 정세랑	난다	2019	9791188862290
4	5	5	아몬드 :손원평 장편소설	지은이: 손원평	창비	2017	9788936434267
6	7	7	목소리를 드릴게요 :정세랑 소설집	지은이: 정세랑	아작	2020	9791165300005
8	9	9	선량한 차별주의자	김지혜 지음	창비	2019	9788936477196

In [400]:

from bs4 import BeautifulSoup

In [402]:

soup = BeautifulSoup(r.text, 'html.parser')

In [404]:

prd_link = soup.find('a', attrs={'class':'gd_name'})
print(prd_link)

<a class="gd_name" href="/Product/Goods/74261416" onclick="wiseLogV2('S', '101_005_003_001', ''); setGoodsClickExtraCodeHub('032', '9791190090018', '74261416', '0',this);">[예스리커버] 우리가 빛의 속도로 갈 수 없다면</a>

In [407]:

soup = BeautifulSoup(r.text, 'html.parser')
prd_detail = soup.find('div', attrs={'id':'infoset_specific'})
print(prd_detail)

<div class="gd_infoSet infoSet_noLine" id="infoset_specific">
<div class="tm_infoSet">
<h4 class="tit_txt">품목정보</h4>
</div>
<div class="infoSetCont_wrap">
<div class="yesTb">
<table class="tb_nor tb_vertical" summary="품목정보 국내도서, 외국도서 " width="100%">
<caption>품목정보</caption>
<colgroup>
<col width="170"/>
<col width="*"/>
</colgroup>
<tbody class="b_size">
<tr>
<th class="txt" scope="row">발행일</th>
<td class="txt lastCol">2019년 06월 24일</td>
</tr>
<tr>
<th class="txt" scope="row">쪽수, 무게, 크기</th>
<td class="txt lastCol">330쪽 | 496g | 130*198*30mm</td>
</tr>
<tr>
<th class="txt" scope="row">ISBN13</th>
<td class="txt lastCol">9791190090018</td>
</tr>
<tr>
<th class="txt" scope="row">ISBN10</th>
<td class="txt lastCol">1190090015</td>
</tr>
</tbody>
</table>
</div>
</div>
<script type="text/javascript">
        if ($("#infoset_specific table tbody tr").length == 0) {
            $("#infoset_specific").remove();
        }
    </script>
</div>

In [410]:

prd_tr_list = prd_detail.find_all('tr')
print(prd_tr_list)

[<tr>
<th class="txt" scope="row">발행일</th>
<td class="txt lastCol">2019년 06월 24일</td>
</tr>, <tr>
<th class="txt" scope="row">쪽수, 무게, 크기</th>
<td class="txt lastCol">330쪽 | 496g | 130*198*30mm</td>
</tr>, <tr>
<th class="txt" scope="row">ISBN13</th>
<td class="txt lastCol">9791190090018</td>
</tr>, <tr>
<th class="txt" scope="row">ISBN10</th>
<td class="txt lastCol">1190090015</td>
</tr>]

In [412]:

for tr in prd_tr_list:
    if tr.find('th').get_text() == '쪽수, 무게, 크기':
        page_td = tr.find('td').get_text()
        break

print(page_td)

330쪽 | 496g | 130*198*30mm

In [414]:

print(page_td.split()[0])

330쪽

추가 숙제¶

In [417]:

def get_page_cnt(isbn):
    url = 'http://www.yes24.com/Product/Search?domain=BOOK&query={}'
    r = requests.get(url.format(isbn))
    soup = BeautifulSoup(r.text, 'html.parser')
    prd_info = soup.find('a', attrs={'class':'gd_name'})
    if prd_info == None:
        return ''
    url = 'http://www.yes24.com' + prd_info['href']
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    prd_detail = soup.find('div', attrs={'id':'infoset_specific'})
    prd_tr_list = prd_detail.find_all('tr')
    for tr in prd_tr_list:
        if tr.find('th').get_text() == '쪽수, 무게, 크기':
            return tr.find('td').get_text().split()[0]
    return ''

get_page_cnt(9791190090018)

Out[417]:

'330쪽'

In [419]:

books = books_df.loc[:, 'no':'isbn13']
top10_books = books.head(10)
print(top10_books)

   no  ranking                     bookname                  authors  \
0   1        1  우리가 빛의 속도로 갈 수 없다면 :김초엽 소설                  지은이: 김초엽   
1   2        2         달러구트 꿈 백화점.이미예 장편소설                  지은이: 이미예   
2   3        3          지구에서 한아뿐 :정세랑 장편소설                  지은이: 정세랑   
3   4        4           시선으로부터, :정세랑 장편소설                  지은이: 정세랑   
4   5        5               아몬드 :손원평 장편소설                  지은이: 손원평   
5   6        6            피프티 피플 :정세랑 장편소설                  지은이: 정세랑   
6   7        7          목소리를 드릴게요 :정세랑 소설집                  지은이: 정세랑   
7   8        8  나미야 잡화점의 기적 :히가시노 게이고 장편소설   지은이: 히가시노 게이고 ;옮긴이: 양윤옥   
8   9        9                   선량한 차별주의자                    김지혜 지음   
9  10        9              쇼코의 미소 :최은영 소설                  지은이: 최은영   

  publisher publication_year         isbn13  
0        허블             2019  9791190090018  
1     팩토리나인             2020  9791165341909  
2        난다             2019  9791188862290  
3      문학동네             2020  9788954672214  
4        창비             2017  9788936434267  
5        창비             2016  9788936434243  
6        아작             2020  9791165300005  
7      현대문학             2012  9791167901484  
8        창비             2019  9788936477196  
9      문학동네             2016  9788954641630

In [421]:

def get_page_cnt2(row):
    isbn = row['isbn13']
    return get_page_cnt(isbn)

In [425]:

page_count = top10_books.apply(get_page_cnt2, axis=1)
print(page_count)
# 나미 잡화점의 기적 isbn13 숫자 잘못 표시되어 있어 '9791167901484'로 수정함

0    330쪽
1    300쪽
2    228쪽
3    340쪽
4    264쪽
5    396쪽
6    272쪽
7    456쪽
8    244쪽
9    296쪽
dtype: object

In [427]:

page_count.name = 'page_count'
print(page_count)

0    330쪽
1    300쪽
2    228쪽
3    340쪽
4    264쪽
5    396쪽
6    272쪽
7    456쪽
8    244쪽
9    296쪽
Name: page_count, dtype: object

In [445]:

top10_with_page_count = pd.merge(top10_books, page_count, left_index=True, right_index=True)
top10_with_page_count

Out[445]:

	no	ranking	bookname	authors	publisher	publication_year	isbn13	page_count
0	1	1	우리가 빛의 속도로 갈 수 없다면 :김초엽 소설	지은이: 김초엽	허블	2019	9791190090018	330쪽
1	2	2	달러구트 꿈 백화점.이미예 장편소설	지은이: 이미예	팩토리나인	2020	9791165341909	300쪽
2	3	3	지구에서 한아뿐 :정세랑 장편소설	지은이: 정세랑	난다	2019	9791188862290	228쪽
3	4	4	시선으로부터, :정세랑 장편소설	지은이: 정세랑	문학동네	2020	9788954672214	340쪽
4	5	5	아몬드 :손원평 장편소설	지은이: 손원평	창비	2017	9788936434267	264쪽
5	6	6	피프티 피플 :정세랑 장편소설	지은이: 정세랑	창비	2016	9788936434243	396쪽
6	7	7	목소리를 드릴게요 :정세랑 소설집	지은이: 정세랑	아작	2020	9791165300005	272쪽
7	8	8	나미야 잡화점의 기적 :히가시노 게이고 장편소설	지은이: 히가시노 게이고 ;옮긴이: 양윤옥	현대문학	2012	9791167901484	456쪽
8	9	9	선량한 차별주의자	김지혜 지음	창비	2019	9788936477196	244쪽
9	10	9	쇼코의 미소 :최은영 소설	지은이: 최은영	문학동네	2016	9788954641630	296쪽

기본숙제¶

In [472]:

data = {
    'col1':['a','b','c'],
    'col2':[1,2,3],
}
df = pd.DataFrame(data)

print("1)")
print(df.loc[[0,1,2],['col1','col2']])
print("\n2)")
print(df.loc[0:2,'col1':'col2'])
print("\n3)")
print(df.loc[:2,[True,True]])
print("\n4)")
print(df.loc[::2,'col1':'col2']) #정답: 4번

1)
  col1  col2
0    a     1
1    b     2
2    c     3

2)
  col1  col2
0    a     1
1    b     2
2    c     3

3)
  col1  col2
0    a     1
1    b     2
2    c     3

4)
  col1  col2
0    a     1
2    c     3

[혼공파] 6주차_혼공분석 (0)	2025.03.05
[혼공파] 5주차_혼공분석 (1)	2025.03.05
[혼공파] 4주차_혼공분석 (0)	2025.02.09
[혼공파] 3주차_혼공분석 (0)	2025.01.26

bbbakery 님의 블로그

[혼공파] 2주차_혼공분석

JSON¶

XML¶

HTML¶

추가 숙제¶

기본숙제¶

'[혼공] 데이터분석' 카테고리의 다른 글

'[혼공] 데이터분석'의 다른글

티스토리툴바

« 2025/07 »
일	월	화	수	목	금	토
		1	2	3	4	5
6	7	8	9	10	11	12
13	14	15	16	17	18	19
20	21	22	23	24	25	26
27	28	29	30	31

[혼공파] 2주차_혼공분석

JSON¶

XML¶

HTML¶

추가 숙제¶

기본숙제¶

'[혼공] 데이터분석' 카테고리의 다른 글

'[혼공] 데이터분석'의 다른글

관련글

티스토리툴바