<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
  <channel>
    <title>BigData</title>
    <link>https://bigdata0seok.tistory.com/</link>
    <description></description>
    <language>ko</language>
    <pubDate>Fri, 3 Jul 2026 00:04:05 +0900</pubDate>
    <generator>TISTORY</generator>
    <ttl>100</ttl>
    <managingEditor>seo0seok</managingEditor>
    <image>
      <title>BigData</title>
      <url>https://tistory1.daumcdn.net/tistory/6147677/attach/9ec2fa7e7f994bfc87c0db07fb9c9767</url>
      <link>https://bigdata0seok.tistory.com</link>
    </image>
    <item>
      <title>[Python] 이미지 데이터셋 수집 방법 3가지</title>
      <link>https://bigdata0seok.tistory.com/41</link>
      <description>&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-filename=&quot;다운로드.jfif&quot; data-origin-width=&quot;366&quot; data-origin-height=&quot;223&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/Rm6ZM/btsm1TSWh96/MKPsOvUIh7iwNZQUxjvSx1/img.jpg&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/Rm6ZM/btsm1TSWh96/MKPsOvUIh7iwNZQUxjvSx1/img.jpg&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/Rm6ZM/btsm1TSWh96/MKPsOvUIh7iwNZQUxjvSx1/img.jpg&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FRm6ZM%2Fbtsm1TSWh96%2FMKPsOvUIh7iwNZQUxjvSx1%2Fimg.jpg&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;366&quot; height=&quot;223&quot; data-filename=&quot;다운로드.jfif&quot; data-origin-width=&quot;366&quot; data-origin-height=&quot;223&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;&lt;span&gt;1. roboflow&lt;/span&gt;&lt;/b&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;&lt;span&gt;&lt;a href=&quot;https://public.roboflow.com/&quot; target=&quot;_blank&quot; rel=&quot;noopener&quot;&gt;https://public.roboflow.com/&lt;/a&gt;&lt;/span&gt;&lt;/b&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;&lt;span&gt;2. kaggle&lt;/span&gt;&lt;/b&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;&lt;span&gt;&lt;a href=&quot;https://www.kaggle.com/datasets&quot; target=&quot;_blank&quot; rel=&quot;noopener&quot;&gt;https://www.kaggle.com/datasets&lt;/a&gt;&lt;/span&gt;&lt;/b&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;3. 구글 이미지 크롤링&lt;/b&gt;&lt;/p&gt;
&lt;pre id=&quot;code_1688971764682&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;import ssl
import os
import sys
import time
import urllib.request
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

ssl._create_default_https_context = ssl._create_unverified_context

searchKey = input('Search Keyword : ')

# 저장할 폴더 경로 설정
path = f&quot;./imgs/{searchKey}/images&quot;

try:
    # 중복되는 폴더명이 없다면 생성
    if not os.path.exists(path):
        os.makedirs(path)
    # 중복된다면 문구 출력 후 프로그램 종료
    else:
        print('이전에 같은 [검색어, 이미지 수]로 다운로드한 폴더가 존재합니다.')
        sys.exit(0)
except OSError:
    print('OS error')
    sys.exit(0)

options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)
driver.get(&quot;https://www.google.co.kr/imghp?hl=ko&amp;amp;tab=wi&amp;amp;authuser=0&amp;amp;ogbl&quot;)
elem = driver.find_element(&quot;name&quot;, &quot;q&quot;)

elem.send_keys(searchKey)
elem.send_keys(Keys.RETURN)

SCROLL_PAUSE_TIME = 1
# Get scroll height
last_height = driver.execute_script(&quot;return document.body.scrollHeight&quot;)
while True:
    # Scroll down to bottom
    driver.execute_script(&quot;window.scrollTo(0, document.body.scrollHeight);&quot;)
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script(&quot;return document.body.scrollHeight&quot;)
    selector = &quot;.mye4qd&quot;
    if new_height == last_height:
        try:
            driver.find_element(By.CSS_SELECTOR, selector).send_keys(Keys.ENTER)
        except:
            break
    last_height = new_height

images = driver.find_elements(By.CSS_SELECTOR, &quot;.rg_i.Q4LuWd&quot;)
print(&quot;Total images found:&quot;, len(images))

count = 0
for image in images:
    try:
        if count &amp;gt;= 500:
            break
        image.click()
        time.sleep(1)
        xpath = r'//*[@id=&quot;Sva75c&quot;]/div[2]/div[2]/div[2]/div[2]/c-wiz/div/div/div/div[3]/div[1]/a/img[1]'
        imgUrl = driver.find_element(By.XPATH, xpath).get_attribute(&quot;src&quot;)
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
        urllib.request.install_opener(opener)

        urllib.request.urlretrieve(imgUrl, f'{path}/{searchKey}{str(count+1)}.jpg')
        count += 1
        print(&quot;Downloaded image&quot;, count)
    except Exception as e:
        print('Error:', e)

driver.quit()&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - 구글 이미지 크롤링 코드이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - 코드 실행 후 크롤링할 검색명을 console창에 입력하면 구글 이미지 크롤링이 시작된다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - 자동으로 500개까지 크롤링 되게 코딩되어있지만 구글 보안 정책 때문에&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; &amp;nbsp; 최대 400개 까지만 가능하다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - 크롤링 도중 에러가 발생하는 이미지들도 많아 400개를 크롤링하여도 다운로드&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; &amp;nbsp; 되는 이미지는 그것보다 적다.&amp;nbsp;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - 필자는 파이참으로 실행하여 PycharmProjects 폴더에 이미지 폴더가 생성된다.&lt;/p&gt;</description>
      <category>Python/이미지 처리</category>
      <author>seo0seok</author>
      <guid isPermaLink="true">https://bigdata0seok.tistory.com/41</guid>
      <comments>https://bigdata0seok.tistory.com/41#entry41comment</comments>
      <pubDate>Mon, 10 Jul 2023 15:54:27 +0900</pubDate>
    </item>
    <item>
      <title>[Python] labelImg를 활용한 이미지 라벨링</title>
      <link>https://bigdata0seok.tistory.com/40</link>
      <description>&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-filename=&quot;다운로드.jfif&quot; data-origin-width=&quot;366&quot; data-origin-height=&quot;223&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/bxVWmZ/btsm8X7L8bp/ZqHQTbEMHvtw0xITpoKvvk/img.jpg&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/bxVWmZ/btsm8X7L8bp/ZqHQTbEMHvtw0xITpoKvvk/img.jpg&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/bxVWmZ/btsm8X7L8bp/ZqHQTbEMHvtw0xITpoKvvk/img.jpg&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FbxVWmZ%2Fbtsm8X7L8bp%2FZqHQTbEMHvtw0xITpoKvvk%2Fimg.jpg&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;366&quot; height=&quot;223&quot; data-filename=&quot;다운로드.jfif&quot; data-origin-width=&quot;366&quot; data-origin-height=&quot;223&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;&lt;span&gt;1. labelImg란&lt;/span&gt;&lt;/b&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;span&gt;&amp;nbsp; - labelImg는 이미지 주석 도구로, 개발자들이 객체 감지 모델을 학습하기 위해 이미지에 &lt;/span&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;span&gt;&amp;nbsp; &amp;nbsp; 주석을 달 수 있도록 도와주는 프로그램이다. 이 프로그램은 영상 및 이미지 처리 작업에&lt;/span&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;span&gt;&amp;nbsp; &amp;nbsp; 많이 사용되며, 객체 감지 모델을 구축하는 데 필수적인 작업 중 하나이다.&amp;nbsp;&lt;/span&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: start;&quot; data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;&lt;span&gt;2.&lt;span&gt;&lt;span&gt; &lt;b&gt;&lt;span&gt;labelImg &lt;/span&gt;&lt;/b&gt;&lt;/span&gt;설치&lt;/span&gt;&lt;/span&gt;&lt;/b&gt;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: start;&quot; data-ke-size=&quot;size16&quot;&gt;&lt;a href=&quot;https://github.com/tzutalin/labelImg&quot;&gt;https://github.com/tzutalin/labelImg&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-filename=&quot;YOLOv7 설치1.png&quot; data-origin-width=&quot;1896&quot; data-origin-height=&quot;564&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/bIjTge/btsmUJDwFz8/EgzRK6kTVsjkiEhk4Q9S0k/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/bIjTge/btsmUJDwFz8/EgzRK6kTVsjkiEhk4Q9S0k/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/bIjTge/btsmUJDwFz8/EgzRK6kTVsjkiEhk4Q9S0k/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FbIjTge%2FbtsmUJDwFz8%2FEgzRK6kTVsjkiEhk4Q9S0k%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;1896&quot; height=&quot;564&quot; data-filename=&quot;YOLOv7 설치1.png&quot; data-origin-width=&quot;1896&quot; data-origin-height=&quot;564&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - 먼저 github에 들어가 Releases -&amp;gt; Binary v1.8.1을 클릭한다.&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-filename=&quot;제목 없음.png&quot; data-origin-width=&quot;1896&quot; data-origin-height=&quot;736&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/bg2kmw/btsm8ZLir0E/vWK5XPX17TS9sCYPMHETj0/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/bg2kmw/btsm8ZLir0E/vWK5XPX17TS9sCYPMHETj0/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/bg2kmw/btsm8ZLir0E/vWK5XPX17TS9sCYPMHETj0/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2Fbg2kmw%2Fbtsm8ZLir0E%2FvWK5XPX17TS9sCYPMHETj0%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;1896&quot; height=&quot;736&quot; data-filename=&quot;제목 없음.png&quot; data-origin-width=&quot;1896&quot; data-origin-height=&quot;736&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - windows_v 1.8.1.zip 파일을 다운로드 받는다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;1137&quot; data-origin-height=&quot;232&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/dqCntg/btsmUJp2P3m/x4DF0NK4akOp6q2wjcYwuk/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/dqCntg/btsmUJp2P3m/x4DF0NK4akOp6q2wjcYwuk/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/dqCntg/btsmUJp2P3m/x4DF0NK4akOp6q2wjcYwuk/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FdqCntg%2FbtsmUJp2P3m%2Fx4DF0NK4akOp6q2wjcYwuk%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;1137&quot; height=&quot;232&quot; data-origin-width=&quot;1137&quot; data-origin-height=&quot;232&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - zip 파일을 풀면 data, labelImg 두 파일이 생성된다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - data 안에는 txt 파일이 들어있는데 이 파일에 라벨링할 클래스명을 적어주면 된다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; &amp;nbsp;(ex. monkey)&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - labelImg를 클릭하여 실행한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;&lt;span&gt;3.&lt;span&gt;&lt;span&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;&lt;span&gt;labelImg&lt;span&gt;&amp;nbsp;사용법&lt;/span&gt;&lt;/span&gt;&lt;/b&gt;&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/b&gt;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-filename=&quot;제목 없음.png&quot; data-origin-width=&quot;1901&quot; data-origin-height=&quot;1010&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/bE5C19/btsm8QnkE6o/krTtkwfmjKgdbvqYtj6jQ0/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/bE5C19/btsm8QnkE6o/krTtkwfmjKgdbvqYtj6jQ0/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/bE5C19/btsm8QnkE6o/krTtkwfmjKgdbvqYtj6jQ0/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FbE5C19%2Fbtsm8QnkE6o%2FkrTtkwfmjKgdbvqYtj6jQ0%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;1901&quot; height=&quot;1010&quot; data-filename=&quot;제목 없음.png&quot; data-origin-width=&quot;1901&quot; data-origin-height=&quot;1010&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;span style=&quot;color: #333333; text-align: start;&quot;&gt;&lt;span&gt;&amp;nbsp; &lt;/span&gt;- 프로그램 실행 시 첫 화면이다.&lt;/span&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;span style=&quot;color: #333333; text-align: start;&quot;&gt;&amp;nbsp; - 빨간 네모칸 안에 형식을 바꿀 수 있는데 우리는 이미지를 라벨링해 YOLO에서 사용할&lt;/span&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;span style=&quot;color: #333333; text-align: start;&quot;&gt;&amp;nbsp; &amp;nbsp; 것이기 때문에 클릭하여 YOLO로 바꾸어 준다.&lt;/span&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;span style=&quot;color: #333333; text-align: start;&quot;&gt;&amp;nbsp; - PascalVOC로 라벨링 시 XML 파일로 생성 / YOLO로 라벨링시 txt 파일이 생성된다.&lt;/span&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;span style=&quot;color: #333333; text-align: start;&quot;&gt;&amp;nbsp; - 실수로 안 바꾸면 재작업을 해야 하니 꼭 바꿔주자.&lt;/span&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-filename=&quot;제목 없음.png&quot; data-origin-width=&quot;1901&quot; data-origin-height=&quot;1010&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/1oIgj/btsm4Ccl7dV/rRCk5CCoxxwp5fnA2NERuk/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/1oIgj/btsm4Ccl7dV/rRCk5CCoxxwp5fnA2NERuk/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/1oIgj/btsm4Ccl7dV/rRCk5CCoxxwp5fnA2NERuk/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2F1oIgj%2Fbtsm4Ccl7dV%2FrRCk5CCoxxwp5fnA2NERuk%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;1901&quot; height=&quot;1010&quot; data-filename=&quot;제목 없음.png&quot; data-origin-width=&quot;1901&quot; data-origin-height=&quot;1010&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;span style=&quot;color: #333333; text-align: start;&quot;&gt;&amp;nbsp; &amp;nbsp;- 형식이 YOLO로 바뀐 모습이다.&lt;/span&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-filename=&quot;제목 없음.png&quot; data-origin-width=&quot;1901&quot; data-origin-height=&quot;1010&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/MQ8Ds/btsm8PBXWUa/AZfUawkA3R9TTovUUhIQB0/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/MQ8Ds/btsm8PBXWUa/AZfUawkA3R9TTovUUhIQB0/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/MQ8Ds/btsm8PBXWUa/AZfUawkA3R9TTovUUhIQB0/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FMQ8Ds%2Fbtsm8PBXWUa%2FAZfUawkA3R9TTovUUhIQB0%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;1901&quot; height=&quot;1010&quot; data-filename=&quot;제목 없음.png&quot; data-origin-width=&quot;1901&quot; data-origin-height=&quot;1010&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - Open Dir 버튼을 클릭해 이미지 파일이 저장되어 있는 폴더를 지정해준다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-filename=&quot;제목 없음.png&quot; data-origin-width=&quot;1901&quot; data-origin-height=&quot;1010&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/FB5i5/btsm9uxktRN/XZ2eKmU0JNTfV1Lt7o1QfK/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/FB5i5/btsm9uxktRN/XZ2eKmU0JNTfV1Lt7o1QfK/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/FB5i5/btsm9uxktRN/XZ2eKmU0JNTfV1Lt7o1QfK/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FFB5i5%2Fbtsm9uxktRN%2FXZ2eKmU0JNTfV1Lt7o1QfK%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;1901&quot; height=&quot;1010&quot; data-filename=&quot;제목 없음.png&quot; data-origin-width=&quot;1901&quot; data-origin-height=&quot;1010&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - Change Save Dir 버튼을 클릭해 라벨링 된 txt파일 저장할 폴더를 지정해준다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;1901&quot; data-origin-height=&quot;1010&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/sXIFz/btsm82OBPG3/81n4pDgxdxCKy6l38WGTk1/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/sXIFz/btsm82OBPG3/81n4pDgxdxCKy6l38WGTk1/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/sXIFz/btsm82OBPG3/81n4pDgxdxCKy6l38WGTk1/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FsXIFz%2Fbtsm82OBPG3%2F81n4pDgxdxCKy6l38WGTk1%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;1901&quot; height=&quot;1010&quot; data-origin-width=&quot;1901&quot; data-origin-height=&quot;1010&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - 경로 설정을 완료하면 이미지 폴더에 있는 이미지가 보인다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;lt;단축키&amp;gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - 라벨링하면서 쓰는 단축키 4가지이다. 이 4가지만 알고 있어도 충분하다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; w 클릭 후 마우스 드래그 : 라벨링 범위 지정&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; a 버튼 : 이전 이미지로 이동&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; d 버튼 : 다음 이미지로 이동&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; ctrl + s : 저장하기&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-filename=&quot;제목 없음.png&quot; data-origin-width=&quot;1901&quot; data-origin-height=&quot;1010&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/n02b7/btsm828XkIG/0aJ4eu8jE0QYwdGKUUzG40/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/n02b7/btsm828XkIG/0aJ4eu8jE0QYwdGKUUzG40/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/n02b7/btsm828XkIG/0aJ4eu8jE0QYwdGKUUzG40/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2Fn02b7%2Fbtsm828XkIG%2F0aJ4eu8jE0QYwdGKUUzG40%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;1901&quot; height=&quot;1010&quot; data-filename=&quot;제목 없음.png&quot; data-origin-width=&quot;1901&quot; data-origin-height=&quot;1010&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - 라벨링 범위를 지정한 모습이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - 여기서 Save 버튼을 클릭하거나, ctrl + s를 눌러 저장해준다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - 간혹 저장 시 에러가 나는 이미지들이 있는데 그런 이미지들은 그냥 포기하고 넘어가준다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;1427&quot; data-origin-height=&quot;732&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/bgItbD/btsm8IQBwAR/NC8kPrjZ5SNRJg5KmxpsRK/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/bgItbD/btsm8IQBwAR/NC8kPrjZ5SNRJg5KmxpsRK/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/bgItbD/btsm8IQBwAR/NC8kPrjZ5SNRJg5KmxpsRK/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FbgItbD%2Fbtsm8IQBwAR%2FNC8kPrjZ5SNRJg5KmxpsRK%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;1427&quot; height=&quot;732&quot; data-origin-width=&quot;1427&quot; data-origin-height=&quot;732&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - 라벨링이 완료된 후 생성된 txt 파일이다.&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: start;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - 맨 앞 숫자 0은 클래스 번호로, 각 클래스는 0부터 시작하는 숫자로 표시된다.&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: start;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - 그 뒤로 나오는 숫자 4개는 객체의 경계 상자 정보이다.&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: start;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - YOLO 형식에서는 (x_center, y_center, width, height) 순서로 표현되며, 이는 경계 상자의&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: start;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; &amp;nbsp; 중심 좌표 (x_center, y_center)와 너비, 높이를 이미지 크기 기준 0~1 사이로 정규화한 값이다.&amp;nbsp;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: start;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: start;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: start;&quot; data-ke-size=&quot;size16&quot;&gt;지금까지 labelImg 프로그램으로 이미지 라벨링 하는법을 알아보았다.&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: start;&quot; data-ke-size=&quot;size16&quot;&gt;이렇게 작업한 이미지와 라벨링한 txt 파일을 YOLOv7에 활용해 이미지 분석을 할 수 있다.&lt;/p&gt;
      <category>Python/이미지 처리</category>
      <author>seo0seok</author>
      <guid isPermaLink="true">https://bigdata0seok.tistory.com/40</guid>
      <comments>https://bigdata0seok.tistory.com/40#entry40comment</comments>
      <pubDate>Mon, 10 Jul 2023 15:40:02 +0900</pubDate>
    </item>
    <item>
      <title>[Python] YOLOv7을 활용한 이미지 분석</title>
      <link>https://bigdata0seok.tistory.com/39</link>
      <description>&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;366&quot; data-origin-height=&quot;223&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/VFAEY/btsm2IQITQf/Glf6kLVV7Y7muoKeKi4kKK/img.jpg&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/VFAEY/btsm2IQITQf/Glf6kLVV7Y7muoKeKi4kKK/img.jpg&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/VFAEY/btsm2IQITQf/Glf6kLVV7Y7muoKeKi4kKK/img.jpg&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FVFAEY%2Fbtsm2IQITQf%2FGlf6kLVV7Y7muoKeKi4kKK%2Fimg.jpg&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;366&quot; height=&quot;223&quot; data-origin-width=&quot;366&quot; data-origin-height=&quot;223&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;&lt;span&gt;1. YOLOv7이란&lt;/span&gt;&lt;/b&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;span&gt;&amp;nbsp; - YOLOv7은 객체 탐지 알고리즘 중 하나인 You Only Look Once (YOLO)의 버전 7이다.&lt;/span&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - 실시간으로 탐지가 가능하고 속도가 빠르다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - 다중 객체 탐지가 가능하다. 한 이미지에서 여러 객체를 동시에 탐지할 수 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - 객체의 크기와 종횡비에 강인해 작은 객체나 다양한 크기의 객체도 정확히 탐지가 가능하다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;&lt;span&gt;2.&lt;span&gt; YOLOv7 설치&lt;/span&gt;&lt;/span&gt;&lt;/b&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;span&gt;&lt;span style=&quot;color: #0563c1;&quot;&gt;&lt;u&gt;&lt;/u&gt;&lt;u&gt;&lt;a href=&quot;https://github.com/WongKinYiu/yolov7&quot;&gt;https://github.com/WongKinYiu/yolov7&lt;/a&gt;&lt;/u&gt;&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-filename=&quot;YOLOv7 설치1.png&quot; data-origin-width=&quot;1896&quot; data-origin-height=&quot;801&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/bf2kyw/btsm2HRTLEo/J4Qg6MaZ2KqsJqE5pkLyHK/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/bf2kyw/btsm2HRTLEo/J4Qg6MaZ2KqsJqE5pkLyHK/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/bf2kyw/btsm2HRTLEo/J4Qg6MaZ2KqsJqE5pkLyHK/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2Fbf2kyw%2Fbtsm2HRTLEo%2FJ4Qg6MaZ2KqsJqE5pkLyHK%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;1896&quot; height=&quot;801&quot; data-filename=&quot;YOLOv7 설치1.png&quot; data-origin-width=&quot;1896&quot; data-origin-height=&quot;801&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - 먼저 github에 들어가 zip파일을 다운로드 받는다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;&lt;span&gt;&lt;span&gt;&lt;span&gt;3. Jupyter Notebook&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/b&gt;&lt;/p&gt;
&lt;pre id=&quot;code_1688964919186&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;import os
os.getcwd()

실행 결과 :
'/storage01/shared_data/users/youngseok/YOLOv7'&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;span&gt;&lt;span&gt;&lt;span&gt;&amp;nbsp; - 주피터 노트북에 다운로드 받은 YOLOv7을 업로드 한 후 os.getcwd()를 사용해&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;span&gt;&lt;span&gt;&lt;span&gt;&amp;nbsp; &amp;nbsp; 현재 경로를 찾는다.&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1688965001505&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;import shutil

filename = './YOLOv7.zip' # 압축 해제할 파일
extract_dir = './YOLOv7/' # 압축 해제 시 폴더 이름
archive_format = 'zip'

shutil.unpack_archive(filename, extract_dir, archive_format)&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;span&gt;&lt;span&gt;&lt;span&gt;&amp;nbsp; - 현재 경로를 찾은 후 위 코드에서 압축 해제할 zip파일과 압축 해제 시 폴더이름, format을&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;span&gt;&lt;span&gt;&lt;span&gt;&amp;nbsp; &amp;nbsp; 설정해주고 &lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span&gt;&lt;span&gt;&lt;span&gt;실행하면 YOLOv7 zip파일 해제가 된다.&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1688965188310&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;!pip install -r requirements.txt&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - 위 코드를 실행해 필요한 라이브러리들을 한번에 설치한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1688965244538&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;import wget
wget.download('https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7.pt')&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - 위 코드는 YOLOv7에서 필요한 가중치들을 다운로드 받는 코드이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1688965319241&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;!python train.py --device 6,7 --batch-size 16 --epochs 40 --img 640 
--data ../data_monkey.yaml --weights yolov7.pt&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - YOLOv7에 있는 train.py 코드를 실행 시켜준다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - device : 사용할 GPU 번호 (0부터 시작)&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - batch-size : 한 번의 모델 업데이트에 사용되는 데이터의 수&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; &amp;nbsp; -&amp;gt; 높을수록 메모리 사용량이 늘고 연산 시간이 길어진다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - epochs : 전체 학습 데이터를 몇 번 반복할지&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; &amp;nbsp; -&amp;gt; 높을수록 학습 결과가 좋아지지만 과적합 위험성 있음&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - img : 이미지 크기&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; &amp;nbsp; -&amp;gt; 클수록 학습 결과가 좋아질 수 있지만 연산 시간이 길어진다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - data : yaml 파일 경로&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - weights : 가중치&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; &amp;nbsp; -&amp;gt; 다운로드 받은 yolov7.pt를 사용해준다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1688965852496&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;train: ../object/monkey/monkey/train/images
val: ../object/monkey/monkey/valid/images
test: ../object/monkey/monkey/test/images

nc: 1
names: ['monkey']&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - yaml 파일이란 데이터의 구조와 계층을 표현하기 위해 들여쓰기를 사용하며,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; &amp;nbsp; 주로 설정 파일이나 데이터 전송 형식으로 사용된다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - train, val, test 폴더의 경로를 지정해준다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - nc : class 개수&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - names : class 명&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - 이미지에서 원숭이 한 객체만 탐지하기 위해 개수는 1, class 명은 monkey로 설정해주었다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - train, val, test 폴더는 각각 images, labels 두 폴더로 이루어져 있다.&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; &amp;nbsp; -&amp;gt; images 폴더에는 원숭이 이미지, labels 폴더에는 원숭이 이미지를 라벨링하고&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;난 후&amp;nbsp;좌표값 txt파일이 들어있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1688966280819&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;0 0.500000 0.485455 0.387978 0.890909&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - 라벨링 된 txt 파일 예시이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - 맨 앞 숫자 0은 각 클래스를 숫자로 지정해준다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - 그 뒤로 나오는 숫자 4개는 객체의 경계 상자 정보이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - YOLO 형식에서는 (x_center, y_center, width, height) 순서로 표현되며, 이는 경계 상자의&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; &amp;nbsp; 중심 (x_center, y_center) 좌표와 너비(width), 높이(height)를 이미지 크기로 나누어 0~1 사이로 정규화한 값이다.&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - train.py 코드로 학습을 시키면 YOLOv7 -&amp;gt; runs -&amp;gt; train 폴더 순으로 들어가보면&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; &amp;nbsp; exp1 ~ exp** 폴더가 생성된다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - exp 폴더 안에 weights(가중치) / results.png / confusion_matrix.png / R_curve /&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; &amp;nbsp; P_curve / PR_curve / F1_curve 등 모델 성능 평가 지표들이 있다.&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - weights 폴더 안에 best.pt는 학습 중 가장 좋은 결과가 나온 가중치 정보가 저장되어 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; &amp;nbsp; 이는 뒤에 detect.py 코드 실행 시 사용된다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1688966894785&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;!python detect.py --device 6,7 --weights runs/train/exp33/weights/best.pt 
--conf 0.2 --img-size 640 --source ../object/monkey/monkey/test/images&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - train.py 코드로 학습이 끝난 후 detect.py 코드를 실행 시켜준다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - device : 사용할 GPU 번호 (0부터 시작)&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - weights : 가중치 경로&amp;nbsp; / exp는 학습할 때마다 생기므로 번호를 맞춰 써줘야한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; &amp;nbsp; -&amp;gt; 가장 좋은 결과가 나왔던 가중치 best.pt를 사용해준다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - conf : 객체 탐지에 대한 신뢰도 임계값으로 0.2보다 높은 객체만 탐지 결과로 출력된다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - img-size : 이미지의 크기&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - source : test할 이미지 폴더 경로&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-filename=&quot;P_curve.png&quot; data-origin-width=&quot;2250&quot; data-origin-height=&quot;1500&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/cHXMrR/btsm9vJDbyL/R2yhRzkbIpZ63kOKukqKy0/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/cHXMrR/btsm9vJDbyL/R2yhRzkbIpZ63kOKukqKy0/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/cHXMrR/btsm9vJDbyL/R2yhRzkbIpZ63kOKukqKy0/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FcHXMrR%2Fbtsm9vJDbyL%2FR2yhRzkbIpZ63kOKukqKy0%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;2250&quot; height=&quot;1500&quot; data-filename=&quot;P_curve.png&quot; data-origin-width=&quot;2250&quot; data-origin-height=&quot;1500&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - P-curve 그래프를 시각화 한 것으로 0.904가 나왔다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - 이는 모델이 양성으로 예측한 대상 중 실제로 90.4%가 양성이라는 의미이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-filename=&quot;R_curve.png&quot; data-origin-width=&quot;2250&quot; data-origin-height=&quot;1500&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/dZl5CD/btsmUHZTG5L/gEabb1MDiIvpNZkn2KOGS1/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/dZl5CD/btsmUHZTG5L/gEabb1MDiIvpNZkn2KOGS1/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/dZl5CD/btsmUHZTG5L/gEabb1MDiIvpNZkn2KOGS1/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FdZl5CD%2FbtsmUHZTG5L%2FgEabb1MDiIvpNZkn2KOGS1%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;2250&quot; height=&quot;1500&quot; data-filename=&quot;R_curve.png&quot; data-origin-width=&quot;2250&quot; data-origin-height=&quot;1500&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - R-curve 그래프를 시각화 한 것으로 0.96이 나왔다.&lt;/p&gt;
&lt;p style=&quot;color: #333333; text-align: start;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - 이는 실제 양성인 대상을 예측한 확률이 96%라는 의미로 모델이 양성 샘플을&lt;/p&gt;
&lt;p style=&quot;color: #333333; text-align: start;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; &amp;nbsp; 거의 다 찾았다고 볼 수 있다.&lt;/p&gt;
&lt;p style=&quot;color: #333333; text-align: start;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-filename=&quot;PR_curve.png&quot; data-origin-width=&quot;2250&quot; data-origin-height=&quot;1500&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/tcmvH/btsm8ZRV8NU/nY1pIc0IABhIg1ztSKK8bk/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/tcmvH/btsm8ZRV8NU/nY1pIc0IABhIg1ztSKK8bk/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/tcmvH/btsm8ZRV8NU/nY1pIc0IABhIg1ztSKK8bk/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FtcmvH%2Fbtsm8ZRV8NU%2FnY1pIc0IABhIg1ztSKK8bk%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;2250&quot; height=&quot;1500&quot; data-filename=&quot;PR_curve.png&quot; data-origin-width=&quot;2250&quot; data-origin-height=&quot;1500&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - PR-curve 그래프를 시각화 한 것으로 0.8590이 나왔다.&lt;/p&gt;
&lt;p style=&quot;color: #333333; text-align: start;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - 이 값은 PR 곡선 아래 면적(mAP@0.5)으로, 정밀도와 재현율을 함께 고려한 모델의 전반적인 성능을 의미한다. 정밀도와 재현율의 조화 평균인 F1 Score는 F1_curve에서 확인할 수 있다.&lt;/p&gt;
&lt;p style=&quot;color: #333333; text-align: start;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-filename=&quot;다운로드 (1).jfif&quot; data-origin-width=&quot;224&quot; data-origin-height=&quot;224&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/bI8euQ/btsm8hk7XsY/FTcRKbX5nKKMZwaZP8494K/img.jpg&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/bI8euQ/btsm8hk7XsY/FTcRKbX5nKKMZwaZP8494K/img.jpg&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/bI8euQ/btsm8hk7XsY/FTcRKbX5nKKMZwaZP8494K/img.jpg&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FbI8euQ%2Fbtsm8hk7XsY%2FFTcRKbX5nKKMZwaZP8494K%2Fimg.jpg&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;400&quot; height=&quot;400&quot; data-filename=&quot;다운로드 (1).jfif&quot; data-origin-width=&quot;224&quot; data-origin-height=&quot;224&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - 원숭이 이미지 학습 후 테스트 이미지를 넣었을때 원숭이 객체들을 잡아주는 것을 알 수 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - 지금은 하나의 객체로 하였지만 한 이미지에서 여러 객체 탐지도 동시에 가능하다.&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - 성능이 좋게 나온 결과로 작성하였지만 직접 해보면 성능이 잘 나오지 않는다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - 이미지 분석을 하면서 성능을 높이기 위해 해본 방법들은 이미지 데이터 개수 추가,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; &amp;nbsp; 고품질 이미지 사용, 일관성 있는 라벨링, epochs 늘리기 등이 있다.&amp;nbsp; &amp;nbsp;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; - 이 중 고품질 이미지 사용과 많은 이미지 데이터 사용이 가장 효과가 좋았다.&amp;nbsp;&lt;/p&gt;</description>
      <category>Python/이미지 처리</category>
      <author>seo0seok</author>
      <guid isPermaLink="true">https://bigdata0seok.tistory.com/39</guid>
      <comments>https://bigdata0seok.tistory.com/39#entry39comment</comments>
      <pubDate>Mon, 10 Jul 2023 13:34:43 +0900</pubDate>
    </item>
    <item>
      <title>[작업형2] 중고 자동차 가격 예측하기(회귀)</title>
      <link>https://bigdata0seok.tistory.com/38</link>
      <description>&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;366&quot; data-origin-height=&quot;223&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/blv6IT/btsk9dzhz1d/skzFX2bh15zaIiZZiBnSXK/img.jpg&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/blv6IT/btsk9dzhz1d/skzFX2bh15zaIiZZiBnSXK/img.jpg&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/blv6IT/btsk9dzhz1d/skzFX2bh15zaIiZZiBnSXK/img.jpg&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2Fblv6IT%2Fbtsk9dzhz1d%2FskzFX2bh15zaIiZZiBnSXK%2Fimg.jpg&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;366&quot; height=&quot;223&quot; data-origin-width=&quot;366&quot; data-origin-height=&quot;223&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p style=&quot;text-align: center;&quot; data-ke-size=&quot;size16&quot;&gt;&lt;span style=&quot;color: #ee2323;&quot;&gt;&lt;b&gt;// 퇴근후딴짓 님의 강의를 참고하였습니다. //&lt;/b&gt;&lt;/span&gt;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: center;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&lt;span style=&quot;color: #555555; text-align: start;&quot;&gt;Dataset :&lt;/span&gt;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;fileblock&quot; data-ke-align=&quot;alignCenter&quot;&gt;&lt;a href=&quot;https://blog.kakaocdn.net/dn/btByex/btslboGfRwJ/W5KF9tN4JTW6XzSKnKWgDK/test.csv?attach=1&amp;amp;knm=tfile.csv&quot; class=&quot;&quot;&gt;
    &lt;div class=&quot;image&quot;&gt;&lt;/div&gt;
    &lt;div class=&quot;desc&quot;&gt;&lt;div class=&quot;filename&quot;&gt;&lt;span class=&quot;name&quot;&gt;test.csv&lt;/span&gt;&lt;/div&gt;
&lt;div class=&quot;size&quot;&gt;0.07MB&lt;/div&gt;
&lt;/div&gt;
  &lt;/a&gt;&lt;/figure&gt;
&lt;figure class=&quot;fileblock&quot; data-ke-align=&quot;alignCenter&quot;&gt;&lt;a href=&quot;https://blog.kakaocdn.net/dn/cdAB3H/btslagB4qAQ/u2dORKhr3d5Adf4Y3rDaK0/train.csv?attach=1&amp;amp;knm=tfile.csv&quot; class=&quot;&quot;&gt;
    &lt;div class=&quot;image&quot;&gt;&lt;/div&gt;
    &lt;div class=&quot;desc&quot;&gt;&lt;div class=&quot;filename&quot;&gt;&lt;span class=&quot;name&quot;&gt;train.csv&lt;/span&gt;&lt;/div&gt;
&lt;div class=&quot;size&quot;&gt;0.18MB&lt;/div&gt;
&lt;/div&gt;
  &lt;/a&gt;&lt;/figure&gt;
&lt;figure class=&quot;fileblock&quot; data-ke-align=&quot;alignCenter&quot;&gt;&lt;a href=&quot;https://blog.kakaocdn.net/dn/cmBkoD/btslaQ36aXJ/I9lMqn7d2GATq0D56RImrk/y.csv?attach=1&amp;amp;knm=tfile.csv&quot; class=&quot;&quot;&gt;
    &lt;div class=&quot;image&quot;&gt;&lt;/div&gt;
    &lt;div class=&quot;desc&quot;&gt;&lt;div class=&quot;filename&quot;&gt;&lt;span class=&quot;name&quot;&gt;y.csv&lt;/span&gt;&lt;/div&gt;
&lt;div class=&quot;size&quot;&gt;0.01MB&lt;/div&gt;
&lt;/div&gt;
  &lt;/a&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;문제) 중고 자동차 가격을 예측하여 다음과 같은 형식으로 제출하시오.&lt;/b&gt;&lt;/p&gt;
&lt;ul style=&quot;list-style-type: disc; background-color: #ffffff; color: #212121; text-align: start;&quot; data-ke-list-type=&quot;disc&quot;&gt;
&lt;li&gt;자동차 가격을 예측해주세요!&lt;/li&gt;
&lt;/ul&gt;
&lt;ul style=&quot;list-style-type: disc; background-color: #ffffff; color: #212121; text-align: start;&quot; data-ke-list-type=&quot;disc&quot;&gt;
&lt;li&gt;예측할 값(y): price&lt;/li&gt;
&lt;li&gt;평가: RMSE (Root Mean Squared Error)&lt;/li&gt;
&lt;li&gt;data: train.csv, test.csv&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;76&quot; data-origin-height=&quot;207&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/bN6rZV/btsk84oTmPX/AP1um2Ysy5Gkn8gAxrzAU1/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/bN6rZV/btsk84oTmPX/AP1um2Ysy5Gkn8gAxrzAU1/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/bN6rZV/btsk84oTmPX/AP1um2Ysy5Gkn8gAxrzAU1/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FbN6rZV%2Fbtsk84oTmPX%2FAP1um2Ysy5Gkn8gAxrzAU1%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;76&quot; height=&quot;207&quot; data-origin-width=&quot;76&quot; data-origin-height=&quot;207&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;1. EDA&lt;/b&gt;&lt;/p&gt;
&lt;pre id=&quot;code_1687545547231&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# 데이터 불러오기
import pandas as pd

train = pd.read_csv(&quot;train.csv&quot;)
test = pd.read_csv(&quot;test.csv&quot;)&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;800&quot; data-origin-height=&quot;235&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/cQ22p5/btslbRal0zX/SLH2tcSVuznIy0e3GVxIc0/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/cQ22p5/btslbRal0zX/SLH2tcSVuznIy0e3GVxIc0/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/cQ22p5/btslbRal0zX/SLH2tcSVuznIy0e3GVxIc0/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FcQ22p5%2FbtslbRal0zX%2FSLH2tcSVuznIy0e3GVxIc0%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;800&quot; height=&quot;235&quot; data-origin-width=&quot;800&quot; data-origin-height=&quot;235&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1687545595214&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;train.info()

실행 결과 : 
&amp;lt;class 'pandas.core.frame.DataFrame'&amp;gt;
RangeIndex: 3759 entries, 0 to 3758
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         3759 non-null   object 
 1   year          3759 non-null   int64  
 2   price         3759 non-null   int64  
 3   transmission  3759 non-null   object 
 4   mileage       3759 non-null   int64  
 5   fuelType      3759 non-null   object 
 6   tax           3759 non-null   int64  
 7   mpg           3759 non-null   float64
 8   engineSize    3759 non-null   float64
dtypes: float64(2), int64(4), object(3)&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 6개의 수치형 변수, 3개의 명목형 변수로 이루어져있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1687545675721&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;train.isnull().sum()

실행 결과 : 
model           0
year            0
price           0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64&lt;/code&gt;&lt;/pre&gt;
&lt;pre id=&quot;code_1687545688156&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;test.isnull().sum()

실행 결과 : 
model           0
year            0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; train, test 데이터 모두 결측치는 없다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1687545743173&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;y_train = train.pop(&quot;price&quot;)&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 가격 예측에 사용될 'price' 컬럼은 y_train 변수에 따로 담아두고 train 데이터에서 삭제한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;2. 수치형 활용&lt;/b&gt;&lt;/p&gt;
&lt;pre id=&quot;code_1687545792383&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;cols = ['year', 'mileage', 'tax', 'mpg', 'engineSize']
train = train[cols]
test = test[cols]&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; train, test 데이터에서 수치형 변수들만 선택한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1687545819111&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;from sklearn.model_selection import train_test_split
X_tr, X_val, y_tr, y_val = train_test_split(train, y_train, test_size=0.2, random_state=2022)
X_tr.shape, X_val.shape, y_tr.shape, y_val.shape&lt;/code&gt;&lt;/pre&gt;
&lt;pre id=&quot;code_1687545826569&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
rf.fit(X_tr, y_tr)
pred = rf.predict(X_val)&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 랜덤포레스트 회귀 모형으로 훈련 및 예측을 실시한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1687545832041&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;from sklearn.metrics import mean_squared_error
def rmse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred)**0.5&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 사이킷런의 mean_squared_error는 기본적으로 MSE를 반환하므로, 제곱근을 취해 rmse를 계산하는 함수를 만들어준다. (최신 버전의 사이킷런은 root_mean_squared_error 함수도 제공한다.)&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1687545842753&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;rmse(y_val, pred)

실행 결과 :
1565.0567336921324&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 검증 데이터로 평가 시 1565가 나온다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;3. 수치형 + 범주형 활용&lt;/b&gt;&lt;/p&gt;
&lt;pre id=&quot;code_1687545932241&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;train = pd.get_dummies(train)
test = pd.get_dummies(test)&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 2번에서 train, test를 수치형 컬럼만 남기도록 덮어썼으므로, 범주형 변수를 활용하려면 데이터를 다시 불러와 price를 분리한 뒤 train, test 데이터를 get_dummies 함수를 사용해 원핫 인코딩을 진행한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1687545964314&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;from sklearn.model_selection import train_test_split
X_tr, X_val, y_tr, y_val = train_test_split(train, y_train, test_size=0.2, random_state=2022)
X_tr.shape, X_val.shape, y_tr.shape, y_val.shape&lt;/code&gt;&lt;/pre&gt;
&lt;pre id=&quot;code_1687545971189&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
rf.fit(X_tr, y_tr)
pred = rf.predict(X_val)
rmse(y_val, pred)&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 원핫인코딩 진행 후 평가 진행시 rmse는 1305가 나온다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; rmse는 오차에 기반하기 때문에 숫자가 낮을수록 좋다. 즉, 원핫 인코딩 진행 후 더 좋은 성능이 나온 것이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;4. Test 예측&lt;/b&gt;&lt;/p&gt;
&lt;pre id=&quot;code_1687546073564&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;pred = rf.predict(test)
result = pd.DataFrame({
    'pred':pred
})
result.to_csv(&quot;result.csv&quot;, index=False)&lt;/code&gt;&lt;/pre&gt;
&lt;pre id=&quot;code_1687546081396&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;pd.read_csv('result.csv')&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; &lt;span style=&quot;color: #222222; text-align: start;&quot;&gt;문제에서 제시한 형식대로 'pred' 컬럼에 pred값을 넣어 데이터프레임으로 만든 후 제출&lt;/span&gt;&lt;/p&gt;</description>
      <category>빅데이터분석기사/작업형2</category>
      <author>seo0seok</author>
      <guid isPermaLink="true">https://bigdata0seok.tistory.com/38</guid>
      <comments>https://bigdata0seok.tistory.com/38#entry38comment</comments>
      <pubDate>Sat, 24 Jun 2023 03:49:11 +0900</pubDate>
    </item>
    <item>
      <title>[작업형2] 자동차 시장 세분화(분류)</title>
      <link>https://bigdata0seok.tistory.com/37</link>
      <description>&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;366&quot; data-origin-height=&quot;223&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/PCACn/btslbR2uE7R/SkpPBSfBlkDzElp3F4Y0wk/img.jpg&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/PCACn/btslbR2uE7R/SkpPBSfBlkDzElp3F4Y0wk/img.jpg&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/PCACn/btslbR2uE7R/SkpPBSfBlkDzElp3F4Y0wk/img.jpg&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FPCACn%2FbtslbR2uE7R%2FSkpPBSfBlkDzElp3F4Y0wk%2Fimg.jpg&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;366&quot; height=&quot;223&quot; data-origin-width=&quot;366&quot; data-origin-height=&quot;223&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p style=&quot;text-align: center;&quot; data-ke-size=&quot;size16&quot;&gt;&lt;span style=&quot;color: #ee2323;&quot;&gt;&lt;b&gt;// 퇴근후딴짓 님의 강의를 참고하였습니다. //&lt;/b&gt;&lt;/span&gt;&lt;/p&gt;
&lt;p style=&quot;text-align: center;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p style=&quot;text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&lt;span style=&quot;color: #555555; text-align: start;&quot;&gt;Dataset :&lt;/span&gt;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;fileblock&quot; data-ke-align=&quot;alignCenter&quot;&gt;&lt;a href=&quot;https://blog.kakaocdn.net/dn/m3LTc/btsk8LJUOTj/BH1gq9Ayw5URA2j3k6JqH1/test.csv?attach=1&amp;amp;knm=tfile.csv&quot; class=&quot;&quot;&gt;
    &lt;div class=&quot;image&quot;&gt;&lt;/div&gt;
    &lt;div class=&quot;desc&quot;&gt;&lt;div class=&quot;filename&quot;&gt;&lt;span class=&quot;name&quot;&gt;test.csv&lt;/span&gt;&lt;/div&gt;
&lt;div class=&quot;size&quot;&gt;0.11MB&lt;/div&gt;
&lt;/div&gt;
  &lt;/a&gt;&lt;/figure&gt;
&lt;figure class=&quot;fileblock&quot; data-ke-align=&quot;alignCenter&quot;&gt;&lt;a href=&quot;https://blog.kakaocdn.net/dn/bZHvqn/btsk83cA9va/ZQhuvzdxmy0ryyu0vuCIGk/train.csv?attach=1&amp;amp;knm=tfile.csv&quot; class=&quot;&quot;&gt;
    &lt;div class=&quot;image&quot;&gt;&lt;/div&gt;
    &lt;div class=&quot;desc&quot;&gt;&lt;div class=&quot;filename&quot;&gt;&lt;span class=&quot;name&quot;&gt;train.csv&lt;/span&gt;&lt;/div&gt;
&lt;div class=&quot;size&quot;&gt;0.35MB&lt;/div&gt;
&lt;/div&gt;
  &lt;/a&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;문제) 신규 고객이 어떤 분류에 속할지 예측하여 다음과 같은 형식으로 제출하시오.&lt;/b&gt;&lt;/p&gt;
&lt;ul style=&quot;list-style-type: disc; background-color: #ffffff; color: #212121; text-align: start;&quot; data-ke-list-type=&quot;disc&quot;&gt;
&lt;li&gt;자동차 회사는 새로운 전략을 수립하기 위해 4개의 시장으로 세분화했습니다.&lt;/li&gt;
&lt;li&gt;기존 고객 분류 자료를 바탕으로 신규 고객이 어떤 분류에 속할지 예측해주세요!&lt;/li&gt;
&lt;/ul&gt;
&lt;ul style=&quot;list-style-type: disc; background-color: #ffffff; color: #212121; text-align: start;&quot; data-ke-list-type=&quot;disc&quot;&gt;
&lt;li&gt;예측할 값(y): &quot;Segmentation&quot; (1,2,3,4)&lt;/li&gt;
&lt;li&gt;평가: Macro f1-score&lt;/li&gt;
&lt;li&gt;data: train.csv, test.csv&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;157&quot; data-origin-height=&quot;176&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/bUMnvF/btsla0rXAiU/BCKvyb0xjCzrqHkHktKZiK/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/bUMnvF/btsla0rXAiU/BCKvyb0xjCzrqHkHktKZiK/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/bUMnvF/btsla0rXAiU/BCKvyb0xjCzrqHkHktKZiK/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FbUMnvF%2Fbtsla0rXAiU%2FBCKvyb0xjCzrqHkHktKZiK%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;157&quot; height=&quot;176&quot; data-origin-width=&quot;157&quot; data-origin-height=&quot;176&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;문제1-1) 수치형 변수만 사용 (초급자)&lt;/b&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;1. EDA&lt;/b&gt;&lt;/p&gt;
&lt;pre id=&quot;code_1687543234460&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# 라이브러리 불러오기
import pandas as pd
train = pd.read_csv(&quot;train.csv&quot;)
test = pd.read_csv(&quot;test.csv&quot;)&lt;/code&gt;&lt;/pre&gt;
&lt;pre id=&quot;code_1687543269883&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;print(train.shape, test.shape)
train.head()

실행 결과 : 
(6665, 11) (2154, 10)&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;1258&quot; data-origin-height=&quot;238&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/b9W5GJ/btsldPiVbLn/35wGqzfVG9we23trwEwYAk/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/b9W5GJ/btsldPiVbLn/35wGqzfVG9we23trwEwYAk/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/b9W5GJ/btsldPiVbLn/35wGqzfVG9we23trwEwYAk/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2Fb9W5GJ%2FbtsldPiVbLn%2F35wGqzfVG9we23trwEwYAk%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;1258&quot; height=&quot;238&quot; data-origin-width=&quot;1258&quot; data-origin-height=&quot;238&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 11개의 컬럼으로 이루어져있다. 'Segmentation' 컬럼은 Target 컬럼이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1687543293580&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;train.info()
train.nunique()

실행 결과 :
&amp;lt;class 'pandas.core.frame.DataFrame'&amp;gt;
RangeIndex: 6665 entries, 0 to 6664
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               6665 non-null   int64  
 1   Gender           6665 non-null   object 
 2   Ever_Married     6665 non-null   object 
 3   Age              6665 non-null   int64  
 4   Graduated        6665 non-null   object 
 5   Profession       6665 non-null   object 
 6   Work_Experience  6665 non-null   float64
 7   Spending_Score   6665 non-null   object 
 8   Family_Size      6665 non-null   float64
 9   Var_1            6665 non-null   object 
 10  Segmentation     6665 non-null   int64  
dtypes: float64(2), int64(3), object(6)

ID                 6665
Gender                2
Ever_Married          2
Age                  67
Graduated             2
Profession            9
Work_Experience      15
Spending_Score        3
Family_Size           9
Var_1                 7
Segmentation          4
dtype: int64&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 수치형 컬럼은 5개가 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1687543449359&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;train.isnull().sum()

실행 결과 :
ID                 0
Gender             0
Ever_Married       0
Age                0
Graduated          0
Profession         0
Work_Experience    0
Spending_Score     0
Family_Size        0
Var_1              0
Segmentation       0
dtype: int64&lt;/code&gt;&lt;/pre&gt;
&lt;pre id=&quot;code_1687543462335&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;test.isnull().sum()

실행 결과 : 
ID                 0
Gender             0
Ever_Married       0
Age                0
Graduated          0
Profession         0
Work_Experience    0
Spending_Score     0
Family_Size        0
Var_1              0
dtype: int64&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; train, test 데이터 모두 결측치는 없다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;2. 전처리&lt;/b&gt;&lt;/p&gt;
&lt;pre id=&quot;code_1687543577553&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# target(y, label) 값 복사
target = train.pop('Segmentation')
target

실행 결과 :
0       4
1       2
2       2
3       3
4       3
       ..
6660    2
6661    4
6662    4
6663    2
6664    2
Name: Segmentation, Length: 6665, dtype: int64&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; Target으로 사용할 'Segmentation' 컬럼은 train 데이터에서 pop() 함수를 사용해 따로 뽑아두고 데이터에서는 삭제한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1687543635252&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# test데이터 ID 복사
test_ID = test.pop('ID')&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 제출용 데이터프레임에 사용할 'ID' 컬럼을 'test_ID' 변수에 담아 두고 데이터에서는 삭제한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1687543697017&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# 수치형 컬럼(train)
# ['ID', 'Age', 'Work_Experience', 'Family_Size', 'Segmentation']
num_cols = ['Age', 'Work_Experience', 'Family_Size']
train = train[num_cols]
test = test[num_cols]&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 수치형 컬럼 5개 중 'ID' 컬럼은 사용하지 않아 미포함시키고, 'Segmentation' 컬럼은 Target으로 미포함시킨다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;3. model 학습 및 예측&lt;/b&gt;&lt;/p&gt;
&lt;pre id=&quot;code_1687543760197&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# 모델 선택 및 학습
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=0)
rf.fit(train, target)
pred = rf.predict(test)
pred

실행 결과 : 
array([2, 3, 3, ..., 4, 3, 1])&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 랜덤포레스트 모델로 train 데이터와 target을 학습 시키고 test 데이터를 예측하면 1~4로 분류된다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;4. csv파일 제출&lt;/b&gt;&lt;/p&gt;
&lt;pre id=&quot;code_1687543845629&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;pd.DataFrame({'ID': test_ID, 'Segmentation': pred}).to_csv('수험번호.csv', index=False)&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 문제에서 제시한 형식대로 'ID'와 'Segmentation' 컬럼에 test_ID와 pred값을 넣어 데이터프레임으로 만든 후 제출&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;문제1-2) 범주형(카테고리) 활용 (중급자)&lt;/b&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;1. EDA&lt;/b&gt;&lt;/p&gt;
&lt;pre id=&quot;code_1687543933197&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# 라이브러리 불러오기
import pandas as pd
train = pd.read_csv(&quot;train.csv&quot;)
test = pd.read_csv(&quot;test.csv&quot;)&lt;/code&gt;&lt;/pre&gt;
&lt;pre id=&quot;code_1687543953539&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;train.info()

실행 결과 : 
&amp;lt;class 'pandas.core.frame.DataFrame'&amp;gt;
RangeIndex: 6665 entries, 0 to 6664
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               6665 non-null   int64  
 1   Gender           6665 non-null   object 
 2   Ever_Married     6665 non-null   object 
 3   Age              6665 non-null   int64  
 4   Graduated        6665 non-null   object 
 5   Profession       6665 non-null   object 
 6   Work_Experience  6665 non-null   float64
 7   Spending_Score   6665 non-null   object 
 8   Family_Size      6665 non-null   float64
 9   Var_1            6665 non-null   object 
 10  Segmentation     6665 non-null   int64  
dtypes: float64(2), int64(3), object(6)&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 6개의 object형 컬럼이 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;2. 전처리&lt;/b&gt;&lt;/p&gt;
&lt;pre id=&quot;code_1687543988870&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# 원핫 인코딩
train = pd.get_dummies(train)
test = pd.get_dummies(test)&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; train, test 데이터를 get_dummies() 함수를 사용해 원핫 인코딩을 진행한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1687544021287&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# target(y, label) 값 복사
target = train.pop('Segmentation')
target&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr;&amp;nbsp;Target으로 사용할 'Segmentation' 컬럼은 train 데이터에서 pop() 함수를 사용해 따로 뽑아두고 데이터에서는 삭제한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1687544058552&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;train = train.drop(&quot;ID&quot;, axis=1)
test_ID = test.pop('ID')&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; train 데이터에서 사용하지 않는 'ID' 컬럼을 삭제한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 제출용 데이터프레임에 사용할 'ID' 컬럼을 'test_ID' 변수에 담아 두고 데이터에서는 삭제한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;3. model 학습 및 예측&lt;/b&gt;&lt;/p&gt;
&lt;pre id=&quot;code_1687544111560&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# 모델 선택 및 학습
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=0)
rf.fit(train, target)
pred = rf.predict(test)
pred

실행 결과 :
array([1, 3, 3, ..., 2, 3, 4])&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 랜덤포레스트 모델로 train 데이터와 target을 학습 시키고 test 데이터를 예측하면 1~4로 분류된다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;4. csv파일 제출&lt;/b&gt;&lt;/p&gt;
&lt;pre id=&quot;code_1687544155413&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;pd.DataFrame({'ID': test_ID, 'Segmentation': pred}).to_csv('수험번호.csv', index=False)&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 문제에서 제시한 형식대로 'ID'와 'Segmentation' 컬럼에 test_ID와 pred값을 넣어 데이터프레임으로 만든 후 제출&lt;/p&gt;</description>
      <category>빅데이터분석기사/작업형2</category>
      <author>seo0seok</author>
      <guid isPermaLink="true">https://bigdata0seok.tistory.com/37</guid>
      <comments>https://bigdata0seok.tistory.com/37#entry37comment</comments>
      <pubDate>Sat, 24 Jun 2023 03:17:11 +0900</pubDate>
    </item>
    <item>
      <title>[작업형1] 조건 / 평균 / bmi / 절대값 / 내림차순 구하기</title>
      <link>https://bigdata0seok.tistory.com/36</link>
      <description>&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-filename=&quot;[꾸미기]빅데이터분석기사.jpg&quot; data-origin-width=&quot;366&quot; data-origin-height=&quot;223&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/m123C/btslaghMoFM/urgREe4k4fIto2ATRWBdM1/img.jpg&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/m123C/btslaghMoFM/urgREe4k4fIto2ATRWBdM1/img.jpg&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/m123C/btslaghMoFM/urgREe4k4fIto2ATRWBdM1/img.jpg&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2Fm123C%2FbtslaghMoFM%2FurgREe4k4fIto2ATRWBdM1%2Fimg.jpg&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;366&quot; height=&quot;223&quot; data-filename=&quot;[꾸미기]빅데이터분석기사.jpg&quot; data-origin-width=&quot;366&quot; data-origin-height=&quot;223&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p style=&quot;text-align: center;&quot; data-ke-size=&quot;size16&quot;&gt;&lt;span style=&quot;color: #ee2323;&quot;&gt;&lt;b&gt;// 퇴근후딴짓 님의 강의를 참고하였습니다. //&lt;/b&gt;&lt;/span&gt;&lt;/p&gt;
&lt;p style=&quot;color: #555555; text-align: start;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p style=&quot;color: #555555; text-align: start;&quot; data-ke-size=&quot;size16&quot;&gt;&lt;span style=&quot;color: #555555; text-align: start;&quot;&gt;Dataset :&lt;/span&gt;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;fileblock&quot; data-ke-align=&quot;alignCenter&quot;&gt;&lt;a href=&quot;https://blog.kakaocdn.net/dn/ddItGg/btsk83p2LOI/pVuLZNdaKUBNgRE4tkl4K0/5-1price.csv?attach=1&amp;amp;knm=tfile.csv&quot; class=&quot;&quot;&gt;
    &lt;div class=&quot;image&quot;&gt;&lt;/div&gt;
    &lt;div class=&quot;desc&quot;&gt;&lt;div class=&quot;filename&quot;&gt;&lt;span class=&quot;name&quot;&gt;5-1price.csv&lt;/span&gt;&lt;/div&gt;
&lt;div class=&quot;size&quot;&gt;0.14MB&lt;/div&gt;
&lt;/div&gt;
  &lt;/a&gt;&lt;/figure&gt;
&lt;figure class=&quot;fileblock&quot; data-ke-align=&quot;alignCenter&quot;&gt;&lt;a href=&quot;https://blog.kakaocdn.net/dn/L3fk7/btslboMXR7e/RHgSLwfziseAGHF5qLysVK/5-2bmi.csv?attach=1&amp;amp;knm=tfile.csv&quot; class=&quot;&quot;&gt;
    &lt;div class=&quot;image&quot;&gt;&lt;/div&gt;
    &lt;div class=&quot;desc&quot;&gt;&lt;div class=&quot;filename&quot;&gt;&lt;span class=&quot;name&quot;&gt;5-2bmi.csv&lt;/span&gt;&lt;/div&gt;
&lt;div class=&quot;size&quot;&gt;0.16MB&lt;/div&gt;
&lt;/div&gt;
  &lt;/a&gt;&lt;/figure&gt;
&lt;figure class=&quot;fileblock&quot; data-ke-align=&quot;alignCenter&quot;&gt;&lt;a href=&quot;https://blog.kakaocdn.net/dn/dMGN7B/btslcDJFoMp/JPtr3GuBk9vQB8lirX83Tk/5-3student.csv?attach=1&amp;amp;knm=tfile.csv&quot; class=&quot;&quot;&gt;
    &lt;div class=&quot;image&quot;&gt;&lt;/div&gt;
    &lt;div class=&quot;desc&quot;&gt;&lt;div class=&quot;filename&quot;&gt;&lt;span class=&quot;name&quot;&gt;5-3student.csv&lt;/span&gt;&lt;/div&gt;
&lt;div class=&quot;size&quot;&gt;1.44MB&lt;/div&gt;
&lt;/div&gt;
  &lt;/a&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;문제1) 종량제 봉투 종류가 '규격봉투'이고, 종량제 봉투 용도가 '음식물쓰레기'인 2L가격 평균을 출력하시오.&lt;/b&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;(단, 가격0 제외, 반올림 후 정수 출력)&lt;/b&gt;&lt;/p&gt;
&lt;pre id=&quot;code_1687535870383&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;import pandas as pd
df = pd.read_csv(&quot;5-1price.csv&quot;)
df&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;1930&quot; data-origin-height=&quot;753&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/Wsf9E/btslaeYEJtp/JWRhL4xziNcZLgDkxIR4Y0/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/Wsf9E/btslaeYEJtp/JWRhL4xziNcZLgDkxIR4Y0/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/Wsf9E/btslaeYEJtp/JWRhL4xziNcZLgDkxIR4Y0/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FWsf9E%2FbtslaeYEJtp%2FJWRhL4xziNcZLgDkxIR4Y0%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;1930&quot; height=&quot;753&quot; data-origin-width=&quot;1930&quot; data-origin-height=&quot;753&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1687536928538&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;cond1 = df['2ℓ가격'] != 0
cond2 = df['종량제봉투종류'] == &quot;규격봉투&quot;
cond3 = df['종량제봉투용도'] == &quot;음식물쓰레기&quot;
df = df[cond1 &amp;amp; cond2 &amp;amp; cond3]

print(round(df['2ℓ가격'].mean()))

실행 결과 : 
118&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; '2L가격'이 '0원'이 아닌 조건1 / '종량제봉투종류'가 '규격봉투'인 조건2 / '종량제봉투용도'가 '음식물쓰레기'인 조건3&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; &amp;nbsp; 세 가지 조건을 만든다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 세 가지 조건을 모두 만족하는 '2L가격' 컬럼의 평균을 mean() 함수로 구하고 round() 함수로 반올림하여 출력한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;문제2)&lt;span&gt; &lt;span style=&quot;background-color: #ffffff; color: #212121; text-align: left;&quot;&gt;bmi를 계산하고, 수치가 정상인 사람 수와 위험체중인 사람 수의 차이를 절대값으로 구하시오. (정수로 출력)&lt;/span&gt;&lt;/span&gt;&lt;/b&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; &amp;nbsp; - bmi(체질량지수): 몸무게(kg) / 키(m)의 제곱 단위 -&lt;/p&gt;
&lt;ul style=&quot;list-style-type: disc; background-color: #ffffff; color: #212121; text-align: start;&quot; data-ke-list-type=&quot;disc&quot;&gt;
&lt;li&gt;Height: cm&lt;/li&gt;
&lt;li&gt;Weight: kg&lt;/li&gt;
&lt;/ul&gt;
&lt;ul style=&quot;list-style-type: disc; background-color: #ffffff; color: #212121; text-align: start;&quot; data-ke-list-type=&quot;disc&quot;&gt;
&lt;li&gt;저체중: BMI 18.5미만&lt;/li&gt;
&lt;li&gt;정상체중: BMI 18.5이상 ~ 23미만&lt;/li&gt;
&lt;li&gt;과체중 또는 위험체중: BMI 23 이상 ~ 25미만&lt;/li&gt;
&lt;li&gt;비만체중: 25이상&lt;/li&gt;
&lt;/ul&gt;
&lt;pre id=&quot;code_1687535917982&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;import pandas as pd
df = pd.read_csv(&quot;5-2bmi.csv&quot;)
df&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;306&quot; data-origin-height=&quot;517&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/zIKXH/btsk83p2RI5/HklZ98E9DpZBw4gqtg69b0/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/zIKXH/btsk83p2RI5/HklZ98E9DpZBw4gqtg69b0/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/zIKXH/btsk83p2RI5/HklZ98E9DpZBw4gqtg69b0/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FzIKXH%2Fbtsk83p2RI5%2FHklZ98E9DpZBw4gqtg69b0%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;306&quot; height=&quot;517&quot; data-origin-width=&quot;306&quot; data-origin-height=&quot;517&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1687537113437&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;df['bmi'] = df['Weight'] / (df['Height']/100)**2
cond1 = (df['bmi'] &amp;gt;= 18.5) &amp;amp; (df['bmi'] &amp;lt; 23)
cond2 = (df['bmi'] &amp;gt;= 23) &amp;amp; (df['bmi'] &amp;lt; 25)
print(abs(len(df[cond1]) - len(df[cond2])))

실행 결과 :
144&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; Weight와 Height를 이용해 bmi를 구한다. (키는 cm로 되어있어 100으로 나눠 m로 만들어 줘야한다.)&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; bmi가 정상인 조건1 / bmi가 과체중인 조건2를 만든다.&lt;/p&gt;
&lt;p style=&quot;color: #333333; text-align: start;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 정상인 사람 수와 과체중인 사람의 수를 len() 함수를 사용해 구하고 뺀 후 abs() 함수를 사용해 절대값을 씌워 출력한다.&lt;/p&gt;
&lt;p style=&quot;color: #333333; text-align: start;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;문제3) &lt;span&gt;&lt;/span&gt;순전입학생(순전입 학생 = 전입 학생 - 전출 학생)이 가장 많은 학교의 전체학생수를 정수로 출력하시오.&lt;/b&gt;&lt;/p&gt;
&lt;pre id=&quot;code_1687535936240&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;import pandas as pd
df = pd.read_csv(&quot;5-3student.csv&quot;)
df&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;1947&quot; data-origin-height=&quot;978&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/nE3r2/btslbwjPvc7/JvjnWtklbblFN9hmdBDrq1/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/nE3r2/btslbwjPvc7/JvjnWtklbblFN9hmdBDrq1/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/nE3r2/btslbwjPvc7/JvjnWtklbblFN9hmdBDrq1/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FnE3r2%2FbtslbwjPvc7%2FJvjnWtklbblFN9hmdBDrq1%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;1947&quot; height=&quot;978&quot; data-origin-width=&quot;1947&quot; data-origin-height=&quot;978&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1687537308829&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;df['순전입'] = df['전입학생수(계)'] - df['전출학생수(계)']
df = df.sort_values('순전입', ascending=False)
print(int(df.iloc[0,-2]))

실행 결과 :
230&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; '전입학생수'와 '전출학생수'를 이용해 '순전입'을 구한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; '순전입' 컬럼을 sort_values(ascending=False) 함수를 사용해 내림차순으로 정렬한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; iloc[0, -2] 인덱싱으로 첫 번째 행의 '전체학생수' 컬럼을 선택한 후 int를 씌워 출력하면 230명이 나온다.&lt;/p&gt;</description>
      <category>빅데이터분석기사/작업형1</category>
      <author>seo0seok</author>
      <guid isPermaLink="true">https://bigdata0seok.tistory.com/36</guid>
      <comments>https://bigdata0seok.tistory.com/36#entry36comment</comments>
      <pubDate>Sat, 24 Jun 2023 01:25:21 +0900</pubDate>
    </item>
    <item>
      <title>[작업형1] 사분위수 / 절대값 / 조건 / datetime 변환 구하기</title>
      <link>https://bigdata0seok.tistory.com/35</link>
      <description>&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-filename=&quot;[꾸미기]빅데이터분석기사.jpg&quot; data-origin-width=&quot;366&quot; data-origin-height=&quot;223&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/BQMlL/btsk9DYHZcJ/1YbKTkYv39rZ1LTrog1a20/img.jpg&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/BQMlL/btsk9DYHZcJ/1YbKTkYv39rZ1LTrog1a20/img.jpg&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/BQMlL/btsk9DYHZcJ/1YbKTkYv39rZ1LTrog1a20/img.jpg&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FBQMlL%2Fbtsk9DYHZcJ%2F1YbKTkYv39rZ1LTrog1a20%2Fimg.jpg&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;366&quot; height=&quot;223&quot; data-filename=&quot;[꾸미기]빅데이터분석기사.jpg&quot; data-origin-width=&quot;366&quot; data-origin-height=&quot;223&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p style=&quot;text-align: center;&quot; data-ke-size=&quot;size16&quot;&gt;&lt;span style=&quot;color: #ee2323;&quot;&gt;&lt;b&gt;// 퇴근후딴짓 님의 강의를 참고하였습니다. //&lt;/b&gt;&lt;/span&gt;&lt;/p&gt;
&lt;p style=&quot;color: #555555; text-align: start;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p style=&quot;color: #555555; text-align: start;&quot; data-ke-size=&quot;size16&quot;&gt;&lt;span style=&quot;color: #555555; text-align: start;&quot;&gt;Dataset :&lt;/span&gt;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;fileblock&quot; data-ke-align=&quot;alignCenter&quot;&gt;&lt;a href=&quot;https://blog.kakaocdn.net/dn/c8tsbw/btslbQiaftk/x7KRgWIUolwzjlUcOlhXT1/basic1.csv?attach=1&amp;amp;knm=tfile.csv&quot; class=&quot;&quot;&gt;
    &lt;div class=&quot;image&quot;&gt;&lt;/div&gt;
    &lt;div class=&quot;desc&quot;&gt;&lt;div class=&quot;filename&quot;&gt;&lt;span class=&quot;name&quot;&gt;basic1.csv&lt;/span&gt;&lt;/div&gt;
&lt;div class=&quot;size&quot;&gt;0.00MB&lt;/div&gt;
&lt;/div&gt;
  &lt;/a&gt;&lt;/figure&gt;
&lt;figure class=&quot;fileblock&quot; data-ke-align=&quot;alignCenter&quot;&gt;&lt;a href=&quot;https://blog.kakaocdn.net/dn/4sXXN/btsk9kLJVDt/nw5UpkmYrNyjb0zflBmBQ1/fb.csv?attach=1&amp;amp;knm=tfile.csv&quot; class=&quot;&quot;&gt;
    &lt;div class=&quot;image&quot;&gt;&lt;/div&gt;
    &lt;div class=&quot;desc&quot;&gt;&lt;div class=&quot;filename&quot;&gt;&lt;span class=&quot;name&quot;&gt;fb.csv&lt;/span&gt;&lt;/div&gt;
&lt;div class=&quot;size&quot;&gt;0.23MB&lt;/div&gt;
&lt;/div&gt;
  &lt;/a&gt;&lt;/figure&gt;
&lt;figure class=&quot;fileblock&quot; data-ke-align=&quot;alignCenter&quot;&gt;&lt;a href=&quot;https://blog.kakaocdn.net/dn/mUc7W/btslaJDXEB4/jVezunIIZXZ34MkRReubmk/nf.csv?attach=1&amp;amp;knm=tfile.csv&quot; class=&quot;&quot;&gt;
    &lt;div class=&quot;image&quot;&gt;&lt;/div&gt;
    &lt;div class=&quot;desc&quot;&gt;&lt;div class=&quot;filename&quot;&gt;&lt;span class=&quot;name&quot;&gt;nf.csv&lt;/span&gt;&lt;/div&gt;
&lt;div class=&quot;size&quot;&gt;2.02MB&lt;/div&gt;
&lt;/div&gt;
  &lt;/a&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;문제1) &lt;span style=&quot;background-color: #ffffff; color: #212121; text-align: start;&quot;&gt;age 컬럼의 3사분위수와 1사분위수의 차를 절대값으로 구하고, 소수점 버려서, 정수로 출력하시오.&lt;/span&gt;&lt;/b&gt;&lt;/p&gt;
&lt;pre id=&quot;code_1687534595857&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;import pandas as pd
df = pd.read_csv(&quot;basic1.csv&quot;)
df&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;528&quot; data-origin-height=&quot;508&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/U3Pct/btsk84PYTU7/2MoMx0olh43hsSJMAOBk4k/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/U3Pct/btsk84PYTU7/2MoMx0olh43hsSJMAOBk4k/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/U3Pct/btsk84PYTU7/2MoMx0olh43hsSJMAOBk4k/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FU3Pct%2Fbtsk84PYTU7%2F2MoMx0olh43hsSJMAOBk4k%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;528&quot; height=&quot;508&quot; data-origin-width=&quot;528&quot; data-origin-height=&quot;508&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1687534650290&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;result = abs(df['age'].quantile(0.25) - df['age'].quantile(0.75))
print(int(result))

실행 결과 : 
50&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; quantile() 함수를 사용해 3사분위수와 1사분위수를 구한 후 두 값의 차이에 abs() 함수를 사용해 절대값을 씌워준다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 정수로 출력하기 위해 int() 함수를 사용해 출력하면 50이 나온다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;문제2) &lt;span style=&quot;background-color: #ffffff; color: #212121; text-align: start;&quot;&gt;(loves반응+wows반응)/(reactions반응) 비율이 0.4보다 크고 0.5보다 작으면서, status_type=='video'인 데이터의 갯수를 구하시오.&lt;/span&gt;&lt;/b&gt;&lt;/p&gt;
&lt;pre id=&quot;code_1687534716491&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;import pandas as pd
df = pd.read_csv(&quot;fb.csv&quot;)
df&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;861&quot; data-origin-height=&quot;513&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/evkLTF/btsk9cAlhTa/84GqYiT2FmkW0rgjnN2mGK/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/evkLTF/btsk9cAlhTa/84GqYiT2FmkW0rgjnN2mGK/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/evkLTF/btsk9cAlhTa/84GqYiT2FmkW0rgjnN2mGK/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FevkLTF%2Fbtsk9cAlhTa%2F84GqYiT2FmkW0rgjnN2mGK%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;861&quot; height=&quot;513&quot; data-origin-width=&quot;861&quot; data-origin-height=&quot;513&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1687534755730&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;cond1 = (df['loves'] + df['wows'])/ df['reactions'] &amp;gt; 0.4
cond2 = (df['loves'] + df['wows'])/ df['reactions'] &amp;lt; 0.5
cond3 = df['type'] == 'video'

print(len(df[cond1 &amp;amp; cond2 &amp;amp; cond3]))

실행 결과 : 
90&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 비율이 0.4보다 큰 조건1 / 비율이 0.5보다 작은 조건2 / 'type' 컬럼이 'video'인 조건3 세 가지를 만든다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 조건 세 가지를 모두 만족하는 개수를 len()함수를 사용하여 출력하면 90개이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;문제3) &lt;span style=&quot;background-color: #ffffff; color: #212121; text-align: start;&quot;&gt;date_added가 2018년 1월 이면서 country가 United Kingdom 단독 제작인 데이터의 갯수를 구하시오.&lt;/span&gt;&lt;/b&gt;&lt;/p&gt;
&lt;pre id=&quot;code_1687534851264&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;import pandas as pd
df = pd.read_csv(&quot;nf.csv&quot;)
df&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;2050&quot; data-origin-height=&quot;510&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/KqTaz/btsk81MAktM/k90w9T45VxVf1H7JZp73Kk/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/KqTaz/btsk81MAktM/k90w9T45VxVf1H7JZp73Kk/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/KqTaz/btsk81MAktM/k90w9T45VxVf1H7JZp73Kk/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FKqTaz%2Fbtsk81MAktM%2Fk90w9T45VxVf1H7JZp73Kk%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;2050&quot; height=&quot;510&quot; data-origin-width=&quot;2050&quot; data-origin-height=&quot;510&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1687534914867&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# 풀이1
cond1 = df['country'] == &quot;United Kingdom&quot;

df['date_added'] = pd.to_datetime(df['date_added'])
df['year'] = df['date_added'].dt.year
df['month'] = df['date_added'].dt.month


cond2 = df['year'] == 2018
cond3 = df['month'] == 1

print(len(df[cond1 &amp;amp; cond2 &amp;amp; cond3]))

실행 결과 : 
6&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 'object'형인 'date_added' 컬럼을 pd.to_datetime() 함수를 사용해 datetime형으로 변환해준다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; dt.year / dt.month 함수를 사용해 년 / 월 컬럼을 만들어준다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 'country' 컬럼이 'United Kingdom'인 조건1을 만든다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 년도가 2018년인 조건2, 월이 1월인 조건3을 만든다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 조건 3가지를 모두 만족하는 개수를 len함수를 사용하여 출력하면 6개이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1687534932435&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# 풀이2
cond1 = df['country'] == &quot;United Kingdom&quot;

df['date_added'] = pd.to_datetime(df['date_added'])

cond2 = df['date_added'] &amp;gt;= '2018-1-1'
cond3 = df['date_added'] &amp;lt;= '2018-1-31'

print(len(df[cond1 &amp;amp; cond2 &amp;amp; cond3]))

실행 결과 : 
6&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 'object'형인 'date_added' 컬럼을 pd.to_datetime() 함수를 사용해 datetime형으로 변환해준다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 조건을 부등호를 사용해 '2018-1-1' ~ '2018-1-31' 인 날짜를 만든다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1687534949803&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# 풀이3

cond1 = df['country'] == &quot;United Kingdom&quot;
df['date_added'] = pd.to_datetime(df['date_added'])
cond2 = df['date_added'].between('2018-1-1', '2018-1-31')
print(len(df[cond1 &amp;amp; cond2]))

실행 결과 : 6&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 'object'형인 'date_added' 컬럼을 pd.to_datetime() 함수를 사용해 datetime형으로 변환해준다.&lt;/p&gt;
&lt;p style=&quot;color: #333333; text-align: start;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 조건을 between() 함수를 사용해 '2018-1-1' ~ '2018-1-31' 인 날짜를 만든다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1687534973827&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# 풀이4
cond1 = df['country'] == &quot;United Kingdom&quot;

df['date_added'] = df['date_added'].fillna(&quot;&quot;)

str1 = &quot;2018&quot;
str2 = &quot;January&quot;
cond2 = df['date_added'].str.contains(str1)
cond3 = df['date_added'].str.contains(str2)

print(len(df[cond1 &amp;amp; cond2 &amp;amp; cond3]))

실행 결과 : 6&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 문자열('object'형) 상태인 'date_added' 컬럼의 결측치를 fillna(&quot;&quot;) 함수를 사용해 빈 문자열로 채워준다. (결측치가 남아 있으면 str.contains() 결과에 NaN이 섞여 조건으로 사용하기 어렵다.)&lt;/p&gt;
&lt;p style=&quot;color: #333333; text-align: start;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 조건을 str.contains()함수를 사용해 '2018년'과 'January(1월)'이 포함된 날짜를 만든다.&lt;/p&gt;
&lt;p style=&quot;color: #333333; text-align: start;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;추가 문제)&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;span style=&quot;background-color: #ffffff; color: #212121; text-align: start;&quot;&gt;&lt;span style=&quot;background-color: #ffffff; color: #212121; text-align: start;&quot;&gt;만약 'country' 컬럼에 대소문자가 함께 있고, 띄어쓰기가 있는 것도 있고 없는 것도 있다면?&lt;/span&gt;&lt;/span&gt;&lt;/b&gt;&lt;/p&gt;
&lt;pre id=&quot;code_1687535021842&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# 띄어쓰기 제거
df['country'] = df['country'].str.replace(' ','')

# 소문자로 변경
df['country'] = df['country'].str.lower()
df['country']

cond1 = df['country'] == &quot;unitedkingdom&quot;

df['date_added'] = pd.to_datetime(df['date_added'])
df['year'] = df['date_added'].dt.year
df['month'] = df['date_added'].dt.month


cond2 = df['year'] == 2018
cond3 = df['month'] == 1

print(len(df[cond1 &amp;amp; cond2 &amp;amp; cond3]))

실행 결과 : 
6&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 'country' 컬럼에 띄어쓰기가 있다면 str.replace(' ', '') 함수를 사용해 띄어쓰기를 제거 해준다.&lt;/p&gt;
&lt;p style=&quot;color: #333333; text-align: start;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 'country' 컬럼에 대소문자가 섞여있어 소문자로 변경하고 싶다면 str.lower() 함수를 사용하면 된다.&lt;/p&gt;
&lt;p style=&quot;color: #333333; text-align: start;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp;(소문자 -&amp;gt; 대문자 : str.upper() 함수 사용)&lt;/p&gt;</description>
      <category>빅데이터분석기사/작업형1</category>
      <author>seo0seok</author>
      <guid isPermaLink="true">https://bigdata0seok.tistory.com/35</guid>
      <comments>https://bigdata0seok.tistory.com/35#entry35comment</comments>
      <pubDate>Sat, 24 Jun 2023 00:53:57 +0900</pubDate>
    </item>
    <item>
      <title>[작업형3] 회귀모형</title>
      <link>https://bigdata0seok.tistory.com/34</link>
      <description>&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;366&quot; data-origin-height=&quot;223&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/7C18K/btskZsce2lw/4tS2pbUgPKC6wLBSoPrgE1/img.jpg&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/7C18K/btskZsce2lw/4tS2pbUgPKC6wLBSoPrgE1/img.jpg&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/7C18K/btskZsce2lw/4tS2pbUgPKC6wLBSoPrgE1/img.jpg&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2F7C18K%2FbtskZsce2lw%2F4tS2pbUgPKC6wLBSoPrgE1%2Fimg.jpg&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;366&quot; height=&quot;223&quot; data-origin-width=&quot;366&quot; data-origin-height=&quot;223&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p style=&quot;text-align: center;&quot; data-ke-size=&quot;size16&quot;&gt;&lt;span style=&quot;color: #ee2323;&quot;&gt;&lt;b&gt;// 퇴근후딴짓 님의 강의를 참고하였습니다. //&lt;/b&gt;&lt;/span&gt;&lt;/p&gt;
&lt;p style=&quot;text-align: center;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p style=&quot;text-align: center;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p style=&quot;text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&lt;span style=&quot;color: #000000;&quot;&gt;&lt;b&gt;1. 상관계수&lt;/b&gt;&lt;/span&gt;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;- 두 변수 사이의 관계 (1에 가까울 수록 강한 양의 상관관계, -1에 가까울 수록 강한 음의 상관관계)&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;문제1) iris에서 Sepal Length와 Sepal Width의 상관계수 계산하고 반올림 후 소수 둘째자리까지 출력하고, 양의 상관관계인지 음의 상관관계인지 출력하시오.&lt;/b&gt;&lt;/p&gt;
&lt;pre id=&quot;code_1687458738573&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;import pandas as pd
from sklearn.datasets import load_iris

# iris 데이터셋 로드
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;799&quot; data-origin-height=&quot;513&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/BESvo/btsk27xuNlO/0ROpq0GHKWCchkOAVK5LU1/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/BESvo/btsk27xuNlO/0ROpq0GHKWCchkOAVK5LU1/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/BESvo/btsk27xuNlO/0ROpq0GHKWCchkOAVK5LU1/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FBESvo%2Fbtsk27xuNlO%2F0ROpq0GHKWCchkOAVK5LU1%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;799&quot; height=&quot;513&quot; data-origin-width=&quot;799&quot; data-origin-height=&quot;513&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1687458801565&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# Sepal Length와 Sepal Width의 상관계수 계산
correlation = df.corr()
result = correlation.loc['sepal length (cm)', 'sepal width (cm)']
print(round(result,2))

실행 결과 : 
-0.12&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;907&quot; data-origin-height=&quot;215&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/bEwX9h/btsk4NlfdC0/3KbeggmIQgfyE7bKwix2MK/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/bEwX9h/btsk4NlfdC0/3KbeggmIQgfyE7bKwix2MK/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/bEwX9h/btsk4NlfdC0/3KbeggmIQgfyE7bKwix2MK/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FbEwX9h%2Fbtsk4NlfdC0%2F3KbeggmIQgfyE7bKwix2MK%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;907&quot; height=&quot;215&quot; data-origin-width=&quot;907&quot; data-origin-height=&quot;215&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;rarr; corr() 함수를 통해 상관계수를 구한다.&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 문제의 'sepal length (cm)', 'sepal width (cm)' 의 상관계수를 loc[] 인덱서를 사용해 직접 뽑아낸다.&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;rarr; -0.12로 약한 음의 상관관계를 나타내는 것을 알 수 있다.&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;문제2) iris 데이터에서 sepal length (cm)[독립변수]를 통해 sepal width (cm)[종속변수]를 예측하려고 한다. &lt;/b&gt;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;단순선형 회귀모형을 만들고 기울기와 절편을 구하시오.&lt;/b&gt;&lt;/p&gt;
&lt;pre id=&quot;code_1687458984011&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(df[['sepal length (cm)']], df['sepal width (cm)'])

print(lr.coef_[0]) # 기울기
print(lr.intercept_) # 절편

실행 결과 :
-0.06188479796414415
3.418946836103816&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 먼저 단순선형 회귀모형을 만든 후 fit()함수 안에 독립변수 'sepal length (cm)', 종속변수 'sepal width (cm)'를 넣어준다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 기울기는 .coef_[0]을 통해 구하면 약 -0.06 / 절편은 .intercept_를 통해 구하면 약 3.42가 나오게 된다.&lt;/p&gt;</description>
      <category>빅데이터분석기사/작업형3</category>
      <author>seo0seok</author>
      <guid isPermaLink="true">https://bigdata0seok.tistory.com/34</guid>
      <comments>https://bigdata0seok.tistory.com/34#entry34comment</comments>
      <pubDate>Fri, 23 Jun 2023 03:39:08 +0900</pubDate>
    </item>
    <item>
      <title>[작업형3] 일원배치법</title>
      <link>https://bigdata0seok.tistory.com/33</link>
      <description>&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;366&quot; data-origin-height=&quot;223&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/cmFqY2/btsk4ORZrpo/6j0u4lKq1Eokl9I5Qk9uz1/img.jpg&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/cmFqY2/btsk4ORZrpo/6j0u4lKq1Eokl9I5Qk9uz1/img.jpg&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/cmFqY2/btsk4ORZrpo/6j0u4lKq1Eokl9I5Qk9uz1/img.jpg&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FcmFqY2%2Fbtsk4ORZrpo%2F6j0u4lKq1Eokl9I5Qk9uz1%2Fimg.jpg&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;366&quot; height=&quot;223&quot; data-origin-width=&quot;366&quot; data-origin-height=&quot;223&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p style=&quot;text-align: center;&quot; data-ke-size=&quot;size16&quot;&gt;&lt;span style=&quot;color: #ee2323;&quot;&gt;&lt;b&gt;// 퇴근후딴짓 님의 강의를 참고하였습니다. //&lt;/b&gt;&lt;/span&gt;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: center;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;1. 일원배치법&lt;/b&gt;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;- 비교할 그룹(집단)이 3개 이상일 때 활용&lt;/p&gt;
&lt;ul style=&quot;list-style-type: disc; background-color: #ffffff; color: #212121; text-align: start;&quot; data-ke-list-type=&quot;disc&quot;&gt;
&lt;li&gt;그룹 간의 평균 차이를 비교하기 위해 사용되는 통계적 검정&lt;/li&gt;
&lt;li&gt;한개의 변수(그룹)에 따라 평균의 차이가 통계적으로 유의미한지 검정&lt;/li&gt;
&lt;li&gt;&lt;span style=&quot;color: #ee2323;&quot;&gt;양측 검정만 있음 (방향이 없음)&lt;/span&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;문제) 빅데이터 분석기사 실기 교육에 세 가지 다른 교육 방법(A, B, C)을 도입하여 수험생들의 실기시험 성적을 비교하고자 합니다. 36명의 학생들을 무작위로 12명씩 세 그룹으로 나누어 교육을 시행한 후, 시험을 실시하였습니다. 다음은 각 그룹 수험생들의 실기시험 성적 데이터입니다. (&lt;span style=&quot;background-color: #ffffff; color: #212121; text-align: start;&quot;&gt;단, 각 그룹의 데이터는 정규성을 만족하고 그룹 간 등분산성을 만족한다.)&lt;/span&gt;&lt;/b&gt;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;316&quot; data-origin-height=&quot;518&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/DcOn4/btsk1BMDpJy/tcgsQHaETazc9lierx0Hf1/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/DcOn4/btsk1BMDpJy/tcgsQHaETazc9lierx0Hf1/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/DcOn4/btsk1BMDpJy/tcgsQHaETazc9lierx0Hf1/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FDcOn4%2Fbtsk1BMDpJy%2FtcgsQHaETazc9lierx0Hf1%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;316&quot; height=&quot;518&quot; data-origin-width=&quot;316&quot; data-origin-height=&quot;518&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;- 귀무가설(H0) : &lt;span style=&quot;background-color: #ffffff; color: #212121; text-align: left;&quot;&gt;세 그룹 간의 평균 실기시험 성적 차이가 없다. (모평균의 차이가 없다)&lt;/span&gt;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;- 대립가설(H1) : &lt;span style=&quot;background-color: #ffffff; color: #212121; text-align: left;&quot;&gt;세 그룹 간의 평균 실기시험 성적 차이가 있다. (모평균이 모두 같지는 않다)&lt;/span&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1687457860222&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# 데이터 생성 (실행)
import pandas as pd
group_A = [78, 85, 92, 88, 76, 81, 80, 79, 83, 89, 91, 87]
group_B = [77, 74, 84, 82, 79, 80, 85, 88, 81, 76, 78, 83]
group_C = [79, 78, 72, 75, 74, 76, 73, 68, 71, 75, 79, 72]

pd.DataFrame({
    'group_A':group_A,
    'group_B':group_B,
    'group_C':group_C
}).to_csv(&quot;oneway.csv&quot;, index=False)&lt;/code&gt;&lt;/pre&gt;
&lt;pre id=&quot;code_1687457866842&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;import pandas as pd
import scipy.stats as stats
df = pd.read_csv('oneway.csv')&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;정규성 검정)&lt;/b&gt;&lt;/p&gt;
&lt;pre id=&quot;code_1687457846048&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# [정규성 검정] shapiro (H0: 정규분포를 따른다. H1: 정규분포를 따르지 않는다.)
# 주어진 데이터가 정규 분포를 따르는지를 확인
print(stats.shapiro(df['group_A']))
print(stats.shapiro(df['group_B']))
print(stats.shapiro(df['group_C']))

실행 결과 :
ShapiroResult(statistic=0.9519118666648865, pvalue=0.6650832295417786)
ShapiroResult(statistic=0.9926441311836243, pvalue=0.9999779462814331)
ShapiroResult(statistic=0.9600766897201538, pvalue=0.7849239110946655)&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 세 컬럼 모두 p-value값이 0.05보다 커 귀무가설을 채택한다. 즉, 정규분포를 따른다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;등분산 검정)&lt;/b&gt;&lt;/p&gt;
&lt;pre id=&quot;code_1687457965769&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# [등분산 검정] levene (H0: 각 그룹 데이터는 등분산을 가진다. H1: 하나 이상의 그룹이 등분산을 가지지 않는다.)
# 그룹 간 분산이 동일한지를 확인하기 위한 검정
print(stats.levene(df['group_A'], df['group_B'], df['group_C']))

실행 결과 :
LeveneResult(statistic=2.437300743889479, pvalue=0.10297619038422344)&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 세 컬럼의 p-value값이 0.05보다 커 귀무가설을 채택한다. 즉, 등분산이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;문제1) F값 (반올림하여 소수 둘째자리까지 계산)&lt;/b&gt;&lt;/p&gt;
&lt;pre id=&quot;code_1687458015498&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# 일원배치법 수행
f_statistic, p_value = stats.f_oneway(df['group_A'], df['group_B'], df['group_C'])&lt;/code&gt;&lt;/pre&gt;
&lt;pre id=&quot;code_1687458043356&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# F-value
print(round(f_statistic,2))

실행 결과 : 
15.57&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; stats.f_oneway 함수에 세 컬럼을 넣어 f_statistic을 출력하면 15.57이 나온다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;문제2) P값 (반올림하여 소수 여섯째자리까지 계산)&lt;/b&gt;&lt;/p&gt;
&lt;pre id=&quot;code_1687458060363&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# p-value
print(format(p_value,'.6f'))

실행 결과 : 0.000017&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; stats.f_oneway 함수에 세 컬럼을 넣어 p-value를 출력하면 0.000017이 나온다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;문제3) 검정 결과 (유의수준 0.05 하)&lt;/b&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;span style=&quot;color: #000000;&quot;&gt;&amp;rarr; pvalue값이 0.000017로 유의수준 0.05보다 낮아 귀무가설을 기각한다.&lt;/span&gt;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&lt;span style=&quot;color: #000000;&quot;&gt;&amp;rarr; 즉, 대립가설이 채택되므로 교육방법에 따라 시험결과의 차이가 있다고 할 수 있다.&lt;/span&gt;&lt;/p&gt;</description>
      <category>빅데이터분석기사/작업형3</category>
      <author>seo0seok</author>
      <guid isPermaLink="true">https://bigdata0seok.tistory.com/33</guid>
      <comments>https://bigdata0seok.tistory.com/33#entry33comment</comments>
      <pubDate>Fri, 23 Jun 2023 03:23:41 +0900</pubDate>
    </item>
    <item>
      <title>[작업형3] 독립성 검정(카이제곱)</title>
      <link>https://bigdata0seok.tistory.com/32</link>
      <description>&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;366&quot; data-origin-height=&quot;223&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/ojJR6/btsk1BskIFM/mvMKwGpPZdT6iz9SPAcsKK/img.jpg&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/ojJR6/btsk1BskIFM/mvMKwGpPZdT6iz9SPAcsKK/img.jpg&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/ojJR6/btsk1BskIFM/mvMKwGpPZdT6iz9SPAcsKK/img.jpg&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FojJR6%2Fbtsk1BskIFM%2FmvMKwGpPZdT6iz9SPAcsKK%2Fimg.jpg&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;366&quot; height=&quot;223&quot; data-origin-width=&quot;366&quot; data-origin-height=&quot;223&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p style=&quot;text-align: center;&quot; data-ke-size=&quot;size16&quot;&gt;&lt;span style=&quot;color: #ee2323;&quot;&gt;&lt;b&gt;// 퇴근후딴짓 님의 강의를 참고하였습니다. //&lt;/b&gt;&lt;/span&gt;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: center;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;1. 독립성 검정 &lt;/b&gt;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;- 두 범주형 변수의 관련성&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;문제) A공장과 B공장 임직원 간의 근무기간 비율에 차이가 있는가?&lt;/b&gt;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;594&quot; data-origin-height=&quot;389&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/cwb1Hw/btskZsJ55sf/pVnVAkkIkoKOZzKS26m350/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/cwb1Hw/btskZsJ55sf/pVnVAkkIkoKOZzKS26m350/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/cwb1Hw/btskZsJ55sf/pVnVAkkIkoKOZzKS26m350/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2Fcwb1Hw%2FbtskZsJ55sf%2FpVnVAkkIkoKOZzKS26m350%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;594&quot; height=&quot;389&quot; data-origin-width=&quot;594&quot; data-origin-height=&quot;389&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;- 귀무가설(H0) : '근무지'와 '근무기간'은 독립이다.&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;- 대립가설(H1) : '근무지'와 '근무기간'은 독립이 아니다.&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;문제1) 위의 가설에 대한 독립성 검정(카이제곱)의 검정통계량 값은?&lt;/b&gt;&lt;/p&gt;
&lt;pre id=&quot;code_1687456360794&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;import pandas as pd
from scipy.stats import chi2_contingency&lt;/code&gt;&lt;/pre&gt;
&lt;pre id=&quot;code_1687456368794&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# 데이터
df = pd.DataFrame([[50, 60],[25, 40]])&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;107&quot; data-origin-height=&quot;105&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/cqYjBM/btsk0dZQ0QW/nWyKWf5aTDg93MOWKvCbyK/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/cqYjBM/btsk0dZQ0QW/nWyKWf5aTDg93MOWKvCbyK/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/cqYjBM/btsk0dZQ0QW/nWyKWf5aTDg93MOWKvCbyK/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FcqYjBM%2Fbtsk0dZQ0QW%2FnWyKWf5aTDg93MOWKvCbyK%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;107&quot; height=&quot;105&quot; data-origin-width=&quot;107&quot; data-origin-height=&quot;105&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 문제와 같이 데이터프레임을 만들어준다.&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1687456517904&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# 독립성 검정
stat, p, dof, expected_freq = chi2_contingency(df)

실행 결과 : 
Chi2ContingencyResult(statistic=0.5552884615384607, pvalue=0.4561648467028253,
dof=1, expected_freq=array([[47.14285714, 62.85714286], [27.85714286, 37.14285714]]))&lt;/code&gt;&lt;/pre&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;rarr; chi2_contingency 실행 결과 4개의 값이 나오는데 4개의 변수에 바로 담아줄 수 있다.&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1687456595268&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# 검정통계량
print(round(stat,2))

실행 결과 : 
0.56&lt;/code&gt;&lt;/pre&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;rarr; 검정통계량은 0.56이다.&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;문제2) 위의 통계량에 대한 p-값을 구하시오.&lt;/b&gt;&lt;/p&gt;
&lt;pre id=&quot;code_1687456626504&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# p-value
print(round(p,4))

실행 결과 : 
0.4562&lt;/code&gt;&lt;/pre&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;rarr; p-value값은 0.4562이다.&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;문제3) 유의수준 0.05하에서 가설검정의 결과(채택/기각)중 하나를 선택하시오.&lt;/b&gt;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&lt;span style=&quot;color: #000000;&quot;&gt;&amp;rarr; pvalue값이 0.4562로 유의수준 0.05보다 높아 귀무가설을 채택한다.&lt;/span&gt;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&lt;span style=&quot;color: #000000;&quot;&gt;&amp;rarr; 즉, 대립가설이 기각되므로 근무지와 근무기간이 독립이 아니라고 할 수 없다.&lt;/span&gt;&lt;/p&gt;
&lt;p style=&quot;color: #222222; text-align: left;&quot; data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;pre id=&quot;code_1687456864235&quot; class=&quot;python&quot; data-ke-language=&quot;python&quot; data-ke-type=&quot;codeblock&quot;&gt;&lt;code&gt;# 연속성 수정(Yates's correction for continuity): 기본값 True
chi2_contingency(df, correction=False) 

실행 결과 :
0.82
0.3664&lt;/code&gt;&lt;/pre&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;rarr; correction=True일때와 correction=False의 값이 다르다.&amp;nbsp;&lt;/p&gt;</description>
      <category>빅데이터분석기사/작업형3</category>
      <author>seo0seok</author>
      <guid isPermaLink="true">https://bigdata0seok.tistory.com/32</guid>
      <comments>https://bigdata0seok.tistory.com/32#entry32comment</comments>
      <pubDate>Fri, 23 Jun 2023 03:01:52 +0900</pubDate>
    </item>
  </channel>
</rss>