From c1804fa962c9b6d3539e1c92e68a096d313fb019 Mon Sep 17 00:00:00 2001 From: jwooo Date: Sat, 23 Mar 2024 09:38:45 +0900 Subject: [PATCH 1/5] =?UTF-8?q?chore:=20Jsoup=20=EB=9D=BC=EC=9D=B4?= =?UTF-8?q?=EB=B8=8C=EB=9F=AC=EB=A6=AC=20=EC=9D=98=EC=A1=B4=EC=84=B1=20?= =?UTF-8?q?=EC=B6=94=EA=B0=80=20(#17)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- build.gradle | 3 +++ 1 file changed, 3 insertions(+) diff --git a/build.gradle b/build.gradle index c7b5a64..438b377 100644 --- a/build.gradle +++ b/build.gradle @@ -56,6 +56,9 @@ dependencies { testImplementation 'org.springframework.restdocs:spring-restdocs-mockmvc' // ARM Native Library Compatibility runtimeOnly 'io.netty:netty-resolver-dns-native-macos:4.1.104.Final:osx-aarch_64' + // Jsoup Web Crawling Library + implementation 'org.jsoup:jsoup:1.16.2' + } tasks.named('bootBuildImage') { From b8ffa4a94d1cc9a74cbae1dadeb5018b47523fde Mon Sep 17 00:00:00 2001 From: jwooo Date: Sat, 23 Mar 2024 10:44:33 +0900 Subject: [PATCH 2/5] =?UTF-8?q?feat:=20=ED=81=AC=EB=A1=A4=EB=A7=81=20?= =?UTF-8?q?=EC=9D=B8=ED=84=B0=ED=8E=98=EC=9D=B4=EC=8A=A4=20=EC=B6=94?= =?UTF-8?q?=EA=B0=80=20(#17)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/main/java/com/jisungin/infra/crawler/Crawler.java | 7 +++++++ src/main/java/com/jisungin/infra/crawler/Fetcher.java | 10 ++++++++++ src/main/java/com/jisungin/infra/crawler/Parser.java | 10 ++++++++++ 3 files changed, 27 insertions(+) create mode 100644 src/main/java/com/jisungin/infra/crawler/Crawler.java create mode 100644 src/main/java/com/jisungin/infra/crawler/Fetcher.java create mode 100644 src/main/java/com/jisungin/infra/crawler/Parser.java diff --git a/src/main/java/com/jisungin/infra/crawler/Crawler.java b/src/main/java/com/jisungin/infra/crawler/Crawler.java new file mode 100644 index 0000000..653e128 --- /dev/null +++ b/src/main/java/com/jisungin/infra/crawler/Crawler.java @@ -0,0 +1,7 @@ +package com.jisungin.infra.crawler; + +public interface Crawler { + + CrawlingBook crawlBook(String isbn); + +} diff --git a/src/main/java/com/jisungin/infra/crawler/Fetcher.java b/src/main/java/com/jisungin/infra/crawler/Fetcher.java new file mode 100644 index 0000000..d327ed4 --- /dev/null +++ b/src/main/java/com/jisungin/infra/crawler/Fetcher.java @@ -0,0 +1,10 @@ +package com.jisungin.infra.crawler; + +import org.jsoup.nodes.Document; + +public interface Fetcher { + + Document fetchIsbn(String isbn); + Document fetchBook(String bookId); + +} diff --git a/src/main/java/com/jisungin/infra/crawler/Parser.java b/src/main/java/com/jisungin/infra/crawler/Parser.java new file mode 100644 index 0000000..5bacb95 --- /dev/null +++ b/src/main/java/com/jisungin/infra/crawler/Parser.java @@ -0,0 +1,10 @@ +package com.jisungin.infra.crawler; + +import org.jsoup.nodes.Document; + +public interface Parser { + + String parseIsbn(Document doc); + CrawlingBook parseBook(Document doc); + +} From 93cc433faa43716edcce624bd3679a9841f38be5 Mon Sep 17 00:00:00 2001 From: jwooo Date: Sat, 23 Mar 2024 10:44:58 +0900 Subject: [PATCH 3/5] =?UTF-8?q?feat:=20Yes24=20=EC=B1=85=20=EB=8B=A8?= =?UTF-8?q?=EA=B1=B4=20=ED=81=AC=EB=A1=A4=EB=A7=81=20=EA=B8=B0=EB=8A=A5=20?= =?UTF-8?q?=EC=B6=94=EA=B0=80=20(#17)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../jisungin/infra/crawler/CrawlingBook.java | 25 +++++++++++ .../jisungin/infra/crawler/Yes24Crawler.java | 20 +++++++++ .../infra/crawler/Yes24CrawlerConstant.java | 23 +++++++++++ .../jisungin/infra/crawler/Yes24Fetcher.java | 41 +++++++++++++++++++ .../jisungin/infra/crawler/Yes24Parser.java | 29 +++++++++++++ 5 files changed, 138 insertions(+) create mode 100644 src/main/java/com/jisungin/infra/crawler/CrawlingBook.java create mode 100644 src/main/java/com/jisungin/infra/crawler/Yes24Crawler.java create mode 100644 src/main/java/com/jisungin/infra/crawler/Yes24CrawlerConstant.java create mode 100644 src/main/java/com/jisungin/infra/crawler/Yes24Fetcher.java create mode 100644 src/main/java/com/jisungin/infra/crawler/Yes24Parser.java diff --git a/src/main/java/com/jisungin/infra/crawler/CrawlingBook.java b/src/main/java/com/jisungin/infra/crawler/CrawlingBook.java new file mode 100644 index 0000000..cb5d518 --- /dev/null +++ b/src/main/java/com/jisungin/infra/crawler/CrawlingBook.java @@ -0,0 +1,25 @@ +package com.jisungin.infra.crawler; + +import lombok.Builder; +import lombok.Getter; + +@Getter +public class CrawlingBook { + + private String imageUrl; + private String content; + + @Builder + private CrawlingBook(String imageUrl, String content) { + this.imageUrl = imageUrl; + this.content = content; + } + + public static CrawlingBook of(String imageUrl, String content) { + return CrawlingBook.builder() + .imageUrl(imageUrl) + .content(content) + .build(); + } + +} diff --git a/src/main/java/com/jisungin/infra/crawler/Yes24Crawler.java b/src/main/java/com/jisungin/infra/crawler/Yes24Crawler.java new file mode 100644 index 0000000..3b52e3b --- /dev/null +++ b/src/main/java/com/jisungin/infra/crawler/Yes24Crawler.java @@ -0,0 +1,20 @@ +package com.jisungin.infra.crawler; + +import lombok.RequiredArgsConstructor; +import org.springframework.stereotype.Component; + +@Component +@RequiredArgsConstructor +public class Yes24Crawler implements Crawler { + + private final Fetcher fetcher; + private final Parser parser; + + @Override + public CrawlingBook crawlBook(String isbn) { + String bookId = parser.parseIsbn(fetcher.fetchIsbn(isbn)); + + return parser.parseBook(fetcher.fetchBook(bookId)); + } + +} diff --git a/src/main/java/com/jisungin/infra/crawler/Yes24CrawlerConstant.java b/src/main/java/com/jisungin/infra/crawler/Yes24CrawlerConstant.java new file mode 100644 index 0000000..d608123 --- /dev/null +++ b/src/main/java/com/jisungin/infra/crawler/Yes24CrawlerConstant.java @@ -0,0 +1,23 @@ +package com.jisungin.infra.crawler; + +public class Yes24CrawlerConstant { + + public static final String BASE_URL = "https://www.yes24.com/Product"; + public static final String ISBN_URL = BASE_URL + "/Search?domain=BOOK&query="; + public static final String BOOK_URL = BASE_URL + "/Goods/"; + public static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36"; + public static final String ISBN_CSS = "ul#yesSchList > li"; + public static final String ISBN_ATTR = "data-goods-no"; + public static final String BOOK_IMAGE_CSS = "span.gd_img > em.imgBdr > img.gImg"; + public static final String BOOK_IMAGE_ATTR = "src"; + public static final String BOOK_CONTENT_CSS = "div.infoWrap_txt > div.infoWrap_txtInner"; + + public static String getIsbnUrl(String isbn) { + return ISBN_URL + isbn; + } + + public static String getBookUrl(String bookId) { + return BOOK_URL + bookId; + } + +} diff --git a/src/main/java/com/jisungin/infra/crawler/Yes24Fetcher.java b/src/main/java/com/jisungin/infra/crawler/Yes24Fetcher.java new file mode 100644 index 0000000..8288cb7 --- /dev/null +++ b/src/main/java/com/jisungin/infra/crawler/Yes24Fetcher.java @@ -0,0 +1,41 @@ +package com.jisungin.infra.crawler; + +import static com.jisungin.infra.crawler.Yes24CrawlerConstant.*; +import static com.jisungin.infra.crawler.Yes24CrawlerConstant.USER_AGENT; + +import com.jisungin.exception.BusinessException; +import com.jisungin.exception.ErrorCode; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.springframework.stereotype.Component; + +@Component +public class Yes24Fetcher implements Fetcher { + + @Override + public Document fetchIsbn(String isbn) { + try { + return Jsoup.connect(getIsbnUrl(isbn)) + .timeout(5000) + .userAgent(USER_AGENT) + .ignoreContentType(true) + .get(); + } catch (Exception e) { + throw new BusinessException(ErrorCode.BOOK_NOT_FOUND); + } + } + + @Override + public Document fetchBook(String bookId) { + try { + return Jsoup.connect(getBookUrl(bookId)) + .timeout(5000) + .userAgent(USER_AGENT) + .ignoreContentType(true) + .get(); + } catch (Exception e) { + throw new BusinessException(ErrorCode.BOOK_NOT_FOUND); + } + } + +} diff --git a/src/main/java/com/jisungin/infra/crawler/Yes24Parser.java b/src/main/java/com/jisungin/infra/crawler/Yes24Parser.java new file mode 100644 index 0000000..637ab94 --- /dev/null +++ b/src/main/java/com/jisungin/infra/crawler/Yes24Parser.java @@ -0,0 +1,29 @@ +package com.jisungin.infra.crawler; + +import static com.jisungin.infra.crawler.Yes24CrawlerConstant.BOOK_CONTENT_CSS; +import static com.jisungin.infra.crawler.Yes24CrawlerConstant.BOOK_IMAGE_ATTR; +import static com.jisungin.infra.crawler.Yes24CrawlerConstant.BOOK_IMAGE_CSS; +import static com.jisungin.infra.crawler.Yes24CrawlerConstant.ISBN_ATTR; +import static com.jisungin.infra.crawler.Yes24CrawlerConstant.ISBN_CSS; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.safety.Safelist; +import org.springframework.stereotype.Component; + +@Component +public class Yes24Parser implements Parser { + @Override + public String parseIsbn(Document doc) { + return doc.select(ISBN_CSS).attr(ISBN_ATTR); + } + + @Override + public CrawlingBook parseBook(Document doc) { + String image = doc.select(BOOK_IMAGE_CSS).attr(BOOK_IMAGE_ATTR); + String content = Jsoup.clean(doc.select(BOOK_CONTENT_CSS).text(), Safelist.none()); + + return CrawlingBook.of(image, content); + } + +} From 58ea8da92c2ecfdcf10dd0f476ad09106cb4dff6 Mon Sep 17 00:00:00 2001 From: jwooo Date: Sat, 23 Mar 2024 12:08:24 +0900 Subject: [PATCH 4/5] =?UTF-8?q?test:=20Yes24=20=EC=B1=85=20=EB=8B=A8?= =?UTF-8?q?=EA=B1=B4=20=ED=81=AC=EB=A1=A4=EB=A7=81=20=ED=85=8C=EC=8A=A4?= =?UTF-8?q?=ED=8A=B8=20=EC=B6=94=EA=B0=80=20(#17)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../com/jisungin/infra/Yes24CrawlerTest.java | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 src/test/java/com/jisungin/infra/Yes24CrawlerTest.java diff --git a/src/test/java/com/jisungin/infra/Yes24CrawlerTest.java b/src/test/java/com/jisungin/infra/Yes24CrawlerTest.java new file mode 100644 index 0000000..a3f0a6c --- /dev/null +++ b/src/test/java/com/jisungin/infra/Yes24CrawlerTest.java @@ -0,0 +1,72 @@ +package com.jisungin.infra; + +import static org.assertj.core.api.Assertions.*; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import com.jisungin.exception.BusinessException; +import com.jisungin.exception.ErrorCode; +import com.jisungin.infra.crawler.CrawlingBook; +import com.jisungin.infra.crawler.Yes24Crawler; +import com.jisungin.infra.crawler.Yes24Fetcher; +import com.jisungin.infra.crawler.Yes24Parser; +import org.assertj.core.api.Assertions; +import org.jsoup.nodes.Document; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.InjectMocks; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; + +@ExtendWith(MockitoExtension.class) +public class Yes24CrawlerTest { + + @InjectMocks + private Yes24Crawler crawler; + + @Mock + private Yes24Parser parser; + + @Mock + private Yes24Fetcher fetcher; + + @Test + @DisplayName("isbn을 통해 크롤링 된 책을 생성한다.") + public void crawlingBook() { + // given + String isbn = "0000000000"; + String bookId = "1111111111"; + + Document isbnDocument = mock(Document.class); + Document bookDocument = mock(Document.class); + + CrawlingBook crawlingBook = CrawlingBook.of("image url link", "crawling content"); + + when(fetcher.fetchIsbn(isbn)).thenReturn(isbnDocument); + when(fetcher.fetchBook(bookId)).thenReturn(bookDocument); + when(parser.parseIsbn(isbnDocument)).thenReturn(bookId); + when(parser.parseBook(bookDocument)).thenReturn(crawlingBook); + + // when + CrawlingBook expectedCrawlingBook = crawler.crawlBook(isbn); + + // then + assertThat(expectedCrawlingBook).isEqualTo(crawlingBook); + } + + @Test + @DisplayName("올바르지 않은 isbn을 입력하면 예외가 발생한다.") + public void crawlingBookWithInvalidIsbn() { + // given + String isbn = "XXXXXXXXXX"; + + when(fetcher.fetchIsbn(isbn)).thenThrow(new BusinessException(ErrorCode.BOOK_NOT_FOUND)); + + // when then + Assertions.assertThatThrownBy(() -> crawler.crawlBook(isbn)) + .isInstanceOf(BusinessException.class) + .hasMessage("책을 찾을 수 없습니다."); + } + +} From 6a6eefc674ef86f3a3f28a828f18a5a83a301814 Mon Sep 17 00:00:00 2001 From: jwooo Date: Sat, 23 Mar 2024 12:26:22 +0900 Subject: [PATCH 5/5] =?UTF-8?q?test:=20Yes24CrawlerTest=20static=20import?= =?UTF-8?q?=EB=AC=B8=20=EC=B6=94=EA=B0=80=20(#17)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/test/java/com/jisungin/infra/Yes24CrawlerTest.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/test/java/com/jisungin/infra/Yes24CrawlerTest.java b/src/test/java/com/jisungin/infra/Yes24CrawlerTest.java index a3f0a6c..9bbd27f 100644 --- a/src/test/java/com/jisungin/infra/Yes24CrawlerTest.java +++ b/src/test/java/com/jisungin/infra/Yes24CrawlerTest.java @@ -1,6 +1,7 @@ package com.jisungin.infra; -import static org.assertj.core.api.Assertions.*; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -10,7 +11,6 @@ import com.jisungin.infra.crawler.Yes24Crawler; import com.jisungin.infra.crawler.Yes24Fetcher; import com.jisungin.infra.crawler.Yes24Parser; -import org.assertj.core.api.Assertions; import org.jsoup.nodes.Document; import org.junit.jupiter.api.DisplayName; import org.junit.jupiter.api.Test; @@ -64,7 +64,7 @@ public void crawlingBookWithInvalidIsbn() { when(fetcher.fetchIsbn(isbn)).thenThrow(new BusinessException(ErrorCode.BOOK_NOT_FOUND)); // when then - Assertions.assertThatThrownBy(() -> crawler.crawlBook(isbn)) + assertThatThrownBy(() -> crawler.crawlBook(isbn)) .isInstanceOf(BusinessException.class) .hasMessage("책을 찾을 수 없습니다."); }