diff --git a/build.gradle b/build.gradle
index c7b5a64..438b377 100644
--- a/build.gradle
+++ b/build.gradle
@@ -56,6 +56,9 @@ dependencies {
     testImplementation 'org.springframework.restdocs:spring-restdocs-mockmvc'
     // ARM Native Library Compatibility
     runtimeOnly 'io.netty:netty-resolver-dns-native-macos:4.1.104.Final:osx-aarch_64'
+    // Jsoup Web Crawling Library
+    implementation 'org.jsoup:jsoup:1.16.2'
+
 }
 
 tasks.named('bootBuildImage') {
diff --git a/src/main/java/com/jisungin/infra/crawler/Crawler.java b/src/main/java/com/jisungin/infra/crawler/Crawler.java
new file mode 100644
--- /dev/null
+++ b/src/main/java/com/jisungin/infra/crawler/Crawler.java
@@ -0,0 +1,16 @@
+package com.jisungin.infra.crawler;
+
+/**
+ * Looks up book details by ISBN.
+ */
+public interface Crawler {
+
+    /**
+     * Crawls the book identified by the given ISBN.
+     *
+     * @param isbn ISBN of the book to look up
+     * @return the crawled image URL and content
+     */
+    CrawlingBook crawlBook(String isbn);
+
+}
diff --git a/src/main/java/com/jisungin/infra/crawler/CrawlingBook.java b/src/main/java/com/jisungin/infra/crawler/CrawlingBook.java
new file mode 100644
--- /dev/null
+++ b/src/main/java/com/jisungin/infra/crawler/CrawlingBook.java
@@ -0,0 +1,31 @@
+package com.jisungin.infra.crawler;
+
+import lombok.Builder;
+import lombok.Getter;
+
+/**
+ * Immutable value holding the image URL and content crawled for one book.
+ */
+@Getter
+public class CrawlingBook {
+
+    private final String imageUrl;
+    private final String content;
+
+    @Builder
+    private CrawlingBook(String imageUrl, String content) {
+        this.imageUrl = imageUrl;
+        this.content = content;
+    }
+
+    /**
+     * Creates a crawled book from its image URL and content.
+     */
+    public static CrawlingBook of(String imageUrl, String content) {
+        return CrawlingBook.builder()
+                .imageUrl(imageUrl)
+                .content(content)
+                .build();
+    }
+
+}
diff --git a/src/main/java/com/jisungin/infra/crawler/Fetcher.java b/src/main/java/com/jisungin/infra/crawler/Fetcher.java
new file mode 100644
--- /dev/null
+++ b/src/main/java/com/jisungin/infra/crawler/Fetcher.java
@@ -0,0 +1,20 @@
+package com.jisungin.infra.crawler;
+
+import org.jsoup.nodes.Document;
+
+/**
+ * Downloads the pages a crawler needs as jsoup documents.
+ */
+public interface Fetcher {
+
+    /**
+     * Fetches the document looked up by the given ISBN.
+     */
+    Document fetchIsbn(String isbn);
+
+    /**
+     * Fetches the document of the book with the given book id.
+     */
+    Document fetchBook(String bookId);
+
+}
diff --git a/src/main/java/com/jisungin/infra/crawler/Parser.java b/src/main/java/com/jisungin/infra/crawler/Parser.java
new file mode 100644
--- /dev/null
+++ b/src/main/java/com/jisungin/infra/crawler/Parser.java
@@ -0,0 +1,20 @@
+package com.jisungin.infra.crawler;
+
+import org.jsoup.nodes.Document;
+
+/**
+ * Extracts crawler data from fetched documents.
+ */
+public interface Parser {
+
+    /**
+     * Extracts the book id from a document fetched by ISBN.
+     */
+    String parseIsbn(Document doc);
+
+    /**
+     * Extracts the image URL and content from a book document.
+     */
+    CrawlingBook parseBook(Document doc);
+
+}
diff --git a/src/main/java/com/jisungin/infra/crawler/Yes24Crawler.java b/src/main/java/com/jisungin/infra/crawler/Yes24Crawler.java
new file mode 100644
--- /dev/null
+++ b/src/main/java/com/jisungin/infra/crawler/Yes24Crawler.java
@@ -0,0 +1,35 @@
+package com.jisungin.infra.crawler;
+
+import com.jisungin.exception.BusinessException;
+import com.jisungin.exception.ErrorCode;
+import lombok.RequiredArgsConstructor;
+import org.springframework.stereotype.Component;
+
+/**
+ * {@link Crawler} that resolves an ISBN to a book id and then parses that book's page.
+ */
+@Component
+@RequiredArgsConstructor
+public class Yes24Crawler implements Crawler {
+
+    private final Fetcher fetcher;
+    private final Parser parser;
+
+    /**
+     * {@inheritDoc}
+     *
+     * @throws BusinessException with {@code BOOK_NOT_FOUND} when no book id is found for the ISBN
+     */
+    @Override
+    public CrawlingBook crawlBook(String isbn) {
+        String bookId = parser.parseIsbn(fetcher.fetchIsbn(isbn));
+
+        // Without a book id there is no book page to request, so fail before building a bogus URL.
+        if (bookId == null || bookId.isEmpty()) {
+            throw new BusinessException(ErrorCode.BOOK_NOT_FOUND);
+        }
+
+        return parser.parseBook(fetcher.fetchBook(bookId));
+    }
+
+}
diff --git a/src/main/java/com/jisungin/infra/crawler/Yes24CrawlerConstant.java b/src/main/java/com/jisungin/infra/crawler/Yes24CrawlerConstant.java
new file mode 100644
--- /dev/null
+++ b/src/main/java/com/jisungin/infra/crawler/Yes24CrawlerConstant.java
@@ -0,0 +1,36 @@
+package com.jisungin.infra.crawler;
+
+/**
+ * URLs, user agent and CSS selectors used to crawl yes24.com.
+ */
+public final class Yes24CrawlerConstant {
+
+    public static final String BASE_URL = "https://www.yes24.com/Product";
+    public static final String ISBN_URL = BASE_URL + "/Search?domain=BOOK&query=";
+    public static final String BOOK_URL = BASE_URL + "/Goods/";
+    public static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36";
+    public static final String ISBN_CSS = "ul#yesSchList > li";
+    public static final String ISBN_ATTR = "data-goods-no";
+    public static final String BOOK_IMAGE_CSS = "span.gd_img > em.imgBdr > img.gImg";
+    public static final String BOOK_IMAGE_ATTR = "src";
+    public static final String BOOK_CONTENT_CSS = "div.infoWrap_txt > div.infoWrap_txtInner";
+
+    private Yes24CrawlerConstant() {
+        // Holds only static members; not meant to be instantiated.
+    }
+
+    /**
+     * Builds the search URL for the given ISBN.
+     */
+    public static String getIsbnUrl(String isbn) {
+        return ISBN_URL + isbn;
+    }
+
+    /**
+     * Builds the book page URL for the given book id.
+     */
+    public static String getBookUrl(String bookId) {
+        return BOOK_URL + bookId;
+    }
+
+}
diff --git a/src/main/java/com/jisungin/infra/crawler/Yes24Fetcher.java b/src/main/java/com/jisungin/infra/crawler/Yes24Fetcher.java
new file mode 100644
--- /dev/null
+++ b/src/main/java/com/jisungin/infra/crawler/Yes24Fetcher.java
@@ -0,0 +1,51 @@
+package com.jisungin.infra.crawler;
+
+import static com.jisungin.infra.crawler.Yes24CrawlerConstant.USER_AGENT;
+import static com.jisungin.infra.crawler.Yes24CrawlerConstant.getBookUrl;
+import static com.jisungin.infra.crawler.Yes24CrawlerConstant.getIsbnUrl;
+
+import com.jisungin.exception.BusinessException;
+import com.jisungin.exception.ErrorCode;
+import lombok.extern.slf4j.Slf4j;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.springframework.stereotype.Component;
+
+/**
+ * {@link Fetcher} that downloads yes24.com pages with jsoup.
+ */
+@Slf4j
+@Component
+public class Yes24Fetcher implements Fetcher {
+
+    private static final int TIMEOUT_MILLIS = 5000;
+
+    @Override
+    public Document fetchIsbn(String isbn) {
+        return fetch(getIsbnUrl(isbn));
+    }
+
+    @Override
+    public Document fetchBook(String bookId) {
+        return fetch(getBookUrl(bookId));
+    }
+
+    /**
+     * Downloads the given URL, mapping every failure to {@code BOOK_NOT_FOUND}.
+     */
+    private Document fetch(String url) {
+        try {
+            return Jsoup.connect(url)
+                    .timeout(TIMEOUT_MILLIS)
+                    .userAgent(USER_AGENT)
+                    .ignoreContentType(true)
+                    .get();
+        } catch (Exception e) {
+            // Kept broad so callers still receive BOOK_NOT_FOUND for any failure;
+            // the cause is logged because the BusinessException is created without it.
+            log.warn("Failed to fetch {}", url, e);
+            throw new BusinessException(ErrorCode.BOOK_NOT_FOUND);
+        }
+    }
+
+}
diff --git a/src/main/java/com/jisungin/infra/crawler/Yes24Parser.java b/src/main/java/com/jisungin/infra/crawler/Yes24Parser.java
new file mode 100644
--- /dev/null
+++ b/src/main/java/com/jisungin/infra/crawler/Yes24Parser.java
@@ -0,0 +1,34 @@
+package com.jisungin.infra.crawler;
+
+import static com.jisungin.infra.crawler.Yes24CrawlerConstant.BOOK_CONTENT_CSS;
+import static com.jisungin.infra.crawler.Yes24CrawlerConstant.BOOK_IMAGE_ATTR;
+import static com.jisungin.infra.crawler.Yes24CrawlerConstant.BOOK_IMAGE_CSS;
+import static com.jisungin.infra.crawler.Yes24CrawlerConstant.ISBN_ATTR;
+import static com.jisungin.infra.crawler.Yes24CrawlerConstant.ISBN_CSS;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.safety.Safelist;
+import org.springframework.stereotype.Component;
+
+/**
+ * {@link Parser} that reads the book id, image URL and content using the yes24 selectors.
+ */
+@Component
+public class Yes24Parser implements Parser {
+
+    @Override
+    public String parseIsbn(Document doc) {
+        return doc.select(ISBN_CSS).attr(ISBN_ATTR);
+    }
+
+    @Override
+    public CrawlingBook parseBook(Document doc) {
+        String image = doc.select(BOOK_IMAGE_CSS).attr(BOOK_IMAGE_ATTR);
+        // Safelist.none() allows no tags, so only text is kept in the content.
+        String content = Jsoup.clean(doc.select(BOOK_CONTENT_CSS).text(), Safelist.none());
+
+        return CrawlingBook.of(image, content);
+    }
+
+}
diff --git a/src/test/java/com/jisungin/infra/Yes24CrawlerTest.java b/src/test/java/com/jisungin/infra/Yes24CrawlerTest.java
new file mode 100644
--- /dev/null
+++ b/src/test/java/com/jisungin/infra/Yes24CrawlerTest.java
@@ -0,0 +1,88 @@
+package com.jisungin.infra;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
+import com.jisungin.exception.BusinessException;
+import com.jisungin.exception.ErrorCode;
+import com.jisungin.infra.crawler.CrawlingBook;
+import com.jisungin.infra.crawler.Yes24Crawler;
+import com.jisungin.infra.crawler.Yes24Fetcher;
+import com.jisungin.infra.crawler.Yes24Parser;
+import org.jsoup.nodes.Document;
+import org.junit.jupiter.api.DisplayName;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.mockito.InjectMocks;
+import org.mockito.Mock;
+import org.mockito.junit.jupiter.MockitoExtension;
+
+@ExtendWith(MockitoExtension.class)
+public class Yes24CrawlerTest {
+
+    @InjectMocks
+    private Yes24Crawler crawler;
+
+    @Mock
+    private Yes24Parser parser;
+
+    @Mock
+    private Yes24Fetcher fetcher;
+
+    @Test
+    @DisplayName("isbn을 통해 크롤링 된 책을 생성한다.")
+    public void crawlingBook() {
+        // given
+        String isbn = "0000000000";
+        String bookId = "1111111111";
+
+        Document isbnDocument = mock(Document.class);
+        Document bookDocument = mock(Document.class);
+
+        CrawlingBook crawlingBook = CrawlingBook.of("image url link", "crawling content");
+
+        when(fetcher.fetchIsbn(isbn)).thenReturn(isbnDocument);
+        when(fetcher.fetchBook(bookId)).thenReturn(bookDocument);
+        when(parser.parseIsbn(isbnDocument)).thenReturn(bookId);
+        when(parser.parseBook(bookDocument)).thenReturn(crawlingBook);
+
+        // when
+        CrawlingBook actualCrawlingBook = crawler.crawlBook(isbn);
+
+        // then
+        assertThat(actualCrawlingBook).isEqualTo(crawlingBook);
+    }
+
+    @Test
+    @DisplayName("올바르지 않은 isbn을 입력하면 예외가 발생한다.")
+    public void crawlingBookWithInvalidIsbn() {
+        // given
+        String isbn = "XXXXXXXXXX";
+
+        when(fetcher.fetchIsbn(isbn)).thenThrow(new BusinessException(ErrorCode.BOOK_NOT_FOUND));
+
+        // when then
+        assertThatThrownBy(() -> crawler.crawlBook(isbn))
+                .isInstanceOf(BusinessException.class)
+                .hasMessage("책을 찾을 수 없습니다.");
+    }
+
+    @Test
+    @DisplayName("검색 결과가 없는 isbn을 입력하면 예외가 발생한다.")
+    public void crawlingBookWithoutSearchResult() {
+        // given
+        String isbn = "0000000000";
+        Document isbnDocument = mock(Document.class);
+
+        when(fetcher.fetchIsbn(isbn)).thenReturn(isbnDocument);
+        when(parser.parseIsbn(isbnDocument)).thenReturn("");
+
+        // when then
+        assertThatThrownBy(() -> crawler.crawlBook(isbn))
+                .isInstanceOf(BusinessException.class)
+                .hasMessage("책을 찾을 수 없습니다.");
+    }
+
+}