diff --git a/.gitignore b/.gitignore index 6bd7fc8..b7a6e16 100644 --- a/.gitignore +++ b/.gitignore @@ -40,4 +40,5 @@ out/ application-dev.yml application-prod.yml application-jwt.yml -application-oauth.yml \ No newline at end of file +application-oauth.yml +application-crawler.yml \ No newline at end of file diff --git a/src/main/java/com/jisungin/exception/ErrorCode.java b/src/main/java/com/jisungin/exception/ErrorCode.java index f8d5bc2..e6a3e33 100644 --- a/src/main/java/com/jisungin/exception/ErrorCode.java +++ b/src/main/java/com/jisungin/exception/ErrorCode.java @@ -16,7 +16,8 @@ public enum ErrorCode { TALK_ROOM_NOT_FOUND(400, "토크방을 찾을 수 없습니다."), UNAUTHORIZED_REQUEST(400, "권한이 없는 사용자입니다."), COMMENT_NOT_FOUND(404, "의견을 찾을 수 없습니다."), - REVIEW_NOT_FOUND(404, "리뷰를 찾을 수 없습니다."); + REVIEW_NOT_FOUND(404, "리뷰를 찾을 수 없습니다."), + REQUEST_TIME_OUT(408, "요청 시간이 만료 되었습니다."); private final int code; diff --git a/src/main/java/com/jisungin/infra/crawler/Crawler.java b/src/main/java/com/jisungin/infra/crawler/Crawler.java index 653e128..5ffc118 100644 --- a/src/main/java/com/jisungin/infra/crawler/Crawler.java +++ b/src/main/java/com/jisungin/infra/crawler/Crawler.java @@ -1,7 +1,10 @@ package com.jisungin.infra.crawler; +import java.util.Map; + public interface Crawler { CrawlingBook crawlBook(String isbn); + Map crawlBestSellerBook(); } diff --git a/src/main/java/com/jisungin/infra/crawler/CrawlingBook.java b/src/main/java/com/jisungin/infra/crawler/CrawlingBook.java index 9de8f88..54fa4e9 100644 --- a/src/main/java/com/jisungin/infra/crawler/CrawlingBook.java +++ b/src/main/java/com/jisungin/infra/crawler/CrawlingBook.java @@ -1,24 +1,47 @@ package com.jisungin.infra.crawler; +import java.time.LocalDateTime; import lombok.Builder; import lombok.Getter; +import lombok.ToString; @Getter +@ToString public class CrawlingBook { - private String imageUrl; + private String title; private String content; + private String isbn; + private String publisher; + private String imageUrl; + private String thumbnail; + private String[] authors; + private LocalDateTime dateTime; @Builder - private CrawlingBook(String imageUrl, String content) { - this.imageUrl = imageUrl; + private CrawlingBook(String title, String content, String isbn, String publisher, String imageUrl, String thumbnail, + String authors, LocalDateTime dateTime) { + this.title = title; this.content = content; + this.isbn = isbn; + this.publisher = publisher; + this.imageUrl = imageUrl; + this.thumbnail = thumbnail; + this.authors = parseAuthorsToArr(authors); + this.dateTime = dateTime; } - public static CrawlingBook of(String imageUrl, String content) { + public static CrawlingBook of(String title, String content, String isbn, String publisher, String imageUrl, + String thumbnail, String authors, LocalDateTime dateTime) { return CrawlingBook.builder() - .imageUrl(imageUrl) + .title(title) .content(content) + .isbn(isbn) + .publisher(publisher) + .imageUrl(imageUrl) + .thumbnail(thumbnail) + .authors(authors) + .dateTime(dateTime) .build(); } @@ -26,4 +49,8 @@ public boolean isBlankContent() { return this.content.isBlank(); } + private String[] parseAuthorsToArr(String authors) { + return authors.split(" 저| 공저| 글| 편저| 원저")[0].split(","); + } + } diff --git a/src/main/java/com/jisungin/infra/crawler/Fetcher.java b/src/main/java/com/jisungin/infra/crawler/Fetcher.java index d327ed4..e95ccdd 100644 --- a/src/main/java/com/jisungin/infra/crawler/Fetcher.java +++ b/src/main/java/com/jisungin/infra/crawler/Fetcher.java @@ -6,5 +6,6 @@ public interface Fetcher { Document fetchIsbn(String isbn); Document fetchBook(String bookId); + Document fetchBestSellerBookId(); } diff --git a/src/main/java/com/jisungin/infra/crawler/Parser.java b/src/main/java/com/jisungin/infra/crawler/Parser.java index 5bacb95..5d61182 100644 --- a/src/main/java/com/jisungin/infra/crawler/Parser.java +++ b/src/main/java/com/jisungin/infra/crawler/Parser.java @@ -1,10 +1,12 @@ package com.jisungin.infra.crawler; +import java.util.Map; import org.jsoup.nodes.Document; public interface Parser { String parseIsbn(Document doc); CrawlingBook parseBook(Document doc); + Map parseBestSellerBookId(Document doc); } diff --git a/src/main/java/com/jisungin/infra/crawler/Yes24Crawler.java b/src/main/java/com/jisungin/infra/crawler/Yes24Crawler.java index 3b52e3b..97e7127 100644 --- a/src/main/java/com/jisungin/infra/crawler/Yes24Crawler.java +++ b/src/main/java/com/jisungin/infra/crawler/Yes24Crawler.java @@ -1,5 +1,9 @@ package com.jisungin.infra.crawler; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.CompletableFuture; import lombok.RequiredArgsConstructor; import org.springframework.stereotype.Component; @@ -17,4 +21,19 @@ public CrawlingBook crawlBook(String isbn) { return parser.parseBook(fetcher.fetchBook(bookId)); } + @Override + public Map crawlBestSellerBook() { + Map bestSellerBookIds = parser.parseBestSellerBookId(fetcher.fetchBestSellerBookId()); + Map bestSellerBooks = new HashMap<>(); + + List> futures = bestSellerBookIds.entrySet().stream() + .map(entry -> CompletableFuture.supplyAsync(() -> parser.parseBook(fetcher.fetchBook(entry.getValue()))) + .thenAccept(crawlingBook -> bestSellerBooks.put(entry.getKey(), crawlingBook))) + .toList(); + + CompletableFuture.allOf(futures.toArray(CompletableFuture[]::new)).join(); + + return bestSellerBooks; + } + } diff --git a/src/main/java/com/jisungin/infra/crawler/Yes24CrawlerConstant.java b/src/main/java/com/jisungin/infra/crawler/Yes24CrawlerConstant.java deleted file mode 100644 index d608123..0000000 --- a/src/main/java/com/jisungin/infra/crawler/Yes24CrawlerConstant.java +++ /dev/null @@ -1,23 +0,0 @@ -package com.jisungin.infra.crawler; - -public class Yes24CrawlerConstant { - - public static final String BASE_URL = "https://www.yes24.com/Product"; - public static final String ISBN_URL = BASE_URL + "/Search?domain=BOOK&query="; - public static final String BOOK_URL = BASE_URL + "/Goods/"; - public static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36"; - public static final String ISBN_CSS = "ul#yesSchList > li"; - public static final String ISBN_ATTR = "data-goods-no"; - public static final String BOOK_IMAGE_CSS = "span.gd_img > em.imgBdr > img.gImg"; - public static final String BOOK_IMAGE_ATTR = "src"; - public static final String BOOK_CONTENT_CSS = "div.infoWrap_txt > div.infoWrap_txtInner"; - - public static String getIsbnUrl(String isbn) { - return ISBN_URL + isbn; - } - - public static String getBookUrl(String bookId) { - return BOOK_URL + bookId; - } - -} diff --git a/src/main/java/com/jisungin/infra/crawler/Yes24Fetcher.java b/src/main/java/com/jisungin/infra/crawler/Yes24Fetcher.java index 8288cb7..ac6efc9 100644 --- a/src/main/java/com/jisungin/infra/crawler/Yes24Fetcher.java +++ b/src/main/java/com/jisungin/infra/crawler/Yes24Fetcher.java @@ -1,25 +1,34 @@ package com.jisungin.infra.crawler; -import static com.jisungin.infra.crawler.Yes24CrawlerConstant.*; -import static com.jisungin.infra.crawler.Yes24CrawlerConstant.USER_AGENT; - import com.jisungin.exception.BusinessException; import com.jisungin.exception.ErrorCode; +import java.net.SocketTimeoutException; +import lombok.Setter; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; +import org.springframework.boot.context.properties.ConfigurationProperties; import org.springframework.stereotype.Component; @Component +@Setter +@ConfigurationProperties(prefix = "crawler.yes24.fetcher") public class Yes24Fetcher implements Fetcher { + private String isbnUrl; + private String bookUrl; + private String bestBookUrl; + private String userAgent; + @Override public Document fetchIsbn(String isbn) { try { return Jsoup.connect(getIsbnUrl(isbn)) .timeout(5000) - .userAgent(USER_AGENT) + .userAgent(userAgent) .ignoreContentType(true) .get(); + } catch (SocketTimeoutException e) { + throw new BusinessException(ErrorCode.REQUEST_TIME_OUT); } catch (Exception e) { throw new BusinessException(ErrorCode.BOOK_NOT_FOUND); } @@ -30,12 +39,37 @@ public Document fetchBook(String bookId) { try { return Jsoup.connect(getBookUrl(bookId)) .timeout(5000) - .userAgent(USER_AGENT) + .userAgent(userAgent) + .ignoreContentType(true) + .get(); + } catch (SocketTimeoutException e) { + throw new BusinessException(ErrorCode.REQUEST_TIME_OUT); + } catch (Exception e) { + throw new BusinessException(ErrorCode.BOOK_NOT_FOUND); + } + } + + @Override + public Document fetchBestSellerBookId() { + try { + return Jsoup.connect(bestBookUrl) + .timeout(5000) + .userAgent(userAgent) .ignoreContentType(true) .get(); + } catch (SocketTimeoutException e) { + throw new BusinessException(ErrorCode.REQUEST_TIME_OUT); } catch (Exception e) { throw new BusinessException(ErrorCode.BOOK_NOT_FOUND); } } + private String getIsbnUrl(String isbn) { + return isbnUrl + isbn; + } + + private String getBookUrl(String bookId) { + return bookUrl + bookId; + } + } diff --git a/src/main/java/com/jisungin/infra/crawler/Yes24Parser.java b/src/main/java/com/jisungin/infra/crawler/Yes24Parser.java index 637ab94..8b768a2 100644 --- a/src/main/java/com/jisungin/infra/crawler/Yes24Parser.java +++ b/src/main/java/com/jisungin/infra/crawler/Yes24Parser.java @@ -1,29 +1,78 @@ package com.jisungin.infra.crawler; -import static com.jisungin.infra.crawler.Yes24CrawlerConstant.BOOK_CONTENT_CSS; -import static com.jisungin.infra.crawler.Yes24CrawlerConstant.BOOK_IMAGE_ATTR; -import static com.jisungin.infra.crawler.Yes24CrawlerConstant.BOOK_IMAGE_CSS; -import static com.jisungin.infra.crawler.Yes24CrawlerConstant.ISBN_ATTR; -import static com.jisungin.infra.crawler.Yes24CrawlerConstant.ISBN_CSS; - +import com.jayway.jsonpath.JsonPath; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import lombok.Setter; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; import org.jsoup.safety.Safelist; +import org.jsoup.select.Elements; +import org.springframework.boot.context.properties.ConfigurationProperties; import org.springframework.stereotype.Component; @Component +@Setter +@ConfigurationProperties(prefix = "crawler.yes24.parser") public class Yes24Parser implements Parser { + + private String isbnCss; + private String isbnAttr; + private String bookContentCss; + private String bookJsonCss; + private String bestRankingCss; + private String bestIdCss; + private String bestIdAttrs; + @Override public String parseIsbn(Document doc) { - return doc.select(ISBN_CSS).attr(ISBN_ATTR); + return doc.select(isbnCss).attr(isbnAttr); } @Override public CrawlingBook parseBook(Document doc) { - String image = doc.select(BOOK_IMAGE_CSS).attr(BOOK_IMAGE_ATTR); - String content = Jsoup.clean(doc.select(BOOK_CONTENT_CSS).text(), Safelist.none()); + String json = doc.select(bookJsonCss).html(); + + String title = parseJsonToString(json, "$.name"); + String isbn = parseJsonToString(json, "$.workExample[0].isbn"); + String imageUrl = parseJsonToString(json, "$.image"); + String publisher = parseJsonToString(json, "$.publisher.name"); + String authors = parseJsonToString(json, "$.author.name"); + String thumbnail = imageUrl.replace("XL", "M"); + String content = Jsoup.clean(doc.select(bookContentCss).text(), Safelist.none()); + LocalDateTime dateTime = parseDate(parseJsonToString(json, "$.workExample[0].datePublished")); + + return CrawlingBook.of(title, content, isbn, publisher, imageUrl, thumbnail, authors, dateTime); + } + + @Override + public Map parseBestSellerBookId(Document doc) { + Elements rankings = doc.select(bestRankingCss); + List bookIds = doc.select(bestIdCss) + .eachAttr(bestIdAttrs); + + return IntStream.range(0, rankings.size()) + .boxed() + .collect(Collectors.toMap( + i -> parseRanking(rankings.get(i)), + bookIds::get)); + } + + private Long parseRanking(Element rankingElement) { + return Long.parseLong(rankingElement.text()); + } + + private String parseJsonToString(String json, String path) { + return JsonPath.read(json, path); + } - return CrawlingBook.of(image, content); + private LocalDateTime parseDate(String dateString) { + return LocalDate.parse(dateString).atStartOfDay(); } } diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml index 735ecfd..34de983 100644 --- a/src/main/resources/application.yml +++ b/src/main/resources/application.yml @@ -10,4 +10,5 @@ spring: prod-env: - prod include: - oauth \ No newline at end of file + - oauth + - crawler \ No newline at end of file