`

抓取网站畅销商品的实例:每个页面 20 个商品,共抓取 100 个(或其他指定数量)

阅读更多

1、解析器

package com.yihaodian.pis.crawler;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.tags.Bullet;
import org.htmlparser.tags.BulletList;
import org.htmlparser.tags.Div;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.ScriptTag;
import org.htmlparser.tags.Span;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

import com.yihaodian.pis.dto.BestSellerDto;

/**
 * {@link PageParser} implementation for the best-seller listing pages of
 * suning.cn.
 *
 * <p>Only the best-seller related operations ({@link #extractBestSeller},
 * {@link #extractNextPageUrlPattern} and {@link #getNextPageUrl}) are
 * implemented; the single-product extraction methods are stubs.
 */
public class SuningPageParser extends PageParser{
	private static final Logger logger = Logger.getLogger(SuningPageParser.class);

	/** Prefix prepended to the product links found in the listing. */
	private static final String SITE_ROOT = "http://www.suning.cn";

	/** Prefix prepended to the next-page pattern when building a page URL. */
	private static final String SERVLET_ROOT = "http://www.suning.cn/webapp/wcs/stores/servlet/";

	/** Query-string tail appended to the extracted pattern; the page index is appended after it. */
	private static final String PAGE_QUERY_SUFFIX = "&ip_sortBy=salevolumn0&sortType=4&currentPage=";

	/** Marker text that identifies the paging script among all script tags. */
	private static final String PAGING_SCRIPT_MARKER = "&currentPage=";

	/** Captures the string literal assigned to the JavaScript variable {@code jumpUrl}. */
	private static final Pattern JUMP_URL_PATTERN =
			Pattern.compile("(?<=var[ \\s]{0,100}(jumpUrl)[\\s]{0,100}[=][\\s]{0,100}[\"])(.*?)(?=\"\\s{0,100}[+])");

	/** Captures the string literal assigned to the JavaScript variable {@code dfy}. */
	private static final Pattern DFY_PATTERN =
			Pattern.compile("(?<=var[ \\s]{0,100}dfy\\s{0,100}=\\s{0,100}[\"])(.*?)(?=[\"][\\s]{0,100})");

	/**
	 * Creates a parser; both arguments are passed straight to the superclass.
	 *
	 * @param html    page HTML
	 * @param charset character set name used when creating HTML parsers
	 */
	public SuningPageParser(String html, String charset) {
		super(html, charset);
	}

	/**
	 * Extracts the best-seller entries (name, URL, price) from a listing page.
	 *
	 * @param bestSellerHtml HTML of one best-seller listing page
	 * @return the entries found, possibly empty; {@code null} when the page
	 *         contains no product-list container or no {@code <ul>} inside it
	 *         (callers already test for {@code null})
	 * @throws ParserException if the HTML cannot be parsed
	 */
	@Override
	public List<BestSellerDto> extractBestSeller(String bestSellerHtml)
			throws ParserException {
		Parser parser = Parser.createParser(bestSellerHtml, charset);
		NodeFilter filter = new HasAttributeFilter("class", "product_list02 profix02 clearfix");
		NodeList containers = parser.extractAllNodesThatMatch(filter);

		if (containers == null || containers.size() == 0) {
			return null;
		}

		BulletList productList = findProductList(containers.elementAt(0));
		if (productList == null) {
			logger.warn("product list <ul> not found inside the best-seller container");
			return null;
		}

		List<BestSellerDto> sellers = new ArrayList<BestSellerDto>();
		NodeList items = productList.getChildren();
		if (items == null) {
			return sellers;
		}

		for (int i = 0; i < items.size(); i++) {
			Node item = items.elementAt(i);
			// Non-<li> children carry no product data.
			if (!(item instanceof Bullet)) {
				continue;
			}
			BestSellerDto seller = toBestSeller((Bullet) item);
			if (seller != null) {
				logger.info("畅销单品:" + seller);
				sellers.add(seller);
			}
		}

		return sellers;
	}

	/**
	 * Locates the product {@code <ul>} inside the listing container. The
	 * child at index 1 is preferred (the position the page layout has been
	 * observed to use); otherwise the first {@code <ul>} child is taken.
	 *
	 * @return the list tag, or {@code null} if the container has none
	 */
	private BulletList findProductList(Node container) {
		NodeList kids = container.getChildren();
		if (kids == null) {
			return null;
		}
		if (kids.size() > 1 && kids.elementAt(1) instanceof BulletList) {
			return (BulletList) kids.elementAt(1);
		}
		for (int i = 0; i < kids.size(); i++) {
			if (kids.elementAt(i) instanceof BulletList) {
				return (BulletList) kids.elementAt(i);
			}
		}
		return null;
	}

	/**
	 * Converts one {@code <li>} of the product list into a DTO.
	 *
	 * @return the populated DTO, or {@code null} when the item lacks the
	 *         expected name link or price span, so that one malformed item
	 *         does not abort the whole page
	 */
	private BestSellerDto toBestSeller(Bullet li) throws ParserException {
		Object intro = findTagByClassName(li, "pro_intro");
		if (!(intro instanceof Span)) {
			logger.warn("skipping best-seller item without a 'pro_intro' span");
			return null;
		}
		Span nameSpan = (Span) intro;

		// The product name and relative URL live in the link inside the intro span.
		Object anchor = findTagByName(nameSpan, "LinkTag");
		if (!(anchor instanceof LinkTag)) {
			logger.warn("skipping best-seller item without a product link");
			return null;
		}
		LinkTag link = (LinkTag) anchor;

		Object price = findTagByClassName(li, "pro_price");
		if (!(price instanceof Span)) {
			logger.warn("skipping best-seller item without a 'pro_price' span");
			return null;
		}
		Span priceSpan = (Span) price;

		BestSellerDto seller = new BestSellerDto();
		seller.setName(link.getLinkText());
		seller.setUrl(SITE_ROOT + link.getLink());
		// Strip the <em> wrapper and the currency sign, leaving the bare amount text.
		String priceText = priceSpan.getChildrenHTML()
				.replace("<em>", "").replace("</em>", "").replace("¥", "");
		seller.setPrice(priceText);
		return seller;
	}

	/**
	 * Builds the URL pattern used to request further listing pages. The
	 * pattern is the concatenation of the {@code jumpUrl} and {@code dfy}
	 * string literals found in the paging script, followed by a fixed
	 * query-string tail that ends in {@code currentPage=}.
	 *
	 * <p>If no script yields both literals, only the fixed tail is returned
	 * (never {@code null}) and a warning is logged.
	 *
	 * @param bestSellerHtml HTML of a best-seller listing page
	 * @return the next-page URL pattern, to be completed by {@link #getNextPageUrl}
	 * @throws ParserException if the HTML cannot be parsed
	 */
	@Override
	public String extractNextPageUrlPattern(String bestSellerHtml)
			throws ParserException {
		String nextPageUrl = "";
		// NOTE(review): uses a hard-coded "utf-8" instead of the charset field used by
		// extractBestSeller; kept as-is because the intent is not visible here.
		Parser parser = Parser.createParser(bestSellerHtml, "utf-8");
		NodeFilter filter = new HasAttributeFilter("type", "text/javascript");
		NodeList scripts = parser.extractAllNodesThatMatch(filter);

		int scriptCount = (scripts == null) ? 0 : scripts.size();
		logger.debug(scriptCount == 0 ? "没有值" : "有值");

		for (int i = 0; i < scriptCount; i++) {
			Node node = scripts.elementAt(i);
			// The attribute filter may match tags other than <script>.
			if (!(node instanceof ScriptTag)) {
				continue;
			}
			ScriptTag script = (ScriptTag) node;
			// Presumably 0 means the script's first child node contains the marker;
			// confirm against the HTMLParser CompositeTag#findPositionOf documentation.
			if (script.findPositionOf(PAGING_SCRIPT_MARKER) != 0) {
				continue;
			}

			String scriptBody = script.getChildrenHTML();
			String jumpUrl = firstMatch(JUMP_URL_PATTERN, scriptBody);
			String dfy = firstMatch(DFY_PATTERN, scriptBody);
			if (jumpUrl == null || dfy == null) {
				// Concatenating here would embed the text "null" in the URL.
				logger.warn("paging script found but jumpUrl/dfy could not be extracted");
				continue;
			}
			nextPageUrl = jumpUrl + dfy;
		}

		if (nextPageUrl.length() == 0) {
			logger.warn("no paging script matched; next-page pattern has no base part");
		}

		nextPageUrl += PAGE_QUERY_SUFFIX;
		logger.info("畅销榜下一页URL模式:" + nextPageUrl);

		return nextPageUrl;
	}

	/**
	 * Returns the trimmed text of the first match of {@code pattern} in
	 * {@code text}, or {@code null} when there is no match.
	 */
	private static String firstMatch(Pattern pattern, String text) {
		if (text == null) {
			return null;
		}
		Matcher matcher = pattern.matcher(text);
		return matcher.find() ? matcher.group(0).trim() : null;
	}

	/**
	 * Completes a next-page pattern into a full URL.
	 *
	 * @param nextPageUrlPattern pattern produced by {@link #extractNextPageUrlPattern}
	 * @param pageNum            1-based page number; the site's {@code currentPage}
	 *                           parameter receives {@code pageNum - 1}
	 * @return the absolute URL of the requested page
	 */
	@Override
	public String getNextPageUrl(String nextPageUrlPattern, int pageNum) {
		StringBuilder url = new StringBuilder(SERVLET_ROOT);
		url.append(nextPageUrlPattern).append(pageNum - 1);
		return url.toString();
	}

	/** Not implemented for this site; always returns {@code null}. */
	@Override
	public String extractName(Map<String, String> params)
			throws ParserException {
		return null;
	}

	/** Not implemented for this site; always returns {@code null}. */
	@Override
	public String extractPrice(Map<String, String> params)
			throws ParserException {
		return null;
	}

	/** Not implemented for this site; always returns {@code null}. */
	@Override
	public String extractBrand(Map<String, String> params)
			throws ParserException {
		return null;
	}

	/** Not implemented for this site; always returns {@code null}. */
	@Override
	public String extractImageUrl(Map<String, String> params)
			throws ParserException {
		return null;
	}

	/** Not implemented for this site; always returns {@code false}. */
	@Override
	public boolean hasProduct(Map<String, String> params)
			throws ParserException {
		return false;
	}

}

 2、主程序DAO

 

 /**
  * Crawls the best-seller listing of a site category and returns at most
  * {@code amount} entries, following the paging links as needed.
  *
  * <p>Crawl and parse failures are logged, not propagated; whatever was
  * collected before the failure is still returned.
  *
  * @param id     identifier passed to {@code getSiteCategoryById}
  * @param amount maximum number of entries wanted; must be positive
  * @return up to {@code amount} entries (possibly empty), or {@code null}
  *         when {@code amount} is not positive, the category cannot be
  *         found, or the category has no best-seller URL
  */
 public List<BestSellerDto> fetchBestSeller(Integer id, int amount) {
        List<BestSellerDto> bestSellers = new ArrayList<BestSellerDto>();

        if (amount <= 0) {
            logger.warn("畅销品数目不能为负数!");
            return null;
        }
        SiteCategoryDto siteCategory = getSiteCategoryById(id);
        if (siteCategory == null) {
            logger.warn("未找到分类,id=" + id);
            return null;
        }

        // Merge the page configuration and the crawl configuration of the site.
        Map<String, String> params = pageParamItemDao.getPageConfigBySite(
                siteCategory.getSiteId());
        params.putAll(crawlerParamItemDao.getCrawlConfigBySite(
                siteCategory.getSiteId()));

        String charset = params.get(PageParamNames.CONTENT_ENCODING);
        int pageSize = Integer.parseInt(params.get(PageParamNames.BS_PAGE_SIZE));
        // Ceiling division: an exact multiple of the page size must not request
        // an extra page. amount >= 1 here, so (amount - 1) cannot underflow.
        // A non-positive configured page size falls back to a single page.
        int pages = pageSize > 0 ? (amount - 1) / pageSize + 1 : 1;

        logger.info("畅销榜页数:" + pages);

        Crawler crawler = new Crawler(charset);

        try {
            String categoryUrl = siteCategory.getCategoryUrl();
            if (categoryUrl == null || categoryUrl.equals("")) {
                logger.info("此分类不支持畅销榜!");
                return null;
            }

            String bestSellerHtml = crawler.crawl(categoryUrl);
            PageParser pageParser = PageParserFactory.createPageParser(null,
                    charset, params);
            String nextPageUrlPattern = pageParser.extractNextPageUrlPattern(
                    bestSellerHtml);
            // For suning the first listing page is re-fetched through the
            // paging URL before it is parsed.
            if (categoryUrl.indexOf("suning") > 0) {
                String firstPageUrl = pageParser.getNextPageUrl(
                        nextPageUrlPattern, 1);
                bestSellerHtml = crawler.crawl(firstPageUrl);
            }

            // The parser may return null; keep the empty list in that case so
            // the paging loop below can never dereference null.
            List<BestSellerDto> firstPage = pageParser.extractBestSeller(
                    bestSellerHtml);
            if (firstPage != null) {
                bestSellers = firstPage;
            }

            if (nextPageUrlPattern != null) {
                for (int pageNum = 2; pageNum <= pages; pageNum++) {
                    logger.info("抓取畅销榜第 " + pageNum + " 页");

                    String nextPageUrl = pageParser.getNextPageUrl(
                            nextPageUrlPattern, pageNum);
                    bestSellerHtml = crawler.crawl(nextPageUrl);
                    List<BestSellerDto> moreBestSellers = pageParser.
                            extractBestSeller(bestSellerHtml);
                    if (moreBestSellers == null || moreBestSellers.isEmpty()) {
                        break;
                    }
                    // Stop when the new page ends with the same product as the
                    // data collected so far (same page served again).
                    if (!bestSellers.isEmpty()) {
                        String lastKnownUrl = bestSellers.get(
                                bestSellers.size() - 1).getUrl();
                        String lastNewUrl = moreBestSellers.get(
                                moreBestSellers.size() - 1).getUrl();
                        if (lastKnownUrl != null
                                && lastKnownUrl.equals(lastNewUrl)) {
                            break;
                        }
                    }
                    bestSellers.addAll(moreBestSellers);
                }
            }
        } catch (IOException e) {
            logger.error("抓取 " + siteCategory.getCategoryName() + " 畅销榜时出现异常!"
                    + "URL为:" + siteCategory.getCategoryUrl(), e);
        } catch (ParserException e) {
            logger.error("解析畅销榜页面时出现异常!" + "URL为:"
                    + siteCategory.getCategoryUrl(), e);
        }

        logger.info("++++++++++++++++++++++++++++++++++");
        logger.info("畅销集合大小:" + bestSellers.size());
        if (bestSellers.size() <= amount) {
            return bestSellers;
        }

        // Copy instead of returning the subList view, so the result does not
        // keep the full over-fetched list alive.
        List<BestSellerDto> trimmed = new ArrayList<BestSellerDto>(
                bestSellers.subList(0, amount));
        logger.info("----------------------------------");
        logger.info("畅销集合大小:" + trimmed.size());

        return trimmed;
    }

 如有疑问,请联系 QQ:526151410

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics