自从使用Hexo在GitHub Pages上搭建博客之后,我每次在博客园(cnblogs)发布文章,都需要再手动更新一次Hexo。Hexo使用markdown格式来写文章,手动更新就得先把文章内容转换成md文件再上传。后来想到,爬虫本身就可以提取页面中的各种元素,而markdown又是一种标记语法,那么用爬虫分析文章元素、提取主要内容,再根据模板自动生成对应的md文件,理论上是可行的。
由于我的hexo博客使用的是默认布局,所以在hexo目录下直接执行 `hexo new model`:
这样就生成了一个model.md文件,接下来就是把这个文件改装成所需的模板,打开model.md,能看到默认布局hexo文章只有一个简单的front-matter区域用来进行文章变量的指定:
---
title: model
date: 2018-11-22 09:49:22
tags:
---
按照官方文档,front-matter用于文章变量的指定,本身不会作为markdown被解析,所以模板布局可以分为两个部分:
---
{{front-matter}}
---
{{markdown}}
接下来就是对文章元素的提取并且转化成相应的hexo文章内容,并且拼接填充至模板当中了。
这里我简单封装了一个MarkdownGenerator类用来把文章转换成markdown文件:
namespace Root;

use GuzzleHttp\Client;
use Symfony\Component\DomCrawler\Crawler;

/**
 * Downloads a cnblogs article and writes it out as a Hexo post.
 *
 * The article title, date and body are extracted from the page, links and
 * images are rewritten to markdown / Hexo tags, images are saved into an
 * asset folder named after the document, and the result is merged into the
 * `model.md` template (placeholders `{{front-matter}}` and `{{markdown}}`).
 */
class MarkdownGenerator
{
    /** @var Client HTTP client used for the article page and for its images. */
    private $client;

    /** @var Crawler DOM crawler holding the most recently fetched page. */
    private $crawler;

    /** @var string URL of the article to convert. */
    private $url;

    /** @var string Directory (with trailing slash) that receives downloaded images. */
    private $assetPath;

    /** @var string[] Converted top-level body elements, in document order. */
    private $contentsArray = [];

    /** @var string[] Values written to the front-matter `categories` list. */
    private $categories = [];

    /** @var string[] Values written to the front-matter `tags` list. */
    private $tags = [];

    /** @var string|null Article title extracted from the page. */
    private $title;

    /** @var string|null Publication date string extracted from the page. */
    private $dateString;

    /** @var string Base name of the generated `.md` file and of the asset folder. */
    private $documentName;

    /**
     * @param string $documentName Base name used for `<name>.md` and for the
     *                             `<name>/` asset folder next to this file.
     *
     * @throws \RuntimeException When the asset folder cannot be created.
     */
    public function __construct($documentName)
    {
        $this->documentName = $documentName;
        $this->assetPath = __DIR__ . "/{$this->documentName}/";

        // The mode must be an octal literal: decimal 755 is 01363, not rwxr-xr-x.
        if (!is_dir($this->assetPath) && !mkdir($this->assetPath, 0755, true) && !is_dir($this->assetPath)) {
            throw new \RuntimeException("Unable to create asset directory: {$this->assetPath}");
        }

        $headers = [
            'user-agent' => 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        ];
        $this->client = new Client([
            'timeout' => 20,
            'headers' => $headers,
        ]);
        $this->crawler = new Crawler();
    }

    /**
     * @param string $url URL of the article page to fetch.
     */
    public function setUrl($url = '')
    {
        $this->url = $url;
    }

    /**
     * @param string[] $tags Tags to list in the front-matter.
     */
    public function setTags($tags = [])
    {
        $this->tags = $tags;
    }

    /**
     * @param string[] $categories Categories to list in the front-matter.
     */
    public function setCategories($categories = [])
    {
        $this->categories = $categories;
    }

    /**
     * Fetches the article, converts it and writes `<documentName>.md`.
     *
     * Errors raised while parsing or writing are caught and their message is
     * printed; errors raised by the initial page request are not caught here.
     */
    public function generate()
    {
        $responseContent = $this->client->request('GET', $this->url)->getBody()->getContents();

        // Start from a clean state so a second call does not append to the first result.
        $this->contentsArray = [];
        $this->crawler = new Crawler();
        $this->crawler->addHtmlContent($responseContent);

        try {
            $this->title = trim($this->crawler->filterXPath('//h1[@class="postTitle"]')->text());
            $this->dateString = trim($this->crawler->filterXPath('//span[@id="post-date"]')->text());
            $this->crawler->filterXPath('//div[@id="cnblogs_post_body"]')->children()->each(function (Crawler $node) {
                $this->contentsArray[] = $this->parseParagraph($node);
            });

            $template = file_get_contents('model.md');
            if ($template === false) {
                throw new \RuntimeException('Unable to read the template file model.md');
            }

            // strtr() substitutes both placeholders in one pass, so placeholder-like
            // text inside the front-matter is never substituted a second time.
            $content = strtr($template, [
                '{{front-matter}}' => $this->makeFrontMatter(),
                '{{markdown}}' => $this->makeContent(),
            ]);

            if (file_put_contents($this->documentName . '.md', $content) === false) {
                throw new \RuntimeException("Unable to write {$this->documentName}.md");
            }
        } catch (\Throwable $e) {
            print_r($e->getMessage());
        }
    }

    /**
     * Converts one top-level element of the article body.
     *
     * @param Crawler $node Element to convert.
     *
     * @return string Fenced code for `cnblogs_code` elements; otherwise the
     *                element's inner HTML with links and images rewritten.
     */
    private function parseParagraph(Crawler $node): string
    {
        // Handle code blocks first: their text is emitted verbatim, so link/image
        // conversion (and the image downloads it triggers) would be wasted work.
        if ($node->attr('class') === 'cnblogs_code') {
            // The second argument disables whitespace normalisation on DomCrawler
            // versions that support it, keeping the code's line breaks; older
            // versions ignore the extra arguments.
            $plainCodes = trim($node->text(null, false));

            return "```\n{$plainCodes}\n```";
        }

        return $this->convertImages($this->convertLinks($node->html()));
    }

    /**
     * Rewrites `<a href="...">text</a>` elements as markdown `[text](url)`.
     */
    private function convertLinks(string $html): string
    {
        $linkPattern = '#<a\b[^>]+\bhref="([^"]*)"[^>]*>([\s\S]*?)</a>#';
        $converted = preg_replace_callback($linkPattern, function (array $match) {
            return "[{$match[2]}]({$match[1]})";
        }, $html);

        // preg_replace_callback() returns null on a PCRE error; keep the input then.
        return $converted === null ? $html : $converted;
    }

    /**
     * Downloads every `<img>` that has a `src` and replaces the tag with a
     * Hexo `asset_img` tag. Attribute order does not matter and `alt` is optional.
     */
    private function convertImages(string $html): string
    {
        $converted = preg_replace_callback('#<img\b[^>]*>#i', function (array $match) {
            $tag = $match[0];
            if (!preg_match('#\ssrc="([^"]*)"#i', $tag, $src)) {
                return $tag; // nothing to download, leave the tag untouched
            }
            $alt = preg_match('#\salt="([^"]*)"#i', $tag, $altMatch) ? $altMatch[1] : '';

            // Attribute values are entity-escaped in serialized HTML (e.g. &amp;).
            $imageName = $this->downloadImage(html_entity_decode($src[1], ENT_QUOTES, 'UTF-8'));

            return "{% asset_img {$imageName} {$alt} %}";
        }, $html);

        return $converted === null ? $html : $converted;
    }

    /**
     * Saves the image at $imageUrl into the asset folder.
     *
     * @return string File name the image was stored under.
     *
     * @throws \RuntimeException When the image cannot be written to disk.
     */
    private function downloadImage(string $imageUrl): string
    {
        // Name the file after the URL path only, so a query string does not end up in it.
        $path = parse_url($imageUrl, PHP_URL_PATH);
        $imageName = basename(is_string($path) && $path !== '' ? $path : $imageUrl);

        $image = $this->client->get($imageUrl)->getBody()->getContents();
        if (file_put_contents($this->assetPath . $imageName, $image) === false) {
            throw new \RuntimeException("Unable to save image: {$this->assetPath}{$imageName}");
        }

        return $imageName;
    }

    /**
     * Joins the converted body elements with blank lines between them.
     */
    private function makeContent(): string
    {
        // Glue first: the legacy (array, glue) argument order is rejected by PHP 8.
        return implode("\n\n", $this->contentsArray);
    }

    /**
     * Builds the front-matter body (without the surrounding `---` lines).
     */
    private function makeFrontMatter(): string
    {
        $res = "title: {$this->title}\ndate: {$this->dateString}";

        if (!empty($this->categories)) {
            $res .= "\ncategories:\n- " . implode("\n- ", $this->categories);
        }
        if (!empty($this->tags)) {
            $res .= "\ntags:\n- " . implode("\n- ", $this->tags);
        }

        return $res;
    }
}
接下来只需实例化MarkdownGenerator类,设置各项属性,再调用generate方法,就能抓取文章并生成markdown文件了:
// Base name of the output: the post is written to test.md and its images to test/.
$documentName = 'test';
// Article to convert.
$articleUrl = 'https://www.cnblogs.com/jackiebao/p/8466232.html';

$generator = new MarkdownGenerator($documentName);
$generator->setUrl($articleUrl);
$generator->setCategories(['test_cat']);
$generator->setTags(['test1', 'test2']);

// Fetch the article and write the markdown file.
$generator->generate();
P.S.本篇文章hexo版本即为该脚本生成。