try to parse given url before using embedded links

This commit is contained in:
Athou
2013-08-06 13:49:03 +02:00
parent 2bff335698
commit a72e08c0c6

View File

@@ -36,15 +36,26 @@ public class FeedFetcher {
FetchedFeed fetchedFeed = null; FetchedFeed fetchedFeed = null;
int timeout = 20000; int timeout = 20000;
HttpResult result = getter.getBinary(feedUrl, lastModified, eTag, timeout); HttpResult result = getter.getBinary(feedUrl, lastModified, eTag, timeout);
if (extractFeedUrlFromHtml) { byte[] content = result.getContent();
String extractedUrl = extractFeedUrl(StringUtils.newStringUtf8(result.getContent()), feedUrl);
if (org.apache.commons.lang.StringUtils.isNotBlank(extractedUrl)) { try {
result = getter.getBinary(extractedUrl, lastModified, eTag, timeout); fetchedFeed = parser.parse(feedUrl, content);
feedUrl = extractedUrl; } catch (FeedException e) {
if (extractFeedUrlFromHtml) {
String extractedUrl = extractFeedUrl(StringUtils.newStringUtf8(result.getContent()), feedUrl);
if (org.apache.commons.lang.StringUtils.isNotBlank(extractedUrl)) {
feedUrl = extractedUrl;
result = getter.getBinary(extractedUrl, lastModified, eTag, timeout);
content = result.getContent();
fetchedFeed = parser.parse(feedUrl, content);
}
} else {
throw e;
} }
} }
byte[] content = result.getContent();
if (content == null) { if (content == null) {
throw new IOException("Feed content is empty."); throw new IOException("Feed content is empty.");
@@ -56,8 +67,6 @@ public class FeedFetcher {
throw new NotModifiedException("content hash not modified"); throw new NotModifiedException("content hash not modified");
} }
fetchedFeed = parser.parse(feedUrl, content);
if (lastPublishedDate != null && fetchedFeed.getFeed().getLastPublishedDate() != null if (lastPublishedDate != null && fetchedFeed.getFeed().getLastPublishedDate() != null
&& lastPublishedDate.getTime() == fetchedFeed.getFeed().getLastPublishedDate().getTime()) { && lastPublishedDate.getTime() == fetchedFeed.getFeed().getLastPublishedDate().getTime()) {
log.debug("publishedDate not modified: {}", feedUrl); log.debug("publishedDate not modified: {}", feedUrl);