Skip to content
This repository has been archived by the owner on Jan 22, 2020. It is now read-only.

Implemented the Rendler C++ Framework using V1 API. #45

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions cpp/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,22 @@ CXXCOMPILE = $(CXX) $(INCLUDES) $(CXXFLAGS) -c -o $@
CXXLINK = $(CXX) $(INCLUDES) $(CXXFLAGS) -o $@

default: all
all: rendler crawl_executor render_executor
all: rendler crawl_executor render_executor v1

HEADERS = rendler_helper.hpp

HEADERS_V1 = $(HEADERS) rendler_v1_executor.hpp

crawl_executor: crawl_executor.cpp $(HEADERS)
$(CXXLINK) $< $(LDFLAGS) -lboost_regex -lcurl

rendler_v1_executor.o: rendler_v1_executor.cpp $(HEADERS_V1)
$(CXXLINK) -c $<

%_v1_executor: %_v1_executor.cpp rendler_v1_executor.o $(HEADERS_V1)
$(CXXLINK) $< rendler_v1_executor.o $(LDFLAGS) -lboost_regex -lcurl

v1: rendler_v1 crawl_v1_executor render_v1_executor

%: %.cpp $(HEADERS)
$(CXXLINK) $< $(LDFLAGS)

Expand Down
165 changes: 165 additions & 0 deletions cpp/crawl_v1_executor.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <iostream>
#include <vector>

#include <boost/regex.hpp>

#include <curl/curl.h>

#include <stout/os.hpp>

#include "rendler_helper.hpp"
#include "rendler_v1_executor.hpp"

using std::cout;
using std::endl;
using std::vector;

using mesos::vectorToString;


// Write callback handed to libcurl via CURLOPT_WRITEFUNCTION in runTask.
// Appends the size * nmemb bytes starting at data to *writerData and returns
// that byte count. writerData must be non-null (checked with assert).
// NOTE(review): the return type is int while the byte count is a size_t;
// libcurl documents this callback as returning size_t - confirm and align.
static int writer(char *data, size_t size, size_t nmemb, string *writerData)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need this helper? This helper seems pretty trivial?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is passed as a function pointer to curl_easy_setopt in line 72. Can be removed if we use curl instead.

{
assert(writerData != NULL);
writerData->append(data, size*nmemb);
return size * nmemb;
}


// Executor that fetches the URL carried in a task with libcurl, extracts the
// href targets of <a> tags from the response body, and reports them to the
// framework as a serialized vector laid out as [task id, url, link...].
class CrawlV1Executor : public RendlerV1Executor
{
public:
// Forwards both ids to the RendlerV1Executor base constructor.
CrawlV1Executor(const FrameworkID& _frameworkId,
const ExecutorID& _executorId)
: RendlerV1Executor("CrawlV1Executor", _frameworkId, _executorId) {}

virtual ~CrawlV1Executor() {}

protected:
// Runs one crawl task: downloads task.data() (following redirects), resolves
// each extracted link against the effective URL, then sends the result vector
// as a framework message followed by a TASK_FINISHED status update.
//
// NOTE(review): inside the link loop the iterators lb/le are taken from link
// before the fragment is erased. std::string::erase invalidates iterators at
// or after the erase point, so the later regex_search(lb, le, ...) can run on
// an invalid end iterator. Re-acquire the iterators after the erase.
void runTask(const TaskInfo& task) override
{
string url = task.data();
cout << "Running crawl task " << task.task_id().value()
<< " Fetch: " << url;

string buffer;
vector<string> result;
result.push_back(task.task_id().value());
result.push_back(url);

CURL *conn;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any specific reason we are using libcurl here? We generally have been using curl directly because of the pointer semantics required for libcurl. @jieyu is building a wrapper for libcurl in stout that will help us avoid the pointer semantics. I would just just use regular curl till the stout version of libcurl is available.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's also from the original code. Sure I'll use curl instead.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah !! thought so. Thanks

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm assuming you're talking about running curl through system(). But in addition to the web contents, we also need libcurl to obtain the redirected URL so we can recover the absolute URL of a link with relative path.

conn = curl_easy_init();
assert(conn != NULL);
// NOTE(review): these option calls live inside assert(); when NDEBUG is
// defined the expressions are not evaluated and no option gets set.
assert(curl_easy_setopt(conn, CURLOPT_URL, url.c_str()) == CURLE_OK);
assert(curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1L) == CURLE_OK);
assert(curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer) == CURLE_OK);
assert(curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer) == CURLE_OK);

// NOTE(review): returning here skips curl_easy_cleanup(conn) below and sends
// neither a framework message nor a status update for the task.
if (curl_easy_perform(conn) != CURLE_OK) {
return;
}

// Prefer the effective (post-redirect) URL as the base for relative links;
// fall back to the requested URL when libcurl reports none.
char *tmp;
assert(curl_easy_getinfo(conn, CURLINFO_EFFECTIVE_URL, &tmp) == CURLE_OK);
string redirectUrl = url;
if (tmp != NULL) {
redirectUrl = tmp;
}
curl_easy_cleanup(conn);

// NOTE(review): find_first_of matches any single character of its argument,
// that is the first colon or slash, not the three-character scheme separator
// as a whole; find would express the intended lookup.
size_t scheme = redirectUrl.find_first_of("://");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why can't you use http::URL from libprocess/process/http.hpp?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can use process::URL. But then I'll need to write some other code snippets to reconstruct the base URL (for checking the domains) and dir URL (for reconstructing URLs from relative paths). The input URL may has either IP or domain name, may or may not include a port number, and may have other variants. Not sure if it worths the effort.

size_t sp = redirectUrl.find_first_of('/',
scheme == string::npos ? 0 : scheme + 3); // skip the http:// part.
// NOTE(review): for a URL without any path component the last slash belongs
// to the scheme separator, so dirUrl is cut inside it - confirm handling.
size_t lsp = redirectUrl.find_last_of('/'); // position of the last slash.
string baseUrl = redirectUrl.substr(0, sp); // No trailing slash.
string dirUrl = redirectUrl.substr(0, lsp); // No trailing slash.

cout << "redirectUrl " << redirectUrl << " baseUrl: " << baseUrl << endl;
cout << "dirUrl " << dirUrl << endl;

// hrefRE captures the quoted href value of an <a> tag in group 2; urlRE
// matches links that already start with a scheme.
const boost::regex hrefRE("<a\\s+[^\\>]*?href\\s*=\\s*([\"'])(.*?)\\1");
const boost::regex urlRE("^([a-zA-Z]+://).*");

boost::smatch matchHref;
string::const_iterator f = buffer.begin();
string::const_iterator l = buffer.end();

while (f != buffer.end() &&
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can do

foreach ( const string& f, buffer) {
      if (boost...) {
      }
}

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The boost::regex_search call in the next line requires iterator 'f'.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are right. But I missed another point: buffer itself is a single string containing the whole page contents. In each iteration of this loop, it looks for the next match of the RE of hyperlink anchors, and move 'f' forward to the end of the matched anchor. So it's not just iterating through a collection of strings.

boost::regex_search(f, l, matchHref, hrefRE)) {
string link = matchHref[2];
f = matchHref[0].second;

boost::smatch matchService;
string::const_iterator lb = link.begin();
string::const_iterator le = link.end();

// Remove the anchor
if (link.find_first_of('#') != string::npos) {
link.erase(link.find_first_of('#'));
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a newline.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.


if (link.empty()) {
continue;
}

if (link[0] == '/') {
link = baseUrl + link;
} else if (!boost::regex_search(lb, le, matchService, urlRE)) {
// Relative URL
link = dirUrl + "/" + link;
}
result.push_back(link);
};

sendFrameworkMessage(vectorToString(result));
sendStatusUpdate(task, TASK_FINISHED);
}
};


int main(int argc, char** argv)
{
FrameworkID frameworkId;
ExecutorID executorId;

Option<string> value;

value = os::getenv("MESOS_FRAMEWORK_ID");
if (value.isNone()) {
EXIT(EXIT_FAILURE)
<< "Expecting 'MESOS_FRAMEWORK_ID' to be set in the environment";
}
frameworkId.set_value(value.get());

value = os::getenv("MESOS_EXECUTOR_ID");
if (value.isNone()) {
EXIT(EXIT_FAILURE)
<< "Expecting 'MESOS_EXECUTOR_ID' to be set in the environment";
}
executorId.set_value(value.get());

process::Owned<CrawlV1Executor> crawler(
new CrawlV1Executor(frameworkId, executorId));

process::spawn(crawler.get());
process::wait(crawler.get());

return 0;
}
104 changes: 104 additions & 0 deletions cpp/render_v1_executor.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <libgen.h>
#include <iostream>
#include <vector>

#include <boost/regex.hpp>

#include <curl/curl.h>

#include <stout/os.hpp>

#include "rendler_helper.hpp"
#include "rendler_v1_executor.hpp"

using std::cout;
using std::endl;
using std::vector;

using mesos::vectorToString;

// Path of the render.js script passed to phantomjs; assigned in main().
static string renderJSPath;
// Directory prefix (ending in a slash) that the task id and ".png" are
// appended to when naming the rendered image; assigned in main().
static string workDirPath;


class RenderV1Executor : public RendlerV1Executor
{
public:
RenderV1Executor(const FrameworkID& _frameworkId,
const ExecutorID& _executorId)
: RendlerV1Executor("RenderV1Executor", _frameworkId, _executorId) {}

virtual ~RenderV1Executor() {}

protected:
void runTask(const TaskInfo& task) override
{
string url = task.data();
cout << "Running render task (" << task.task_id().value() << "): " << url;
string filename = workDirPath + task.task_id().value() + ".png";

vector<string> result;
result.push_back(task.task_id().value());
result.push_back(url);
result.push_back(filename);

string cmd = "QT_QPA_PLATFORM=offscreen phantomjs " + renderJSPath + " " + url + " " + filename;
assert(system(cmd.c_str()) != -1);

sendFrameworkMessage(vectorToString(result));
sendStatusUpdate(task, TASK_FINISHED);
}
};


int main(int argc, char** argv)
{
FrameworkID frameworkId;
ExecutorID executorId;

Option<string> value;

value = os::getenv("MESOS_FRAMEWORK_ID");
if (value.isNone()) {
EXIT(EXIT_FAILURE)
<< "Expecting 'MESOS_FRAMEWORK_ID' to be set in the environment";
}
frameworkId.set_value(value.get());

value = os::getenv("MESOS_EXECUTOR_ID");
if (value.isNone()) {
EXIT(EXIT_FAILURE)
<< "Expecting 'MESOS_EXECUTOR_ID' to be set in the environment";
}
executorId.set_value(value.get());

std::string path = os::realpath(::dirname(argv[0])).get();
renderJSPath = path + "/render.js";
workDirPath = path + "/rendler-work-dir/";

process::Owned<RenderV1Executor> renderer(
new RenderV1Executor(frameworkId, executorId));

process::spawn(renderer.get());
process::wait(renderer.get());

return 0;
}
1 change: 1 addition & 0 deletions cpp/rendler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ class Rendler : public Scheduler
for (size_t i = 0; i < offers.size(); i++) {
const Offer& offer = offers[i];
Resources remaining = offer.resources();
remaining.unallocate();

static Resources TASK_RESOURCES = Resources::parse(
"cpus:" + stringify<float>(CPUS_PER_TASK) +
Expand Down
Loading