From 3650b18b3e800d86937e3209e7eead8872978d01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Mon, 15 Jul 2024 20:24:44 +0200 Subject: [PATCH 01/72] Stream rewrite URLs in a remote WXR file Brings together a few explorations to stream-rewrite site URLs in a WXR file coming from a remote server. All of that with no curl, DOMDocument, or other PHP dependencies. It's just a few small libraries built with WordPress core in mind: * [AsyncHttp\Client](https://github.com/WordPress/blueprints/pull/52) * [WP_XML_Processor](https://github.com/WordPress/wordpress-develop/pull/6713) * [WP_Block_Markup_Url_Processor](https://github.com/adamziel/site-transfer-protocol) * [WP_HTML_Tag_Processor](https://developer.wordpress.org/reference/classes/wp_html_tag_processor/) Here's what the rewriter looks like: ```php $wxr_url = "https://raw.githubusercontent.com/WordPress/blueprints/normalize-wxr-assets/blueprints/stylish-press-clone/woo-products.wxr"; $xml_processor = new WP_XML_Processor('', [], WP_XML_Processor::IN_PROLOG_CONTEXT); foreach( stream_remote_file( $wxr_url ) as $chunk ) { $xml_processor->stream_append_xml($chunk); foreach ( xml_next_content_node_for_rewriting( $xml_processor ) as $text ) { $string_new_site_url = 'https://mynew.site/'; $parsed_new_site_url = WP_URL::parse( $string_new_site_url ); $current_site_url = 'https://raw.githubusercontent.com/wordpress/blueprints/normalize-wxr-assets/blueprints/stylish-press-clone/wxr-assets/'; $parsed_current_site_url = WP_URL::parse( $current_site_url ); $base_url = 'https://playground.internal'; $url_processor = new WP_Block_Markup_Url_Processor( $text, $base_url ); foreach ( html_next_url( $url_processor, $current_site_url ) as $parsed_matched_url ) { $updated_raw_url = rewrite_url( $url_processor->get_raw_url(), $parsed_matched_url, $parsed_current_site_url, $parsed_new_site_url ); $url_processor->set_raw_url( $updated_raw_url ); } $updated_text = $url_processor->get_updated_html(); if ($updated_text 
!== $text) { $xml_processor->set_modifiable_text($updated_text); } } echo $xml_processor->get_processed_xml(); } echo $xml_processor->get_unprocessed_xml(); ``` --- bootstrap.php | 53 +++++ class-wp-html-tag-processor.php | 296 +++++++++++++++++++-------- class-wp-wxr-normalizer.php | 236 ++++++++++++++++++++++ class-wp-xml-processor.php | 22 +++ class-wp-xml-tag-processor.php | 29 ++- functions.php | 65 ++++++ rewrite-remote-wxr.php | 175 ++++++++++++++++ rewrite-wxr.php | 341 +------------------------------- site-transfer-protocol | 1 + test-data/woo-products.wxr | 1 - 10 files changed, 790 insertions(+), 429 deletions(-) create mode 100644 bootstrap.php create mode 100644 class-wp-wxr-normalizer.php create mode 100644 functions.php create mode 100644 rewrite-remote-wxr.php create mode 160000 site-transfer-protocol diff --git a/bootstrap.php b/bootstrap.php new file mode 100644 index 0000000..1764edc --- /dev/null +++ b/bootstrap.php @@ -0,0 +1,53 @@ +' )`. + * style of including JavaScript inside of HTML comments to avoid accidentally + * closing the SCRIPT from inside a JavaScript string. E.g. `console.log( '' )`. * - `TITLE` and `TEXTAREA` whose contents are treated as plaintext and then any * character references are decoded. E.g. `1 < 2 < 3` becomes `1 < 2 < 3`. * - `IFRAME`, `NOSCRIPT`, `NOEMBED`, `NOFRAME`, `STYLE` whose contents are treated as @@ -1524,21 +1524,10 @@ private function parse_next_tag() { $was_at = $this->bytes_already_parsed; $at = $was_at; - while ( false !== $at && $at < $doc_length ) { + while ( $at < $doc_length ) { $at = strpos( $html, '<', $at ); - - /* - * This does not imply an incomplete parse; it indicates that there - * can be nothing left in the document other than a #text node. 
- */ if ( false === $at ) { - $this->parser_state = self::STATE_TEXT_NODE; - $this->token_starts_at = $was_at; - $this->token_length = strlen( $html ) - $was_at; - $this->text_starts_at = $was_at; - $this->text_length = $this->token_length; - $this->bytes_already_parsed = strlen( $html ); - return true; + break; } if ( $at > $was_at ) { @@ -1554,19 +1543,9 @@ private function parse_next_tag() { * * @see https://html.spec.whatwg.org/#tag-open-state */ - if ( strlen( $html ) > $at + 1 ) { - $next_character = $html[ $at + 1 ]; - $at_another_node = ( - '!' === $next_character || - '/' === $next_character || - '?' === $next_character || - ( 'A' <= $next_character && $next_character <= 'Z' ) || - ( 'a' <= $next_character && $next_character <= 'z' ) - ); - if ( ! $at_another_node ) { - ++$at; - continue; - } + if ( 1 !== strspn( $html, '!/?abcdefghijklmnopqrstuvwxyzABCEFGHIJKLMNOPQRSTUVWXYZ', $at + 1, 1 ) ) { + ++$at; + continue; } $this->parser_state = self::STATE_TEXT_NODE; @@ -1630,11 +1609,7 @@ private function parse_next_tag() { * `') !== false || + strpos($new_value, '--!>') !== false + ) + ) { + _doing_it_wrong( + __METHOD__, + __( 'Cannot set a comment closer as a text of an HTML comment.' ), + 'WP_VERSION' + ); + return false; + } + if( + $p->get_token_type() === '#cdata-section' && + strpos($new_value, '>') !== false + ) { + _doing_it_wrong( + __METHOD__, + __( 'Cannot set a CDATA closer as text of an HTML CDATA-lookalike section.' ), + 'WP_VERSION' + ); + return false; + } + $lexical_updates_now = $lexical_updates->getValue($p); + $lexical_updates_now[] = new WP_HTML_Text_Replacement( + $accessible_text_starts_at->getValue($p), + $accessible_text_length->getValue($p), + $new_value + ); + $lexical_updates->setValue($p, $lexical_updates_now); + return true; + default: + _doing_it_wrong( + __METHOD__, + __( 'Cannot set text content on a non-text node.' 
), + 'WP_VERSION' + ); + return false; + } + } +} diff --git a/class-wp-xml-processor.php b/class-wp-xml-processor.php index a18e35f..9156f8f 100644 --- a/class-wp-xml-processor.php +++ b/class-wp-xml-processor.php @@ -88,6 +88,28 @@ public static function stream_tokens( $input_stream, $output_stream, $buffer_siz } } + /** + * Wipes out the processed XML and appends the next chunk of XML to + * any remaining unprocessed XML. + * + * @param string $next_chunk XML to append. + */ + public function stream_append_xml( $next_chunk ) + { + $this->get_updated_xml(); + + $new_xml = $this->get_unprocessed_xml() . $next_chunk; + $breadcrumbs = $this->get_breadcrumbs(); + $parser_context = $this->get_parser_context(); + + $this->reset_state(); + + $this->xml = $new_xml; + $this->stack_of_open_elements = $breadcrumbs; + $this->parser_context = $parser_context; + $this->had_previous_chunks = true; + } + /** * Constructor. * diff --git a/class-wp-xml-tag-processor.php b/class-wp-xml-tag-processor.php index db06e54..45ea8f4 100644 --- a/class-wp-xml-tag-processor.php +++ b/class-wp-xml-tag-processor.php @@ -337,7 +337,7 @@ class WP_XML_Tag_Processor { * @since WP_VERSION * @var string */ - protected $xml; + public $xml; /** * The last query passed to next_tag(). @@ -428,7 +428,7 @@ class WP_XML_Tag_Processor { * @since WP_VERSION * @var int */ - private $bytes_already_parsed = 0; + public $bytes_already_parsed = 0; /** * Byte offset in input document where current token starts. 
@@ -1751,6 +1751,31 @@ private function after_tag() { $this->is_closing_tag = null; $this->attributes = array(); } + + protected function reset_state() + { + $this->xml = ''; + $this->last_query = null; + $this->sought_tag_name = null; + $this->sought_match_offset = 0; + $this->stop_on_tag_closers = false; + $this->parser_state = self::STATE_READY; + $this->is_incomplete_text_node = false; + $this->bytes_already_parsed = 0; + $this->token_starts_at = null; + $this->token_length = null; + $this->tag_name_starts_at = null; + $this->tag_name_length = null; + $this->text_starts_at = null; + $this->text_length = null; + $this->is_closing_tag = null; + $this->last_error = null; + $this->attributes = array(); + $this->bookmarks = array(); + $this->lexical_updates = array(); + $this->seek_count = 0; + $this->had_previous_chunks = false; + } /** * Applies attribute updates to XML document. diff --git a/functions.php b/functions.php new file mode 100644 index 0000000..3373d43 --- /dev/null +++ b/functions.php @@ -0,0 +1,65 @@ + $local_file) { + $request = new Request($asset_url); + $requests[] = $request; + $local_paths[$request->id] = $local_file; + } + + $client = new Client( [ + 'concurrency' => 10, + ] ); + $client->enqueue( $requests ); + + $results = []; + while ( $client->await_next_event() ) { + $request = $client->get_request(); + + switch ( $client->get_event() ) { + case Client::EVENT_BODY_CHUNK_AVAILABLE: + file_put_contents( + $local_paths[$request->original_request()->id], + $client->get_response_body_chunk(), + FILE_APPEND + ); + break; + case Client::EVENT_FAILED: + $results[$request->original_request()->url] = [ + 'success' => false, + 'error' => $request->error, + ]; + break; + case Client::EVENT_FINISHED: + $results[$request->original_request()->url] = [ + 'success' => true + ]; + break; + } + } + return $results; +} + +/** + * WordPress compat + */ +if(!function_exists('esc_attr')) { + function esc_attr($text) { + return htmlspecialchars($text, ENT_XML1, 
'UTF-8'); + } +} + +function serialize_url($parsedUrl) { + return (isset($parsedUrl['scheme']) ? $parsedUrl['scheme'] . '://' : '') + . (isset($parsedUrl['user']) ? $parsedUrl['user'] . (isset($parsedUrl['pass']) ? ':' . $parsedUrl['pass'] : '') .'@' : '') + . $parsedUrl['host'] + . (isset($parsedUrl['port']) ? ':' . $parsedUrl['port'] : '') + . (isset($parsedUrl['path']) ? $parsedUrl['path'] : '') + . (isset($parsedUrl['query']) ? '?' . $parsedUrl['query'] : '') + . (isset($parsedUrl['fragment']) ? '#' . $parsedUrl['fragment'] : ''); +} diff --git a/rewrite-remote-wxr.php b/rewrite-remote-wxr.php new file mode 100644 index 0000000..b57e6ca --- /dev/null +++ b/rewrite-remote-wxr.php @@ -0,0 +1,175 @@ + WP_XML_Processor -> WP_Block_Markup_Url_Processor -> WP_Migration_URL_In_Text_Processor -> WP_URL + * + * The layers of data we're handling here are: + * + * * AsyncHttp\Client: HTTPS encrypted data -> Chunked encoding -> Gzip compression + * * WP_XML_Processor: XML (entities, attributes, text, comments, CDATA nodes) + * * WP_Block_Markup_Url_Processor: HTML (entities, attributes, text, comments, block comments), JSON (in block comments) + * * WP_Migration_URL_In_Text_Processor: URLs in text nodes + * * WP_URL: URL parsing and serialization + * + * It wouldn't be difficult to pipe through additioanl layers such as: + * + * * Reading from a remote ZIP file + * * Writing to a local ZIP-ped XML file + * * Writing to a database + * + * ...etc. + */ + +require __DIR__ . 
'/bootstrap.php'; + +use \WordPress\AsyncHttp\Client; +use \WordPress\AsyncHttp\Request; + +$wxr_url = "https://raw.githubusercontent.com/WordPress/blueprints/normalize-wxr-assets/blueprints/stylish-press-clone/woo-products.wxr"; +$xml_processor = new WP_XML_Processor('', [], WP_XML_Processor::IN_PROLOG_CONTEXT); +foreach( stream_remote_file( $wxr_url ) as $chunk ) { + $xml_processor->stream_append_xml($chunk); + foreach ( xml_next_content_node_for_rewriting( $xml_processor ) as $text ) { + $string_new_site_url = 'https://mynew.site/'; + $parsed_new_site_url = WP_URL::parse( $string_new_site_url ); + + $current_site_url = 'https://raw.githubusercontent.com/wordpress/blueprints/normalize-wxr-assets/blueprints/stylish-press-clone/wxr-assets/'; + $parsed_current_site_url = WP_URL::parse( $current_site_url ); + + $base_url = 'https://playground.internal'; + $url_processor = new WP_Block_Markup_Url_Processor( $text, $base_url ); + + foreach ( html_next_url( $url_processor, $current_site_url ) as $parsed_matched_url ) { + $updated_raw_url = rewrite_url( + $url_processor->get_raw_url(), + $parsed_matched_url, + $parsed_current_site_url, + $parsed_new_site_url + ); + $url_processor->set_raw_url( $updated_raw_url ); + } + + $updated_text = $url_processor->get_updated_html(); + if ($updated_text !== $text) { + $xml_processor->set_modifiable_text($updated_text); + } + } + echo $xml_processor->get_processed_xml(); +} +echo $xml_processor->get_unprocessed_xml(); + +// The rest of this file are functions used in the above code + +function stream_remote_file($url) +{ + $requests = [ + new Request($url) + ]; + $client = new Client(); + $client->enqueue($requests); + + while ($client->await_next_event()) { + switch ($client->get_event()) { + case Client::EVENT_BODY_CHUNK_AVAILABLE: + yield $client->get_response_body_chunk(); + break; + } + } +} + +function xml_next_content_node_for_rewriting(WP_XML_Processor $processor) { + while($processor->next_token()) { + if (!in_array('item', 
$processor->get_breadcrumbs())) { + continue; + } + if ( + !in_array('excerpt:encoded', $processor->get_breadcrumbs()) + && !in_array('content:encoded', $processor->get_breadcrumbs()) + && !in_array('wp:attachment_url', $processor->get_breadcrumbs()) + && !in_array('guid', $processor->get_breadcrumbs()) + && !in_array('link', $processor->get_breadcrumbs()) + && !in_array('wp:comment_content', $processor->get_breadcrumbs()) + // Meta values are not suppoerted yet. We'll need to support + // WordPress core options that may be saved as JSON, PHP Deserialization, and XML, + // and then provide extension points for plugins authors support + // their own options. + // !in_array('wp:postmeta', $processor->get_breadcrumbs()) + ) { + continue; + } + + switch ($processor->get_token_type()) { + case '#text': + case '#cdata-section': + $text = $processor->get_modifiable_text(); + yield $text; + break; + } + } +} + +/** + * + * @param mixed $options + * @return Generator + */ +function html_next_url(WP_Block_Markup_Url_Processor $p, $current_site_url) { + $parsed_current_site_url = WP_URL::parse( $current_site_url ); + $decoded_current_site_pathname = urldecode( $parsed_current_site_url->pathname ); + + while ( $p->next_url() ) { + $parsed_matched_url = $p->get_parsed_url(); + if ( $parsed_matched_url->hostname === $parsed_current_site_url->hostname ) { + $decoded_matched_pathname = urldecode( $parsed_matched_url->pathname ); + $pathname_matches = str_starts_with( $decoded_matched_pathname, $decoded_current_site_pathname ); + if ( ! $pathname_matches ) { + continue; + } + + // It's a match! 
+ yield $parsed_matched_url; + } + } +} + +function rewrite_url( + string $raw_matched_url, + $parsed_matched_url, + $parsed_current_site_url, + $parsed_new_site_url, +) { + // Let's rewrite the URL + $parsed_matched_url->hostname = $parsed_new_site_url->hostname; + $decoded_matched_pathname = urldecode( $parsed_matched_url->pathname ); + + // Short-circuit for empty pathnames + if ('/' !== $parsed_current_site_url->pathname) { + $parsed_matched_url->pathname = + $parsed_new_site_url->pathname . + substr( + $decoded_matched_pathname, + strlen(urldecode($parsed_current_site_url->pathname)) + ); + } + + /* + * Stylistic choice – if the matched URL has no trailing slash, + * do not add it to the new URL. The WHATWG URL parser will + * add one automatically if the path is empty, so we have to + * explicitly remove it. + */ + $new_raw_url = $parsed_matched_url->toString(); + if ( + $raw_matched_url[strlen($raw_matched_url) - 1] !== '/' && + $parsed_matched_url->pathname === '/' && + $parsed_matched_url->search === '' && + $parsed_matched_url->hash === '' + ) { + $new_raw_url = rtrim($new_raw_url, '/'); + } + + return $new_raw_url; +} diff --git a/rewrite-wxr.php b/rewrite-wxr.php index 3bfa0d1..f1fe965 100644 --- a/rewrite-wxr.php +++ b/rewrite-wxr.php @@ -25,51 +25,8 @@ * [2] ZipStreamWriter: https://github.com/WordPress/blueprints-library/blob/f9fcb5816ab6def0920b25787341342bc88803e3/src/WordPress/Zip/ZipStreamWriter.php * [3] AsyncHttpClient: https://github.com/WordPress/blueprints-library/blob/trunk/src/WordPress/AsyncHttp/Client.php */ - -use \WordPress\AsyncHttp\Client; -use \WordPress\AsyncHttp\Request; -// Where to find the streaming WP_XML_Processor -// Use a version from this PR: https://github.com/adamziel/wordpress-develop/pull/43 -define('WP_XML_API_PATH', __DIR__ ); -define('BLUEPRINTS_LIB_PATH', __DIR__ . '/blueprints-library/src/WordPress' ); -if(!file_exists(WP_XML_API_PATH . 
'/class-wp-token-map.php')) { - copy(WP_XML_API_PATH.'/../class-wp-token-map.php', WP_XML_API_PATH . '/class-wp-token-map.php'); -} - -$requires[] = WP_XML_API_PATH . "/class-wp-html-token.php"; -$requires[] = WP_XML_API_PATH . "/class-wp-html-span.php"; -$requires[] = WP_XML_API_PATH . "/class-wp-html-text-replacement.php"; -$requires[] = WP_XML_API_PATH . "/class-wp-html-decoder.php"; -$requires[] = WP_XML_API_PATH . "/class-wp-html-attribute-token.php"; - -$requires[] = WP_XML_API_PATH . "/class-wp-html-decoder.php"; -$requires[] = WP_XML_API_PATH . "/class-wp-html-tag-processor.php"; -$requires[] = WP_XML_API_PATH . "/class-wp-html-open-elements.php"; -$requires[] = WP_XML_API_PATH . "/class-wp-token-map.php"; -$requires[] = WP_XML_API_PATH . "/html5-named-character-references.php"; -$requires[] = WP_XML_API_PATH . "/class-wp-html-active-formatting-elements.php"; -$requires[] = WP_XML_API_PATH . "/class-wp-html-processor-state.php"; -$requires[] = WP_XML_API_PATH . "/class-wp-html-unsupported-exception.php"; -$requires[] = WP_XML_API_PATH . "/class-wp-html-processor.php"; - -$requires[] = WP_XML_API_PATH . "/class-wp-xml-decoder.php"; -$requires[] = WP_XML_API_PATH . "/class-wp-xml-tag-processor.php"; -$requires[] = WP_XML_API_PATH . "/class-wp-xml-processor.php"; -$requires[] = BLUEPRINTS_LIB_PATH . "/Streams/StreamWrapperInterface.php"; -$requires[] = BLUEPRINTS_LIB_PATH . "/Streams/StreamWrapper.php"; -$requires[] = BLUEPRINTS_LIB_PATH . "/Streams/StreamPeekerWrapper.php"; -$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/Request.php"; -$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/Response.php"; -$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/HttpError.php"; -$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/Connection.php"; -$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/Client.php"; -$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/StreamWrapper/ChunkedEncodingWrapper.php"; -$requires[] = BLUEPRINTS_LIB_PATH . 
"/AsyncHttp/StreamWrapper/InflateStreamWrapper.php"; - -foreach ($requires as $require) { - require_once $require; -} +require __DIR__ . '/bootstrap.php'; if (!Phar::running() && in_array('--bundle', $argv)) { bundlePhar('preprocess-wxr.phar', array_merge( @@ -198,7 +155,7 @@ function ($url) { return $url; } $url_to_path[$url] = $details['download_path']; } } -wp_download_files([ +wxr_download_files([ 'concurrency' => 10, 'assets' => $url_to_path ]); @@ -227,297 +184,3 @@ function ($url) use($assets_details) { $normalizer->process(); fclose($input_stream); fclose($output_stream); - -function wp_download_files($options) { - $requests = []; - $local_paths = []; - foreach ($options['assets'] as $asset_url => $local_file) { - $request = new Request($asset_url); - $requests[] = $request; - $local_paths[$request->id] = $local_file; - } - - $client = new Client( [ - 'concurrency' => 10, - ] ); - $client->enqueue( $requests ); - - $results = []; - while ( $client->await_next_event() ) { - $request = $client->get_request(); - - switch ( $client->get_event() ) { - case Client::EVENT_BODY_CHUNK_AVAILABLE: - file_put_contents( - $local_paths[$request->original_request()->id], - $client->get_response_body_chunk(), - FILE_APPEND - ); - break; - case Client::EVENT_FAILED: - $results[$request->original_request()->url] = [ - 'success' => false, - 'error' => $request->error, - ]; - break; - case Client::EVENT_FINISHED: - $results[$request->original_request()->url] = [ - 'success' => true - ]; - break; - } - } - return $results; -} - -/** - * WordPress compat - */ -function esc_attr($text) { - return htmlspecialchars($text, ENT_XML1, 'UTF-8'); -} - -function serialize_url($parsedUrl) { - return (isset($parsedUrl['scheme']) ? $parsedUrl['scheme'] . '://' : '') - . (isset($parsedUrl['user']) ? $parsedUrl['user'] . (isset($parsedUrl['pass']) ? ':' . $parsedUrl['pass'] : '') .'@' : '') - . $parsedUrl['host'] - . (isset($parsedUrl['port']) ? ':' . $parsedUrl['port'] : '') - . 
(isset($parsedUrl['path']) ? $parsedUrl['path'] : '') - . (isset($parsedUrl['query']) ? '?' . $parsedUrl['query'] : '') - . (isset($parsedUrl['fragment']) ? '#' . $parsedUrl['fragment'] : ''); -} - -class WP_WXR_Normalizer -{ - - private $input_stream; - private $output_stream; - private $rewrite_url_callback; - - private $found_urls = array(); - - public function __construct( - $input_stream, - $output_stream, - $rewrite_url_callback - ) { - $this->input_stream = $input_stream; - $this->output_stream = $output_stream; - $this->rewrite_url_callback = $rewrite_url_callback; - } - - public function get_found_urls() - { - return array_keys($this->found_urls); - } - - public function process() - { - $tokens = WP_XML_Processor::stream_tokens($this->input_stream, $this->output_stream, 1000000); - foreach ($tokens as $processor) { - if ( - in_array('item', $processor->get_breadcrumbs()) - // $processor->matches_breadcrumbs(array('item', 'content:encoded')) || - // $processor->matches_breadcrumbs(array('item', 'excerpt:encoded')) || - // $processor->matches_breadcrumbs(array('wp:comment_content')) - ) { - switch ($processor->get_token_type()) { - case '#text': - case '#cdata-section': - $text = $processor->get_modifiable_text(); - $updated_text = $this->process_content_node($text); - if ($updated_text !== $text) { - $processor->set_modifiable_text($updated_text); - } - break; - } - } - } - } - - private function process_content_node($text) - { - $result = $this->process_as_html($text); - if(false !== $result) { - return $result; - } - - $result = $this->process_as_plaintext($text); - if(false !== $result) { - return $result; - } - - return false; - } - - private function process_as_html($text) { - $html = new WP_HTML_Tag_Processor($text); - if(false === $html->next_token()) { - return false; - } - - do { - switch($html->get_token_type()) { - case '#comment': - $text = $html->get_modifiable_text(); - // Try to parse as a block. 
The block parser won't cut it because - // while it can parse blocks, it has no semantics for rewriting the - // block markup. Let's do our best here: - $at = strspn($text, ' \t\f\r\n'); // Whitespace - if(!( - $at + 3 < strlen($text) && - $text[$at] === 'w' && - $text[$at+1] === 'p' && - $text[$at+2] === ':' - )) { - break; - } - $at += 3; - $at += strspn($text, 'abcdefghijklmnopqrstuwxvyzABCDEFGHIJKLMNOPRQSTUWXVYZ0123456789_-', $at); // Block name - $at += strspn($text, ' \t\f\r\n', $at); // Whitespace again - if($at >= strlen($text)) { - // Oh, there were no attributes or this wasn't a block - // Either way, we have nothing more to do here. - break; - } - - // It seems we may have block attributes here. Let's try to - // parse them as JSON. - $json_maybe = substr($text, $at); - $attributes = json_decode($json_maybe, true); - if(null === $attributes) { - // This wasn't a block after all, let's move on - break; - } - - // This is a block! Let's process all block attributes and rewrite them - $new_attributes = $this->process_block_attributes($attributes); - $this->set_modifiable_html_text( - $html, - substr($text, 0, $at) . 
json_encode($new_attributes, JSON_HEX_TAG | JSON_HEX_AMP) - ); - break; - - case '#tag': - $attributes = $html->get_attribute_names_with_prefix(''); - if(!$attributes) { - break; - } - foreach($attributes as $attribute_name) { - $value = $html->get_attribute($attribute_name); - $updated = $this->process_as_plaintext($value); - if($updated !== $value) { - $html->set_attribute($attribute_name, $updated); - } - } - break; - case '#text': - $text = $html->get_modifiable_text(); - $updated_text = $this->process_as_plaintext($text); - if($updated_text !== $text) { - $this->set_modifiable_html_text($html, $updated_text); - } - break; - } - } while($html->next_token()); - - return $html->get_updated_html(); - } - - private function process_block_attributes($attributes) { - if(is_string($attributes)) { - return $this->process_as_plaintext($attributes); - } else if(is_array($attributes)) { - $new_attributes = array(); - foreach($attributes as $key => $value) { - $new_attributes[$key] = $this->process_block_attributes($value); - } - return $new_attributes; - } else { - return $attributes; - } - } - - /** - * @TODO: Investigate how bad this is – would it stand the test of time, or do we need - * a proper URL-matching state machine? 
- */ - const URL_REGEXP = '\b((?:(https?):\/\/|www\.)[-a-zA-Z0-9@:%._\+\~#=]+(?:\.[a-zA-Z0-9]{2,})+[-a-zA-Z0-9@:%_\+.\~#?&//=]*)\b'; - private function process_as_plaintext($text) { - return preg_replace_callback( - '~'.self::URL_REGEXP.'~', - function ($matches) { - $this->found_urls[$matches[0]] = true; - $replacer = $this->rewrite_url_callback; - return $replacer($matches[0]); - }, - $text - ); - } - - private function set_modifiable_html_text(WP_HTML_Tag_Processor $p, $new_value) { - $reflection = new ReflectionClass('WP_HTML_Tag_Processor'); - $accessible_text_starts_at = $reflection->getProperty('text_starts_at'); - $accessible_text_starts_at->setAccessible(true); - - $accessible_text_length = $reflection->getProperty('text_length'); - $accessible_text_length->setAccessible(true); - - $lexical_updates = $reflection->getProperty('lexical_updates'); - $lexical_updates->setAccessible(true); - - switch ( $p->get_token_type() ) { - case '#text': - $lexical_updates_now = $lexical_updates->getValue($p); - $lexical_updates_now[] = new WP_HTML_Text_Replacement( - $accessible_text_starts_at->getValue($p), - $accessible_text_length->getValue($p), - htmlspecialchars( $new_value, ENT_XML1, 'UTF-8' ) - ); - $lexical_updates->setValue($p, $lexical_updates_now); - return true; - - case '#comment': - case '#cdata-section': - if( - $p->get_token_type() === '#comment' && ( - strpos($new_value, '-->') !== false || - strpos($new_value, '--!>') !== false - ) - ) { - _doing_it_wrong( - __METHOD__, - __( 'Cannot set a comment closer as a text of an HTML comment.' ), - 'WP_VERSION' - ); - return false; - } - if( - $p->get_token_type() === '#cdata-section' && - strpos($new_value, '>') !== false - ) { - _doing_it_wrong( - __METHOD__, - __( 'Cannot set a CDATA closer as text of an HTML CDATA-lookalike section.' 
), - 'WP_VERSION' - ); - return false; - } - $lexical_updates_now = $lexical_updates->getValue($p); - $lexical_updates_now[] = new WP_HTML_Text_Replacement( - $accessible_text_starts_at->getValue($p), - $accessible_text_length->getValue($p), - $new_value - ); - $lexical_updates->setValue($p, $lexical_updates_now); - return true; - default: - _doing_it_wrong( - __METHOD__, - __( 'Cannot set text content on a non-text node.' ), - 'WP_VERSION' - ); - return false; - } - } -} diff --git a/site-transfer-protocol b/site-transfer-protocol new file mode 160000 index 0000000..3486d67 --- /dev/null +++ b/site-transfer-protocol @@ -0,0 +1 @@ +Subproject commit 3486d676a5fa2a76b719e3e6159a80e3013bed8a diff --git a/test-data/woo-products.wxr b/test-data/woo-products.wxr index 33937b9..034049d 100644 --- a/test-data/woo-products.wxr +++ b/test-data/woo-products.wxr @@ -2051,7 +2051,6 @@ open closed album - publish 0 0 product From d44f70131cab62423ce6db8c30eee13c8d39503a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 16 Jul 2024 11:48:28 +0200 Subject: [PATCH 02/72] Experiment with pipe interface --- pipes.php | 608 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 608 insertions(+) create mode 100644 pipes.php diff --git a/pipes.php b/pipes.php new file mode 100644 index 0000000..4576a74 --- /dev/null +++ b/pipes.php @@ -0,0 +1,608 @@ +finished || $this->error ) { + return false; + } + + return $this->doRead(); + } + + abstract protected function doRead(): bool; + + public function is_finished(): bool { + return $this->finished; + } + + public function get_output(): ?string { + if ( $this->buffer !== '' ) { + $data = $this->buffer; + $this->buffer = ''; + + return $data; + } + + return null; + } + + protected function set_error( string $error ) { + $this->error = $error ?: 'unknown error'; + $this->finished = true; + } + + public function get_error(): ?string { + return $this->error; + } +} + +interface WritableStream { + public 
function write( string $data ): bool; + + public function needs_more(): bool; + + public function get_error(): ?string; +} + +interface TransformStream extends ReadableStream, WritableStream { +} + +trait BaseWritableStream { + protected $error = null; + + public function write( string $data ): bool { + if ( $this->error ) { + return false; + } + + return $this->doWrite( $data ); + } + + abstract protected function doWrite( string $data ): bool; + + public function needs_more(): bool { + return true; + } + + protected function set_error( string $error ) { + $this->error = $error ?: 'unknown error'; + $this->finished = true; + } + + public function get_error(): ?string { + return $this->error; + } +} + +class BufferStream implements WritableStream { + + use BaseWritableStream; + + private $buffer = ''; + + protected function doWrite( string $data ): bool { + $this->buffer .= $data; + + return true; + } + + public function get_output(): ?string { + return $this->buffer; + } + +} + +trait BaseTransformStream { + use BaseReadableStream, BaseWritableStream { + BaseReadableStream::get_error insteadof BaseWritableStream; + BaseReadableStream::set_error insteadof BaseWritableStream; + } +} + +class BlockMarkupURLVisitorStream implements ReadableStream { + + use BaseReadableStream; + + private $url_processor; + private $url_visitor_callback; + + public function __construct( $url_processor, $url_visitor_callback ) { + $this->url_processor = $url_processor; + $this->url_visitor_callback = $url_visitor_callback; + } + + protected function doRead(): bool { + $processor = $this->url_processor; + while ( $processor->next_url() ) { + $url_visitor_callback = $this->url_visitor_callback; + $url_visitor_callback( $processor ); + } + + // The processor was initialized with a complete HTML string + // so we can be sure the processing is finished at this point. + // This class could also support streaming processing of HTML, + // but for WXR processing that's not needed. 
+ $this->finished = true; + $this->buffer .= $processor->get_updated_html(); + + return strlen( $this->buffer ) > 0; + } + +} + +class XMLProcessorStream implements TransformStream { + use BaseTransformStream; + + private $xml_processor; + private $node_visitor_callback; + + public function __construct( $node_visitor_callback ) { + $this->xml_processor = new WP_XML_Processor( '', [], WP_XML_Processor::IN_PROLOG_CONTEXT ); + $this->node_visitor_callback = $node_visitor_callback; + } + + protected function doWrite( string $data ): bool { + $this->xml_processor->stream_append_xml( $data ); + + return true; + } + + protected function doRead(): bool { + $processor = $this->xml_processor; + if ( $processor->paused_at_incomplete_token() ) { + return false; + } + + if ( $processor->get_last_error() ) { + $this->set_error( $processor->get_last_error() ); + + return false; + } + + $tokens_found = 0; + while ( $processor->next_token() ) { + ++ $tokens_found; + $node_visitor_callback = $this->node_visitor_callback; + $node_visitor_callback( $processor ); + } + + if ( $tokens_found > 0 ) { + $this->buffer .= $processor->get_updated_xml(); + } else { + $this->buffer .= $processor->get_unprocessed_xml(); + $this->finished = true; + } + + return strlen( $this->buffer ) > 0; + } + +} + +class RequestStream implements ReadableStream { + use BaseReadableStream; + + private $client; + + public function __construct( Request $request ) { + $this->client = new Client(); + $this->client->enqueue( [ $request ] ); + } + + protected function doRead(): bool { + if ( ! 
$this->client->await_next_event() ) { + $this->finished = true; + + return false; + } + + switch ( $this->client->get_event() ) { + case Client::EVENT_BODY_CHUNK_AVAILABLE: + $this->buffer .= $this->client->get_response_body_chunk(); + + return true; + case Client::EVENT_FAILED: + $this->set_error( $this->client->get_request()->error ?: 'unknown error' ); + break; + case Client::EVENT_FINISHED: + $this->finished = true; + break; + } + + return false; + } + +} + +class UppercaseTransformer implements ReadableStream, WritableStream { + private $data = ''; + private $finished = false; + private $error = null; + + public function read(): bool { + return ! empty( $this->data ); + } + + public function write( string $data ): bool { + $this->data .= strtoupper( $data ); + + return true; + } + + public function needs_more(): bool { + return ! $this->finished; + } + + public function is_finished(): bool { + return $this->finished; + } + + public function get_output(): ?string { + if ( $this->data !== '' ) { + $data = $this->data; + $this->data = ''; + + return $data; + } + + return null; + } + + public function get_error(): ?string { + return $this->error; + } + + public function set_finished() { + $this->finished = true; + } +} + +class Rot13Transformer implements ReadableStream, WritableStream { + private $data = ''; + private $finished = false; + private $error = null; + + public function read(): bool { + return ! empty( $this->data ); + } + + public function write( string $data ): bool { + $this->data .= str_rot13( $data ); + + return true; + } + + public function needs_more(): bool { + return ! 
$this->finished; + } + + public function is_finished(): bool { + return $this->finished; + } + + public function get_output(): ?string { + if ( $this->data !== '' ) { + $data = $this->data; + $this->data = ''; + + return $data; + } + + return null; + } + + public function get_error(): ?string { + return $this->error; + } + + public function set_finished() { + $this->finished = true; + } +} + +class EchoStream implements WritableStream { + private $error = null; + + public function read(): bool { + return false; // EchoConsumer does not produce data + } + + public function write( string $data ): bool { + echo $data; + + return true; + } + + public function needs_more(): bool { + return true; + } + + public function is_finished(): bool { + return false; // EchoConsumer is never finished + } + + public function get_output(): ?string { + return null; // EchoConsumer does not have data to produce + } + + public function get_error(): ?string { + return $this->error; + } +} + + +class Pipe implements ReadableStream, WritableStream { + private $stages = []; + private $error = null; + private $finished = false; + private $dataBuffer = ''; + + static public function from( $stages ) { + if ( count( $stages ) === 0 ) { + throw new \InvalidArgumentException( 'Pipe must have at least one stage' ); + } + + for ( $i = 0; $i < count( $stages ) - 1; $i ++ ) { + if ( ! $stages[ $i ] instanceof ReadableStream ) { + throw new \InvalidArgumentException( 'All stages except the last one must be ReadableStreams, but ' . get_class( $stages[ $i ] ) . ' is not' ); + } + } + + for ( $i = 1; $i < count( $stages ); $i ++ ) { + if ( ! $stages[ $i ] instanceof WritableStream ) { + throw new \InvalidArgumentException( 'All stages except the first one must be WritableStream, but ' . get_class( $stages[ $i ] ) . 
' is not' ); + } + } + + return new self( $stages ); + } + + private function __construct( $stages ) { + $this->stages = $stages; + } + + public function read(): bool { + $anyDataPiped = false; + + $stages = $this->stages; + for ( $i = 0; $i < count( $stages ) - 1; $i ++ ) { + $stage = $stages[ $i ]; + + $data = $stage->get_output(); + if ( null === $data ) { + if ( ! $stage->read() ) { + if ( $stage->get_error() ) { + $this->error = $stage->get_error(); + $this->is_finished = true; + + return false; + } + + if ( $stage->is_finished() ) { + continue; + } + break; + } + $data = $stage->get_output(); + } + + if ( null === $data ) { + break; + } + + $anyDataPiped = true; + $nextStage = $stages[ $i + 1 ]; + if ( ! $nextStage->write( $data ) ) { + $this->error = $nextStage->get_error(); + $this->is_finished = true; + break; + } + } + + $last_stage = $stages[ count( $stages ) - 1 ]; + if ( $last_stage instanceof ReadableStream && $last_stage->read() ) { + $this->dataBuffer .= $last_stage->get_output(); + if ( $last_stage->is_finished() ) { + $this->finished = true; + } + + return true; + } + + $first_stage = $stages[0]; + if ( ! $anyDataPiped && $first_stage->is_finished() ) { + $this->finished = true; + } + + return false; + } + + public function write( string $data ): bool { + if ( isset( $this->stages[0] ) && $this->stages[0] instanceof WritableStream ) { + return $this->stages[0]->write( $data ); + } + + return false; + } + + public function needs_more(): bool { + return ! 
$this->finished; + } + + public function get_output(): ?string { + $data = $this->dataBuffer; + $this->dataBuffer = ''; + + return $data; + } + + public function is_finished(): bool { + return $this->finished; + } + + public function get_error(): ?string { + return $this->error; + } +} + +function rewrite_url( + string $raw_matched_url, + $parsed_matched_url, + $parsed_current_site_url, + $parsed_new_site_url, +) { + // Let's rewrite the URL + $parsed_matched_url->hostname = $parsed_new_site_url->hostname; + $decoded_matched_pathname = urldecode( $parsed_matched_url->pathname ); + + // Short-circuit for empty pathnames + if ('/' !== $parsed_current_site_url->pathname) { + $parsed_matched_url->pathname = + $parsed_new_site_url->pathname . + substr( + $decoded_matched_pathname, + strlen(urldecode($parsed_current_site_url->pathname)) + ); + } + + /* + * Stylistic choice – if the matched URL has no trailing slash, + * do not add it to the new URL. The WHATWG URL parser will + * add one automatically if the path is empty, so we have to + * explicitly remove it. 
+ */ + $new_raw_url = $parsed_matched_url->toString(); + if ( + $raw_matched_url[strlen($raw_matched_url) - 1] !== '/' && + $parsed_matched_url->pathname === '/' && + $parsed_matched_url->search === '' && + $parsed_matched_url->hash === '' + ) { + $new_raw_url = rtrim($new_raw_url, '/'); + } + + return $new_raw_url; +} + +function create_url_rewrite_stream( + $text, + $options +) { + $string_new_site_url = $options['to_url']; + $parsed_new_site_url = WP_URL::parse( $string_new_site_url ); + + $current_site_url = $options['from_url']; + $parsed_current_site_url = WP_URL::parse( $current_site_url ); + $decoded_current_site_pathname = urldecode( $parsed_current_site_url->pathname ); + + $base_url = 'https://playground.internal'; + return new BlockMarkupURLVisitorStream( + new WP_Block_Markup_Url_Processor( $text, $base_url ), + function(WP_Block_Markup_Url_Processor $p) use($parsed_current_site_url, $decoded_current_site_pathname, $parsed_new_site_url) { + $parsed_matched_url = $p->get_parsed_url(); + if ( $parsed_matched_url->hostname === $parsed_current_site_url->hostname ) { + $decoded_matched_pathname = urldecode( $parsed_matched_url->pathname ); + $pathname_matches = str_starts_with( $decoded_matched_pathname, $decoded_current_site_pathname ); + if ( ! $pathname_matches ) { + return; + } + // It's a match! + $p->set_raw_url( rewrite_url( + $p->get_raw_url(), + $parsed_matched_url, + $parsed_current_site_url, + $parsed_new_site_url + ) ); + } + } + ); +} + +function is_wxr_content_node( WP_XML_Processor $processor ) { + if ( ! in_array( 'item', $processor->get_breadcrumbs() ) ) { + return false; + } + if ( + ! in_array( 'excerpt:encoded', $processor->get_breadcrumbs() ) + && ! in_array( 'content:encoded', $processor->get_breadcrumbs() ) + && ! in_array( 'wp:attachment_url', $processor->get_breadcrumbs() ) + && ! in_array( 'guid', $processor->get_breadcrumbs() ) + && ! in_array( 'link', $processor->get_breadcrumbs() ) + && ! 
in_array( 'wp:comment_content', $processor->get_breadcrumbs() ) + // Meta values are not suppoerted yet. We'll need to support + // WordPress core options that may be saved as JSON, PHP Deserialization, and XML, + // and then provide extension points for plugins authors support + // their own options. + // !in_array('wp:postmeta', $processor->get_breadcrumbs()) + ) { + return false; + } + + switch ( $processor->get_token_type() ) { + case '#text': + case '#cdata-section': + return true; + } + + return false; +}; + +// Create the pipe and chain the stages +$pipe = Pipe::from( [ + new RequestStream( new Request( 'https://raw.githubusercontent.com/WordPress/blueprints/normalize-wxr-assets/blueprints/stylish-press-clone/woo-products.wxr' ) ), + new XMLProcessorStream(function (WP_XML_Processor $processor) { + if(is_wxr_content_node($processor)) { + $text = $processor->get_modifiable_text(); + $pipe = Pipe::from([ + create_url_rewrite_stream( + $text, + [ + 'from_url' => 'https://raw.githubusercontent.com/wordpress/blueprints/normalize-wxr-assets/blueprints/stylish-press-clone/wxr-assets/', + 'to_url' => 'https://mynew.site/', + ] + ) + ]); + while (!$pipe->is_finished()) { + $pipe->read(); + } + + $updated_text = $pipe->get_output(); + if ( $updated_text !== $text ) { + $processor->set_modifiable_text( $updated_text ); + } + } + }), + new EchoStream(), +] ); + +$i = 0; +// Process data incrementally as it becomes available +while ( ! $pipe->is_finished() ) { + if ( ++ $i > 22 ) { + // break; + } + if ( ! 
$pipe->read() ) { + // If no new data was produced, wait a bit before trying again + usleep( 100000 ); // Sleep for 100ms + } +} + +var_dump( $pipe ); From 736783ffb11fa8c002fdb4996191c9c373ba6dbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 16 Jul 2024 12:00:20 +0200 Subject: [PATCH 03/72] Use pipes for rewrite-remote-wxr.php --- pipes.php | 328 +++++++++++++++-------------------------- rewrite-remote-wxr.php | 163 +++----------------- 2 files changed, 142 insertions(+), 349 deletions(-) diff --git a/pipes.php b/pipes.php index 4576a74..e556b95 100644 --- a/pipes.php +++ b/pipes.php @@ -1,7 +1,5 @@ finished; } - public function get_output(): ?string { + public function consume_output(): ?string { if ( $this->buffer !== '' ) { $data = $this->buffer; $this->buffer = ''; @@ -66,6 +64,13 @@ public function get_error(): ?string; interface TransformStream extends ReadableStream, WritableStream { } +trait BaseTransformStream { + use BaseReadableStream, BaseWritableStream { + BaseReadableStream::get_error insteadof BaseWritableStream; + BaseReadableStream::set_error insteadof BaseWritableStream; + } +} + trait BaseWritableStream { protected $error = null; @@ -93,11 +98,9 @@ public function get_error(): ?string { } } -class BufferStream implements WritableStream { - - use BaseWritableStream; +class BufferStream implements TransformStream { - private $buffer = ''; + use BaseTransformStream; protected function doWrite( string $data ): bool { $this->buffer .= $data; @@ -105,18 +108,12 @@ protected function doWrite( string $data ): bool { return true; } - public function get_output(): ?string { - return $this->buffer; + protected function doRead(): bool { + return strlen( $this->buffer ) > 0; } } -trait BaseTransformStream { - use BaseReadableStream, BaseWritableStream { - BaseReadableStream::get_error insteadof BaseWritableStream; - BaseReadableStream::set_error insteadof BaseWritableStream; - } -} class BlockMarkupURLVisitorStream implements 
ReadableStream { @@ -232,89 +229,33 @@ protected function doRead(): bool { } -class UppercaseTransformer implements ReadableStream, WritableStream { - private $data = ''; - private $finished = false; - private $error = null; +abstract class StringTransformerStream implements TransformStream { + use BaseTransformStream; - public function read(): bool { - return ! empty( $this->data ); + protected function doRead(): bool { + return ! empty( $this->buffer ); } - public function write( string $data ): bool { - $this->data .= strtoupper( $data ); + protected function doWrite( string $data ): bool { + $this->buffer .= strtoupper( $data ); return true; } - public function needs_more(): bool { - return ! $this->finished; - } - - public function is_finished(): bool { - return $this->finished; - } - - public function get_output(): ?string { - if ( $this->data !== '' ) { - $data = $this->data; - $this->data = ''; - - return $data; - } - - return null; - } - - public function get_error(): ?string { - return $this->error; - } - - public function set_finished() { - $this->finished = true; - } + abstract protected function transform(string $data): ?string; } -class Rot13Transformer implements ReadableStream, WritableStream { - private $data = ''; - private $finished = false; - private $error = null; +class UppercaseTransformer extends StringTransformerStream { - public function read(): bool { - return ! empty( $this->data ); - } - - public function write( string $data ): bool { - $this->data .= str_rot13( $data ); - - return true; - } - - public function needs_more(): bool { - return ! 
$this->finished; - } - - public function is_finished(): bool { - return $this->finished; - } - - public function get_output(): ?string { - if ( $this->data !== '' ) { - $data = $this->data; - $this->data = ''; - - return $data; - } - - return null; + protected function transform( string $data ): ?string { + return strtoupper( $data ); } +} - public function get_error(): ?string { - return $this->error; - } +class Rot13Transformer extends StringTransformerStream { - public function set_finished() { - $this->finished = true; + protected function transform( string $data ): ?string { + return str_rot13( $data ); } } @@ -339,7 +280,7 @@ public function is_finished(): bool { return false; // EchoConsumer is never finished } - public function get_output(): ?string { + public function consume_output(): ?string { return null; // EchoConsumer does not have data to produce } @@ -355,6 +296,20 @@ class Pipe implements ReadableStream, WritableStream { private $finished = false; private $dataBuffer = ''; + static public function run($stages) + { + $pipe = Pipe::from( $stages ); + + while (!$pipe->is_finished()) { + if ( ! $pipe->read() ) { + // If no new data was produced, wait a bit before trying again + usleep( 10000 ); // Sleep for 10ms + } + } + + return $pipe->consume_output(); + } + static public function from( $stages ) { if ( count( $stages ) === 0 ) { throw new \InvalidArgumentException( 'Pipe must have at least one stage' ); @@ -386,7 +341,7 @@ public function read(): bool { for ( $i = 0; $i < count( $stages ) - 1; $i ++ ) { $stage = $stages[ $i ]; - $data = $stage->get_output(); + $data = $stage->consume_output(); if ( null === $data ) { if ( ! 
$stage->read() ) { if ( $stage->get_error() ) { @@ -401,7 +356,7 @@ public function read(): bool { } break; } - $data = $stage->get_output(); + $data = $stage->consume_output(); } if ( null === $data ) { @@ -419,7 +374,7 @@ public function read(): bool { $last_stage = $stages[ count( $stages ) - 1 ]; if ( $last_stage instanceof ReadableStream && $last_stage->read() ) { - $this->dataBuffer .= $last_stage->get_output(); + $this->dataBuffer .= $last_stage->consume_output(); if ( $last_stage->is_finished() ) { $this->finished = true; } @@ -436,18 +391,14 @@ public function read(): bool { } public function write( string $data ): bool { - if ( isset( $this->stages[0] ) && $this->stages[0] instanceof WritableStream ) { - return $this->stages[0]->write( $data ); - } - - return false; + return $this->stages[0]->write( $data ); } public function needs_more(): bool { return ! $this->finished; } - public function get_output(): ?string { + public function consume_output(): ?string { $data = $this->dataBuffer; $this->dataBuffer = ''; @@ -463,77 +414,84 @@ public function get_error(): ?string { } } -function rewrite_url( - string $raw_matched_url, - $parsed_matched_url, - $parsed_current_site_url, - $parsed_new_site_url, -) { - // Let's rewrite the URL - $parsed_matched_url->hostname = $parsed_new_site_url->hostname; - $decoded_matched_pathname = urldecode( $parsed_matched_url->pathname ); - - // Short-circuit for empty pathnames - if ('/' !== $parsed_current_site_url->pathname) { - $parsed_matched_url->pathname = - $parsed_new_site_url->pathname . - substr( - $decoded_matched_pathname, - strlen(urldecode($parsed_current_site_url->pathname)) - ); - } - - /* - * Stylistic choice – if the matched URL has no trailing slash, - * do not add it to the new URL. The WHATWG URL parser will - * add one automatically if the path is empty, so we have to - * explicitly remove it. 
- */ - $new_raw_url = $parsed_matched_url->toString(); - if ( - $raw_matched_url[strlen($raw_matched_url) - 1] !== '/' && - $parsed_matched_url->pathname === '/' && - $parsed_matched_url->search === '' && - $parsed_matched_url->hash === '' - ) { - $new_raw_url = rtrim($new_raw_url, '/'); - } - - return $new_raw_url; -} - -function create_url_rewrite_stream( - $text, - $options -) { - $string_new_site_url = $options['to_url']; - $parsed_new_site_url = WP_URL::parse( $string_new_site_url ); - - $current_site_url = $options['from_url']; - $parsed_current_site_url = WP_URL::parse( $current_site_url ); - $decoded_current_site_pathname = urldecode( $parsed_current_site_url->pathname ); - - $base_url = 'https://playground.internal'; - return new BlockMarkupURLVisitorStream( - new WP_Block_Markup_Url_Processor( $text, $base_url ), - function(WP_Block_Markup_Url_Processor $p) use($parsed_current_site_url, $decoded_current_site_pathname, $parsed_new_site_url) { - $parsed_matched_url = $p->get_parsed_url(); - if ( $parsed_matched_url->hostname === $parsed_current_site_url->hostname ) { - $decoded_matched_pathname = urldecode( $parsed_matched_url->pathname ); - $pathname_matches = str_starts_with( $decoded_matched_pathname, $decoded_current_site_pathname ); - if ( ! $pathname_matches ) { - return; - } - // It's a match! 
- $p->set_raw_url( rewrite_url( +class BlockMarkupURLRewriteStream extends BlockMarkupURLVisitorStream +{ + private $from_url; + private $parsed_from_url; + private $parsed_from_url_pathname; + + private $to_url; + private $parsed_to_url; + + private $base_url = 'https://playground.internal'; + + public function __construct($text, $options) + { + $this->from_url = $options['from_url']; + $this->parsed_from_url = WP_URL::parse($this->from_url); + $this->parsed_from_url_pathname = urldecode($this->parsed_from_url->pathname); + $this->to_url = $options['to_url']; + $this->parsed_to_url = WP_URL::parse($this->to_url); + + parent::__construct( + new WP_Block_Markup_Url_Processor($text, $this->from_url), + [$this, 'url_node_visitor'] + ); + } + + protected function url_node_visitor(WP_Block_Markup_Url_Processor $p) + { + $parsed_matched_url = $p->get_parsed_url(); + if ($parsed_matched_url->hostname === $this->parsed_from_url->hostname) { + $decoded_matched_pathname = urldecode($parsed_matched_url->pathname); + $pathname_matches = str_starts_with($decoded_matched_pathname, $this->parsed_from_url_pathname); + if (!$pathname_matches) { + return; + } + // It's a match! + $p->set_raw_url( + $this->rewrite_url( $p->get_raw_url(), $parsed_matched_url, - $parsed_current_site_url, - $parsed_new_site_url - ) ); - } + ) + ); } - ); + } + + public function rewrite_url( string $raw_matched_url, $parsed_matched_url ) { + // Let's rewrite the URL + $parsed_matched_url->hostname = $this->parsed_to_url->hostname; + $decoded_matched_pathname = urldecode($parsed_matched_url->pathname); + + // Short-circuit for empty pathnames + if ('/' !== $this->parsed_from_url->pathname) { + $parsed_matched_url->pathname = + $this->parsed_to_url->pathname . + substr( + $decoded_matched_pathname, + strlen(urldecode($this->parsed_from_url->pathname)) + ); + } + + /* + * Stylistic choice – if the matched URL has no trailing slash, + * do not add it to the new URL. 
The WHATWG URL parser will + * add one automatically if the path is empty, so we have to + * explicitly remove it. + */ + $new_raw_url = $parsed_matched_url->toString(); + if ( + $raw_matched_url[strlen($raw_matched_url) - 1] !== '/' && + $parsed_matched_url->pathname === '/' && + $parsed_matched_url->search === '' && + $parsed_matched_url->hash === '' + ) { + $new_raw_url = rtrim($new_raw_url, '/'); + } + + return $new_raw_url; + } + } function is_wxr_content_node( WP_XML_Processor $processor ) { @@ -564,45 +522,3 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { return false; }; - -// Create the pipe and chain the stages -$pipe = Pipe::from( [ - new RequestStream( new Request( 'https://raw.githubusercontent.com/WordPress/blueprints/normalize-wxr-assets/blueprints/stylish-press-clone/woo-products.wxr' ) ), - new XMLProcessorStream(function (WP_XML_Processor $processor) { - if(is_wxr_content_node($processor)) { - $text = $processor->get_modifiable_text(); - $pipe = Pipe::from([ - create_url_rewrite_stream( - $text, - [ - 'from_url' => 'https://raw.githubusercontent.com/wordpress/blueprints/normalize-wxr-assets/blueprints/stylish-press-clone/wxr-assets/', - 'to_url' => 'https://mynew.site/', - ] - ) - ]); - while (!$pipe->is_finished()) { - $pipe->read(); - } - - $updated_text = $pipe->get_output(); - if ( $updated_text !== $text ) { - $processor->set_modifiable_text( $updated_text ); - } - } - }), - new EchoStream(), -] ); - -$i = 0; -// Process data incrementally as it becomes available -while ( ! $pipe->is_finished() ) { - if ( ++ $i > 22 ) { - // break; - } - if ( ! $pipe->read() ) { - // If no new data was produced, wait a bit before trying again - usleep( 100000 ); // Sleep for 100ms - } -} - -var_dump( $pipe ); diff --git a/rewrite-remote-wxr.php b/rewrite-remote-wxr.php index b57e6ca..f662204 100644 --- a/rewrite-remote-wxr.php +++ b/rewrite-remote-wxr.php @@ -24,152 +24,29 @@ */ require __DIR__ . '/bootstrap.php'; +require __DIR__ . 
'/pipes.php'; -use \WordPress\AsyncHttp\Client; use \WordPress\AsyncHttp\Request; -$wxr_url = "https://raw.githubusercontent.com/WordPress/blueprints/normalize-wxr-assets/blueprints/stylish-press-clone/woo-products.wxr"; -$xml_processor = new WP_XML_Processor('', [], WP_XML_Processor::IN_PROLOG_CONTEXT); -foreach( stream_remote_file( $wxr_url ) as $chunk ) { - $xml_processor->stream_append_xml($chunk); - foreach ( xml_next_content_node_for_rewriting( $xml_processor ) as $text ) { - $string_new_site_url = 'https://mynew.site/'; - $parsed_new_site_url = WP_URL::parse( $string_new_site_url ); - - $current_site_url = 'https://raw.githubusercontent.com/wordpress/blueprints/normalize-wxr-assets/blueprints/stylish-press-clone/wxr-assets/'; - $parsed_current_site_url = WP_URL::parse( $current_site_url ); - - $base_url = 'https://playground.internal'; - $url_processor = new WP_Block_Markup_Url_Processor( $text, $base_url ); - - foreach ( html_next_url( $url_processor, $current_site_url ) as $parsed_matched_url ) { - $updated_raw_url = rewrite_url( - $url_processor->get_raw_url(), - $parsed_matched_url, - $parsed_current_site_url, - $parsed_new_site_url - ); - $url_processor->set_raw_url( $updated_raw_url ); - } - - $updated_text = $url_processor->get_updated_html(); - if ($updated_text !== $text) { - $xml_processor->set_modifiable_text($updated_text); - } - } - echo $xml_processor->get_processed_xml(); -} -echo $xml_processor->get_unprocessed_xml(); - -// The rest of this file are functions used in the above code - -function stream_remote_file($url) -{ - $requests = [ - new Request($url) - ]; - $client = new Client(); - $client->enqueue($requests); - - while ($client->await_next_event()) { - switch ($client->get_event()) { - case Client::EVENT_BODY_CHUNK_AVAILABLE: - yield $client->get_response_body_chunk(); - break; - } - } -} - -function xml_next_content_node_for_rewriting(WP_XML_Processor $processor) { - while($processor->next_token()) { - if (!in_array('item', 
$processor->get_breadcrumbs())) { - continue; - } - if ( - !in_array('excerpt:encoded', $processor->get_breadcrumbs()) - && !in_array('content:encoded', $processor->get_breadcrumbs()) - && !in_array('wp:attachment_url', $processor->get_breadcrumbs()) - && !in_array('guid', $processor->get_breadcrumbs()) - && !in_array('link', $processor->get_breadcrumbs()) - && !in_array('wp:comment_content', $processor->get_breadcrumbs()) - // Meta values are not suppoerted yet. We'll need to support - // WordPress core options that may be saved as JSON, PHP Deserialization, and XML, - // and then provide extension points for plugins authors support - // their own options. - // !in_array('wp:postmeta', $processor->get_breadcrumbs()) - ) { - continue; - } - - switch ($processor->get_token_type()) { - case '#text': - case '#cdata-section': - $text = $processor->get_modifiable_text(); - yield $text; - break; - } - } -} - -/** - * - * @param mixed $options - * @return Generator - */ -function html_next_url(WP_Block_Markup_Url_Processor $p, $current_site_url) { - $parsed_current_site_url = WP_URL::parse( $current_site_url ); - $decoded_current_site_pathname = urldecode( $parsed_current_site_url->pathname ); - - while ( $p->next_url() ) { - $parsed_matched_url = $p->get_parsed_url(); - if ( $parsed_matched_url->hostname === $parsed_current_site_url->hostname ) { - $decoded_matched_pathname = urldecode( $parsed_matched_url->pathname ); - $pathname_matches = str_starts_with( $decoded_matched_pathname, $decoded_current_site_pathname ); - if ( ! 
$pathname_matches ) { - continue; +Pipe::run( [ + new RequestStream( new Request( 'https://raw.githubusercontent.com/WordPress/blueprints/normalize-wxr-assets/blueprints/stylish-press-clone/woo-products.wxr' ) ), + new XMLProcessorStream(function (WP_XML_Processor $processor) { + if(is_wxr_content_node($processor)) { + $text = $processor->get_modifiable_text(); + $updated_text = Pipe::run([ + new BlockMarkupURLRewriteStream( + $text, + [ + 'from_url' => 'https://raw.githubusercontent.com/wordpress/blueprints/normalize-wxr-assets/blueprints/stylish-press-clone/wxr-assets/', + 'to_url' => 'https://mynew.site/', + ] + ), + ]); + if ( $updated_text !== $text ) { + $processor->set_modifiable_text( $updated_text ); } - - // It's a match! - yield $parsed_matched_url; } - } -} - -function rewrite_url( - string $raw_matched_url, - $parsed_matched_url, - $parsed_current_site_url, - $parsed_new_site_url, -) { - // Let's rewrite the URL - $parsed_matched_url->hostname = $parsed_new_site_url->hostname; - $decoded_matched_pathname = urldecode( $parsed_matched_url->pathname ); - - // Short-circuit for empty pathnames - if ('/' !== $parsed_current_site_url->pathname) { - $parsed_matched_url->pathname = - $parsed_new_site_url->pathname . - substr( - $decoded_matched_pathname, - strlen(urldecode($parsed_current_site_url->pathname)) - ); - } - - /* - * Stylistic choice – if the matched URL has no trailing slash, - * do not add it to the new URL. The WHATWG URL parser will - * add one automatically if the path is empty, so we have to - * explicitly remove it. 
- */ - $new_raw_url = $parsed_matched_url->toString(); - if ( - $raw_matched_url[strlen($raw_matched_url) - 1] !== '/' && - $parsed_matched_url->pathname === '/' && - $parsed_matched_url->search === '' && - $parsed_matched_url->hash === '' - ) { - $new_raw_url = rtrim($new_raw_url, '/'); - } + }), + new EchoStream(), +] ); - return $new_raw_url; -} From 22dea1dc50c0d013fe3d660343e6514228fc2acf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 16 Jul 2024 12:17:14 +0200 Subject: [PATCH 04/72] Remove needs_more() method from WritableStream --- pipes.php | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/pipes.php b/pipes.php index e556b95..8661a01 100644 --- a/pipes.php +++ b/pipes.php @@ -56,8 +56,6 @@ public function get_error(): ?string { interface WritableStream { public function write( string $data ): bool; - public function needs_more(): bool; - public function get_error(): ?string; } @@ -84,10 +82,6 @@ public function write( string $data ): bool { abstract protected function doWrite( string $data ): bool; - public function needs_more(): bool { - return true; - } - protected function set_error( string $error ) { $this->error = $error ?: 'unknown error'; $this->finished = true; @@ -272,10 +266,6 @@ public function write( string $data ): bool { return true; } - public function needs_more(): bool { - return true; - } - public function is_finished(): bool { return false; // EchoConsumer is never finished } @@ -394,10 +384,6 @@ public function write( string $data ): bool { return $this->stages[0]->write( $data ); } - public function needs_more(): bool { - return ! 
$this->finished; - } - public function consume_output(): ?string { $data = $this->dataBuffer; $this->dataBuffer = ''; From b4290f06aa204cfbd6cb1b664f0ffbed05acb9e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 17 Jul 2024 00:17:28 +0200 Subject: [PATCH 05/72] Filtering and demultiplexing via metadata piping --- blueprints-library | 2 +- pipes.php | 207 +++++++++++++++++++++++++++++++++++------ rewrite-remote-wxr.php | 68 ++++++++++---- 3 files changed, 227 insertions(+), 50 deletions(-) diff --git a/blueprints-library b/blueprints-library index e08b224..3b8943b 160000 --- a/blueprints-library +++ b/blueprints-library @@ -1 +1 @@ -Subproject commit e08b224738a5bc2a0a9674be32ed34af11e91b89 +Subproject commit 3b8943b4364ae276f7390e46c136136f48eca63a diff --git a/pipes.php b/pipes.php index 8661a01..01fae7d 100644 --- a/pipes.php +++ b/pipes.php @@ -5,18 +5,17 @@ interface ReadableStream { public function read(): bool; - public function is_finished(): bool; - public function consume_output(): ?string; - public function get_error(): ?string; + public function get_metadata(): ?StreamMetadata; } trait BaseReadableStream { protected $finished = false; protected $error = null; protected $buffer = ''; + protected $metadata = null; public function read(): bool { if ( $this->finished || $this->error ) { @@ -32,6 +31,11 @@ public function is_finished(): bool { return $this->finished; } + public function get_metadata(): ?StreamMetadata + { + return $this->metadata; + } + public function consume_output(): ?string { if ( $this->buffer !== '' ) { $data = $this->buffer; @@ -46,6 +50,7 @@ public function consume_output(): ?string { protected function set_error( string $error ) { $this->error = $error ?: 'unknown error'; $this->finished = true; + $this->metadata = null; } public function get_error(): ?string { @@ -54,7 +59,7 @@ public function get_error(): ?string { } interface WritableStream { - public function write( string $data ): bool; + public 
function write( string $data, ?StreamMetadata $metadata=null ): bool; public function get_error(): ?string; } @@ -71,18 +76,21 @@ trait BaseTransformStream { trait BaseWritableStream { protected $error = null; + protected $metadata = null; - public function write( string $data ): bool { + public function write( string $data, ?StreamMetadata $metadata=null ): bool { if ( $this->error ) { return false; } - return $this->doWrite( $data ); + $this->metadata = $metadata; + return $this->doWrite( $data, $metadata ); } - abstract protected function doWrite( string $data ): bool; + abstract protected function doWrite( string $data, ?StreamMetadata $metadata ): bool; protected function set_error( string $error ) { + $this->metadata = null; $this->error = $error ?: 'unknown error'; $this->finished = true; } @@ -96,7 +104,7 @@ class BufferStream implements TransformStream { use BaseTransformStream; - protected function doWrite( string $data ): bool { + protected function doWrite( string $data, ?StreamMetadata $metadata=null ): bool { $this->buffer .= $data; return true; @@ -151,7 +159,7 @@ public function __construct( $node_visitor_callback ) { $this->node_visitor_callback = $node_visitor_callback; } - protected function doWrite( string $data ): bool { + protected function doWrite( string $data, ?StreamMetadata $metadata=null ): bool { $this->xml_processor->stream_append_xml( $data ); return true; @@ -188,14 +196,82 @@ protected function doRead(): bool { } +class DemultiplexerStream implements TransformStream { + use BaseTransformStream; + + private $pipe_factory; + private $pipes = []; + private $next_read = []; + public function __construct( $pipe_factory ) { + $this->pipe_factory = $pipe_factory; + } + + protected function doWrite( string $data, ?StreamMetadata $metadata=null ): bool { + // -1 is the default stream ID used whenever we don't have any metadata + $stream_id = $metadata ? $metadata->get_resource_id() : -1; + if ( ! 
isset( $this->pipes[ $stream_id ] ) ) { + $pipe_factory = $this->pipe_factory; + $this->pipes[ $stream_id ] = $pipe_factory(); + } + + return $this->pipes[ $stream_id ]->write( $data, $metadata ); + } + + protected function doRead(): bool { + if(empty($this->next_read)) { + $this->next_read = array_keys($this->pipes); + } + + while (count($this->next_read)) { + $stream_id = array_shift($this->next_read); + if (!isset($this->pipes[$stream_id])) { + continue; + } + + $pipe = $this->pipes[$stream_id]; + if(!($pipe instanceof ReadableStream)) { + // @TODO: What if the last pipe in the demultiplexer is not readable? + // Then the entire multiplexer is not readable. + // We need to conider this somehow in the Pipe class + // around this line: + // if ( $last_stage instanceof ReadableStream && $last_stage->read() ) { + return false; + } + if (!$pipe->read()) { + if ($pipe->is_finished()) { + unset($this->pipes[$stream_id]); + } + continue; + } + + $this->buffer .= $pipe->consume_output(); + $this->metadata = $pipe->get_metadata(); + return true; + } + + return false; + } + +} + class RequestStream implements ReadableStream { use BaseReadableStream; private $client; + private $requests = []; + private $requests_metadata = []; - public function __construct( Request $request ) { + public function __construct( $requests ) { $this->client = new Client(); - $this->client->enqueue( [ $request ] ); + $this->client->enqueue( $requests ); + + $this->requests = $requests; + foreach($requests as $request) { + $this->requests_metadata[$request->id] = new BasicStreamMetadata( + $request->id, + $request->url + ); + } } protected function doRead(): bool { @@ -205,16 +281,21 @@ protected function doRead(): bool { return false; } + $request = $this->client->get_request(); + $this->metadata = $this->requests_metadata[$request->id]; switch ( $this->client->get_event() ) { case Client::EVENT_BODY_CHUNK_AVAILABLE: $this->buffer .= $this->client->get_response_body_chunk(); - return true; case 
Client::EVENT_FAILED: - $this->set_error( $this->client->get_request()->error ?: 'unknown error' ); + // @TODO: Handling errors. + // We don't want to stop everything if one request fails. + $this->set_error( $request->error ?: 'unknown error' ); break; case Client::EVENT_FINISHED: - $this->finished = true; + if(count($this->client->get_active_requests()) === 0) { + $this->finished = true; + } break; } @@ -230,8 +311,8 @@ protected function doRead(): bool { return ! empty( $this->buffer ); } - protected function doWrite( string $data ): bool { - $this->buffer .= strtoupper( $data ); + protected function doWrite( string $data, ?StreamMetadata $metadata=null ): bool { + $this->buffer .= $this->transform( $data ); return true; } @@ -240,38 +321,69 @@ abstract protected function transform(string $data): ?string; } class UppercaseTransformer extends StringTransformerStream { - protected function transform( string $data ): ?string { return strtoupper( $data ); } } class Rot13Transformer extends StringTransformerStream { - protected function transform( string $data ): ?string { return str_rot13( $data ); } } -class EchoStream implements WritableStream { - private $error = null; +class EchoTransformer extends StringTransformerStream { + protected function transform( string $data ): ?string { + echo $data; + return $data; + } +} - public function read(): bool { - return false; // EchoConsumer does not produce data +class FilterStream implements TransformStream { + use BaseTransformStream; + + private $filter_callback; + + public function __construct( $filter_callback ) { + $this->filter_callback = $filter_callback; } - public function write( string $data ): bool { - echo $data; + protected function doRead(): bool { + return ! 
empty( $this->buffer ); + } + protected function doWrite( string $data, ?StreamMetadata $metadata=null ): bool { + $filter_callback = $this->filter_callback; + if ( $filter_callback( $metadata ) ) { + $this->buffer .= $data; + } else { + $this->buffer = ''; + $this->metadata = null; + } return true; } +} - public function is_finished(): bool { - return false; // EchoConsumer is never finished +class LocalFileStream implements WritableStream { + private $error = null; + private $filename_factory; + private $fp; + + public function __construct($filename_factory) + { + $this->filename_factory = $filename_factory; } - public function consume_output(): ?string { - return null; // EchoConsumer does not have data to produce + public function write( string $data, ?StreamMetadata $metadata=null ): bool { + if ( ! $this->fp ) { + $filename_factory = $this->filename_factory; + $filename = $filename_factory($metadata); + // @TODO: we'll need to close this. We could use a close() or cleanup() method here. + $this->fp = fopen($filename, 'wb'); + } + + fwrite($this->fp, $data); + return true; } public function get_error(): ?string { @@ -279,6 +391,34 @@ public function get_error(): ?string { } } +/** + * Extend this class when more metadata is needed. + */ +interface StreamMetadata { + public function get_resource_id(); + public function get_filename(); +} + +class BasicStreamMetadata implements StreamMetadata { + private $resource_id; + private $filename; + + public function __construct($resource_id, $filename=null) + { + $this->resource_id = $resource_id; + $this->filename = $filename; + } + + public function get_resource_id() + { + return $this->resource_id; + } + + public function get_filename() + { + return $this->filename; + } +} class Pipe implements ReadableStream, WritableStream { private $stages = []; @@ -355,7 +495,7 @@ public function read(): bool { $anyDataPiped = true; $nextStage = $stages[ $i + 1 ]; - if ( ! $nextStage->write( $data ) ) { + if ( ! 
$nextStage->write( $data, $stage->get_metadata() ) ) { $this->error = $nextStage->get_error(); $this->is_finished = true; break; @@ -380,8 +520,8 @@ public function read(): bool { return false; } - public function write( string $data ): bool { - return $this->stages[0]->write( $data ); + public function write( string $data, ?StreamMetadata $metadata = null ): bool { + return $this->stages[0]->write( $data, $metadata ); } public function consume_output(): ?string { @@ -391,6 +531,11 @@ public function consume_output(): ?string { return $data; } + public function get_metadata(): ?StreamMetadata + { + return $this->stages[ count( $this->stages ) - 1 ]->get_metadata(); + } + public function is_finished(): bool { return $this->finished; } diff --git a/rewrite-remote-wxr.php b/rewrite-remote-wxr.php index f662204..35c668d 100644 --- a/rewrite-remote-wxr.php +++ b/rewrite-remote-wxr.php @@ -28,25 +28,57 @@ use \WordPress\AsyncHttp\Request; +// Pipe::run( [ +// new RequestStream( [ new Request( 'https://raw.githubusercontent.com/WordPress/blueprints/normalize-wxr-assets/blueprints/stylish-press-clone/woo-products.wxr' ) ] ), +// new XMLProcessorStream(function (WP_XML_Processor $processor) { +// if(is_wxr_content_node($processor)) { +// $text = $processor->get_modifiable_text(); +// $updated_text = Pipe::run([ +// new BlockMarkupURLRewriteStream( +// $text, +// [ +// 'from_url' => 'https://raw.githubusercontent.com/wordpress/blueprints/normalize-wxr-assets/blueprints/stylish-press-clone/wxr-assets/', +// 'to_url' => 'https://mynew.site/', +// ] +// ), +// ]); +// if ( $updated_text !== $text ) { +// $processor->set_modifiable_text( $updated_text ); +// } +// } +// }), +// new EchoStream(), +// ] ); + + Pipe::run( [ - new RequestStream( new Request( 'https://raw.githubusercontent.com/WordPress/blueprints/normalize-wxr-assets/blueprints/stylish-press-clone/woo-products.wxr' ) ), - new XMLProcessorStream(function (WP_XML_Processor $processor) { - 
if(is_wxr_content_node($processor)) { - $text = $processor->get_modifiable_text(); - $updated_text = Pipe::run([ - new BlockMarkupURLRewriteStream( - $text, - [ - 'from_url' => 'https://raw.githubusercontent.com/wordpress/blueprints/normalize-wxr-assets/blueprints/stylish-press-clone/wxr-assets/', - 'to_url' => 'https://mynew.site/', - ] - ), - ]); - if ( $updated_text !== $text ) { - $processor->set_modifiable_text( $updated_text ); + new RequestStream( [ + new Request( 'https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/README.md' ), + new Request( 'https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/php.ini' ), + new Request( 'https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/phpcs.xml' ), + ] ), + new FilterStream( fn ($metadata) => ! str_ends_with( $metadata->get_filename(), '.md' ) ), + new DemultiplexerStream(fn () => Pipe::from([ + new XMLProcessorStream(function (WP_XML_Processor $processor) { + if(is_wxr_content_node($processor)) { + $text = $processor->get_modifiable_text(); + $updated_text = Pipe::run([ + new BlockMarkupURLRewriteStream( + $text, + [ + 'from_url' => 'https://raw.githubusercontent.com/wordpress/blueprints/normalize-wxr-assets/blueprints/stylish-press-clone/wxr-assets/', + 'to_url' => 'https://mynew.site/', + ] + ), + ]); + if ( $updated_text !== $text ) { + $processor->set_modifiable_text( $updated_text ); + } } - } - }), - new EchoStream(), + }), + new EchoTransformer(), + new LocalFileStream(fn ($metadata) => __DIR__ . '/output/' . $metadata->get_resource_id() . 
'.chunk') + ])), ] ); + From 759679400e52b9e959d7980200968b999bf0e546 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 17 Jul 2024 00:44:11 +0200 Subject: [PATCH 06/72] Use a fancier pipe --- pipes.php | 20 ++++++++++++++++++- rewrite-remote-wxr.php | 45 +++++++++++++++++++++++------------------- 2 files changed, 44 insertions(+), 21 deletions(-) diff --git a/pipes.php b/pipes.php index 01fae7d..0e2c400 100644 --- a/pipes.php +++ b/pipes.php @@ -364,7 +364,7 @@ protected function doWrite( string $data, ?StreamMetadata $metadata=null ): bool } } -class LocalFileStream implements WritableStream { +class LocalFileStream implements WritableStream, ReadableStream { private $error = null; private $filename_factory; private $fp; @@ -389,6 +389,24 @@ public function write( string $data, ?StreamMetadata $metadata=null ): bool { public function get_error(): ?string { return $this->error; } + + // Temporary workaround to keep the Pipe class working + public function read(): bool { + return false; + } + + public function is_finished(): bool { + return false; + } + + public function consume_output(): ?string { + return null; + } + + public function get_metadata(): ?StreamMetadata + { + return null; + } } /** diff --git a/rewrite-remote-wxr.php b/rewrite-remote-wxr.php index 35c668d..23c80a4 100644 --- a/rewrite-remote-wxr.php +++ b/rewrite-remote-wxr.php @@ -50,34 +50,39 @@ // new EchoStream(), // ] ); +$wxr_rewriter = fn() => new XMLProcessorStream(function (WP_XML_Processor $processor) { + if (is_wxr_content_node($processor)) { + $text = $processor->get_modifiable_text(); + $updated_text = Pipe::run([ + new BlockMarkupURLRewriteStream( + $text, + [ + 'from_url' => 'https://raw.githubusercontent.com/wordpress/blueprints/normalize-wxr-assets/blueprints/stylish-press-clone/wxr-assets/', + 'to_url' => 'https://mynew.site/', + ] + ), + ]); + if ($updated_text !== $text) { + $processor->set_modifiable_text($updated_text); + } + } +}); Pipe::run( [ new 
RequestStream( [ - new Request( 'https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/README.md' ), new Request( 'https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/php.ini' ), new Request( 'https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/phpcs.xml' ), + new Request( 'https://raw.githubusercontent.com/WordPress/blueprints/trunk/blueprints/stylish-press/site-content.wxr' ), ] ), - new FilterStream( fn ($metadata) => ! str_ends_with( $metadata->get_filename(), '.md' ) ), + new FilterStream( fn ($metadata) => ( + str_ends_with( $metadata->get_filename(), '.xml' ) || + str_ends_with( $metadata->get_filename(), '.wxr' ) + ) ), + new DemultiplexerStream(fn () => $wxr_rewriter()), + new UppercaseTransformer(), new DemultiplexerStream(fn () => Pipe::from([ - new XMLProcessorStream(function (WP_XML_Processor $processor) { - if(is_wxr_content_node($processor)) { - $text = $processor->get_modifiable_text(); - $updated_text = Pipe::run([ - new BlockMarkupURLRewriteStream( - $text, - [ - 'from_url' => 'https://raw.githubusercontent.com/wordpress/blueprints/normalize-wxr-assets/blueprints/stylish-press-clone/wxr-assets/', - 'to_url' => 'https://mynew.site/', - ] - ), - ]); - if ( $updated_text !== $text ) { - $processor->set_modifiable_text( $updated_text ); - } - } - }), new EchoTransformer(), - new LocalFileStream(fn ($metadata) => __DIR__ . '/output/' . $metadata->get_resource_id() . '.chunk') + new LocalFileStream(fn ($metadata) => __DIR__ . '/output/' . $metadata->get_resource_id() . 
'.chunk'), ])), ] ); From 17c5950436213d82db337dab5d26dd5e281e5eb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sat, 20 Jul 2024 13:22:22 +0200 Subject: [PATCH 07/72] Explore automatic demultiplexing based on the stream class definition --- pipes.php | 147 ++++++++++++++++++++++++++++++++++++++++- rewrite-remote-wxr.php | 56 ++++++++++++---- 2 files changed, 189 insertions(+), 14 deletions(-) diff --git a/pipes.php b/pipes.php index 0e2c400..5b15a67 100644 --- a/pipes.php +++ b/pipes.php @@ -148,6 +148,16 @@ protected function doRead(): bool { } +class XML_Processor { + + static public function stream($node_visitor_callback) + { + return new Demultiplexer( + fn() => new XMLProcessorStream($node_visitor_callback) + ); + } +} + class XMLProcessorStream implements TransformStream { use BaseTransformStream; @@ -254,6 +264,15 @@ protected function doRead(): bool { } +class HttpClient { + + static public function stream($requests) + { + return new RequestStream($requests); + } + +} + class RequestStream implements ReadableStream { use BaseReadableStream; @@ -364,11 +383,19 @@ protected function doWrite( string $data, ?StreamMetadata $metadata=null ): bool } } -class LocalFileStream implements WritableStream, ReadableStream { +class LocalFileWriter implements WritableStream, ReadableStream { private $error = null; private $filename_factory; + private $last_written_chunk; + private $buffer; private $fp; + static public function stream( $filename_factory ) { + return new Demultiplexer( + fn() => new self( $filename_factory ) + ); + } + public function __construct($filename_factory) { $this->filename_factory = $filename_factory; @@ -382,6 +409,7 @@ public function write( string $data, ?StreamMetadata $metadata=null ): bool { $this->fp = fopen($filename, 'wb'); } + $this->last_written_chunk = $data; fwrite($this->fp, $data); return true; } @@ -392,6 +420,11 @@ public function get_error(): ?string { // Temporary workaround to keep the Pipe class 
working public function read(): bool { + if($this->last_written_chunk) { + $this->buffer = $this->last_written_chunk; + $this->last_written_chunk = null; + return true; + } return false; } @@ -400,6 +433,11 @@ public function is_finished(): bool { } public function consume_output(): ?string { + if($this->buffer) { + $chunk = $this->buffer; + $this->buffer = null; + return $chunk; + } return null; } @@ -438,6 +476,113 @@ public function get_filename() } } +class Demultiplexer implements ReadableStream, WritableStream +{ + + public $factory_function; + private $stream_instances = []; + + public function __construct( + $factory_function + ) { + $this->factory_function = $factory_function; + } + + public function write( string $data, ?StreamMetadata $metadata=null ): bool { + if ( $this->error ) { + return false; + } + + $resource_id = $metadata ? $metadata->get_resource_id() : 'default'; + $stream_factory = $this->factory_function; + if(!isset($this->stream_instances[$resource_id])) { + $this->stream_instances[$resource_id] = $stream_factory(); + } + $stream = $this->stream_instances[$resource_id]; + $retval = $stream->write( $data, $metadata ); + if ( ! 
$retval ) { + $this->error = $stream->get_error(); + } + return $retval; + } + + private $read_queue = []; + private $last_read_stream = null; + private $finished = false; + public function read(): bool + { + $available_streams = count($this->stream_instances); + if(0 === $available_streams) { + $this->stream_instances = [ + 'default' => ($this->factory_function)() + ]; + $available_streams = 1; + } + + $processed_streams = 0; + do { + if (empty($this->read_queue)) { + $this->read_queue = $this->stream_instances; + } + + $stream = array_shift($this->read_queue); + if ($stream->read()) { + $this->last_read_stream = $stream; + return true; + } + + if ( $stream->get_error() ) { + $this->error = $stream->get_error(); + $this->is_finished = true; + + return false; + } + + ++$processed_streams; + + if ( $stream->is_finished() ) { + // @TODO: Handle this case, track which streams are finished + // and take them off the instances list and the read queue. + } + } while ($processed_streams < $available_streams); + return false; + } + + public function consume_output(): ?string { + return $this->last_read_stream ? $this->last_read_stream->consume_output() : null; + } + + public function get_metadata(): ?StreamMetadata + { + return $this->last_read_stream ? 
$this->last_read_stream->get_metadata() : null; + } + + public function is_finished(): bool { + $finished = true; + foreach($this->stream_instances as $stream) { + if(!$stream->is_finished()) { + $finished = false; + break; + } + } + return $finished; + } + + protected $error = null; + + protected function set_error( string $error ) { + $this->metadata = null; + $this->error = $error ?: 'unknown error'; + $this->finished = true; + } + + public function get_error(): ?string { + return $this->error; + } + +} + + class Pipe implements ReadableStream, WritableStream { private $stages = []; private $error = null; diff --git a/rewrite-remote-wxr.php b/rewrite-remote-wxr.php index 23c80a4..ce47832 100644 --- a/rewrite-remote-wxr.php +++ b/rewrite-remote-wxr.php @@ -50,7 +50,7 @@ // new EchoStream(), // ] ); -$wxr_rewriter = fn() => new XMLProcessorStream(function (WP_XML_Processor $processor) { +$rewrite_links_in_wxr_node = function (WP_XML_Processor $processor) { if (is_wxr_content_node($processor)) { $text = $processor->get_modifiable_text(); $updated_text = Pipe::run([ @@ -66,24 +66,54 @@ $processor->set_modifiable_text($updated_text); } } -}); +}; Pipe::run( [ - new RequestStream( [ + HttpClient::stream( [ + new Request( 'https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/php.ini' ), new Request( 'https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/php.ini' ), new Request( 'https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/phpcs.xml' ), + new Request( 'https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/phpcs.xml?a' ), new Request( 'https://raw.githubusercontent.com/WordPress/blueprints/trunk/blueprints/stylish-press/site-content.wxr' ), ] ), - new FilterStream( fn ($metadata) => ( - str_ends_with( $metadata->get_filename(), '.xml' ) || - str_ends_with( $metadata->get_filename(), '.wxr' ) - ) ), - new DemultiplexerStream(fn () => $wxr_rewriter()), - new UppercaseTransformer(), - new 
DemultiplexerStream(fn () => Pipe::from([ - new EchoTransformer(), - new LocalFileStream(fn ($metadata) => __DIR__ . '/output/' . $metadata->get_resource_id() . '.chunk'), - ])), + XML_Processor::stream($rewrite_links_in_wxr_node), + LocalFileWriter::stream(fn ($context) => __DIR__ . '/output/' . $context->get_resource_id() . '.chunk'), ] ); +// while ( $pipe->next() ) { +// list( 'http' => $http, 'zip' => $zip ) = $pipe->get_context(); + +// if ( ! str_ends_with( $zip->get_filename(), '.wxr' ) ) { +// $zip->skip_file(); +// continue; +// } + +// switch( $zip->get_filename() ) { +// case 'site-content.wxr': +// $pipe->write( $xml->get_contents() ); +// break; +// } +// } + +// Pipe::run( [ +// 'http' => new RequestStream( [ /* ... */ ] ), +// 'zip' => new ZipReaderStream( fn ($context) => { +// if(!str_ends_with($context['http']->url, '.zip')) { +// return $context['zip']->skip(); +// } +// } ), +// 'xml' => new XMLProcessorStream(fn ($context) => { +// if(!str_ends_with($context['zip']->filename, '.wxr')) { +// return $context['zip']->skip(); +// } + +// $xml_processor = $context['xml']->get_or_create_processor( $context['zip']->filename ); +// if(!WXR_Processor::is_content_node($xml_processor)) { +// continue; +// } + +// // Migrate URLs and downlaod assets +// }), +// ] ); + From 86316f39164d82bc1470a3701bf7380b20ce1f82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sat, 20 Jul 2024 19:23:23 +0200 Subject: [PATCH 08/72] Fix the intermittent broken pipe --- pipes.php | 51 +++++++++++++++++++++--------------------- rewrite-remote-wxr.php | 22 ++++++++++++++++-- 2 files changed, 46 insertions(+), 27 deletions(-) diff --git a/pipes.php b/pipes.php index 5b15a67..f089503 100644 --- a/pipes.php +++ b/pipes.php @@ -149,9 +149,7 @@ protected function doRead(): bool { } class XML_Processor { - - static public function stream($node_visitor_callback) - { + static public function stream($node_visitor_callback) { return new Demultiplexer( fn() => new 
XMLProcessorStream($node_visitor_callback) ); @@ -230,6 +228,10 @@ protected function doWrite( string $data, ?StreamMetadata $metadata=null ): bool protected function doRead(): bool { if(empty($this->next_read)) { $this->next_read = array_keys($this->pipes); + if(empty($this->pipes)) { + $this->finished = true; + return false; + } } while (count($this->next_read)) { @@ -265,12 +267,9 @@ protected function doRead(): bool { } class HttpClient { - - static public function stream($requests) - { + static public function stream($requests) { return new RequestStream($requests); } - } class RequestStream implements ReadableStream { @@ -312,9 +311,8 @@ protected function doRead(): bool { $this->set_error( $request->error ?: 'unknown error' ); break; case Client::EVENT_FINISHED: - if(count($this->client->get_active_requests()) === 0) { - $this->finished = true; - } + // @TODO: Mark this particular resource as finished without + // closing the entire Client stream. break; } @@ -523,9 +521,18 @@ public function read(): bool do { if (empty($this->read_queue)) { $this->read_queue = $this->stream_instances; + if (empty($this->read_queue)) { + return false; + } } $stream = array_shift($this->read_queue); + if ( $stream->is_finished() ) { + $index = array_search($stream, $this->stream_instances, true); + unset($this->stream_instances[$index]); + continue; + } + if ($stream->read()) { $this->last_read_stream = $stream; return true; @@ -539,11 +546,6 @@ public function read(): bool } ++$processed_streams; - - if ( $stream->is_finished() ) { - // @TODO: Handle this case, track which streams are finished - // and take them off the instances list and the read queue. 
- } } while ($processed_streams < $available_streams); return false; } @@ -558,14 +560,7 @@ public function get_metadata(): ?StreamMetadata } public function is_finished(): bool { - $finished = true; - foreach($this->stream_instances as $stream) { - if(!$stream->is_finished()) { - $finished = false; - break; - } - } - return $finished; + return count($this->stream_instances) === 0; } protected $error = null; @@ -593,7 +588,7 @@ static public function run($stages) { $pipe = Pipe::from( $stages ); - while (!$pipe->is_finished()) { + while ( ! $pipe->is_finished() ) { if ( ! $pipe->read() ) { // If no new data was produced, wait a bit before trying again usleep( 10000 ); // Sleep for 10ms @@ -628,6 +623,9 @@ private function __construct( $stages ) { } public function read(): bool { + if($this->finished) { + return false; + } $anyDataPiped = false; $stages = $this->stages; @@ -647,7 +645,10 @@ public function read(): bool { if ( $stage->is_finished() ) { continue; } - break; + + // No data was produced by the stage, let's try again on the next read() call, + // and meanwhile let's see if the rest of the pipe will produce any data. 
+ continue; } $data = $stage->consume_output(); } diff --git a/rewrite-remote-wxr.php b/rewrite-remote-wxr.php index ce47832..79166db 100644 --- a/rewrite-remote-wxr.php +++ b/rewrite-remote-wxr.php @@ -68,6 +68,22 @@ } }; + +// $client = new WordPress\AsyncHttp\Client(); +// $client->enqueue( [ +// new Request( 'https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/php.ini' ), +// new Request( 'https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/php.ini' ), +// new Request( 'https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/phpcs.xml' ), +// new Request( 'https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/phpcs.xml?a' ), +// new Request( 'https://raw.githubusercontent.com/WordPress/blueprints/trunk/blueprints/stylish-press/site-content.wxr' ), +// ] ); + +// while ( $client->await_next_event() ) { +// var_dump($client->get_event()); +// } + +// die(); + Pipe::run( [ HttpClient::stream( [ new Request( 'https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/php.ini' ), @@ -76,11 +92,13 @@ new Request( 'https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/phpcs.xml?a' ), new Request( 'https://raw.githubusercontent.com/WordPress/blueprints/trunk/blueprints/stylish-press/site-content.wxr' ), ] ), - XML_Processor::stream($rewrite_links_in_wxr_node), - LocalFileWriter::stream(fn ($context) => __DIR__ . '/output/' . $context->get_resource_id() . '.chunk'), + XML_Processor::stream( $rewrite_links_in_wxr_node ), + LocalFileWriter::stream( fn ($context) => __DIR__ . '/output/' . $context->get_resource_id() . 
'.chunk' ), ] ); + + // while ( $pipe->next() ) { // list( 'http' => $http, 'zip' => $zip ) = $pipe->get_context(); From 542ac35f204f073ce22f9e0fac45c4b6da222358 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sat, 20 Jul 2024 19:55:09 +0200 Subject: [PATCH 09/72] Explore streams as iterators --- pipes.php | 52 +++++++++++++++++++++++++++++++++++++++++- rewrite-remote-wxr.php | 23 ++++--------------- 2 files changed, 56 insertions(+), 19 deletions(-) diff --git a/pipes.php b/pipes.php index f089503..c71befa 100644 --- a/pipes.php +++ b/pipes.php @@ -3,7 +3,7 @@ use \WordPress\AsyncHttp\Client; use \WordPress\AsyncHttp\Request; -interface ReadableStream { +interface ReadableStream extends Iterator { public function read(): bool; public function is_finished(): bool; public function consume_output(): ?string; @@ -11,7 +11,51 @@ public function get_error(): ?string; public function get_metadata(): ?StreamMetadata; } +trait ReadableStreamIterator +{ + private $position = 0; + private $iterator_output_cache = null; + + public function current(): mixed { + if(null === $this->iterator_output_cache) { + $this->iterator_output_cache = $this->consume_output(); + } + return (object) [ + 'bytes' => $this->iterator_output_cache, + 'metadata' => $this->get_metadata(), + ]; + } + + public function key(): mixed { + return $this->position; + } + + public function next(): void { + $this->iterator_output_cache = null; + while(false === $this->read()) { + if ($this->is_finished()) { + return; + } + if($this->get_error()) { + return; + } + usleep(10000); + } + } + + public function rewind(): void { + $this->position = 0; + $this->next(); + } + + public function valid(): bool { + return !$this->is_finished(); + } +} + trait BaseReadableStream { + use ReadableStreamIterator; + protected $finished = false; protected $error = null; protected $buffer = ''; @@ -388,6 +432,8 @@ class LocalFileWriter implements WritableStream, ReadableStream { private $buffer; private 
$fp; + use ReadableStreamIterator; + static public function stream( $filename_factory ) { return new Demultiplexer( fn() => new self( $filename_factory ) @@ -477,6 +523,8 @@ public function get_filename() class Demultiplexer implements ReadableStream, WritableStream { + use ReadableStreamIterator; + public $factory_function; private $stream_instances = []; @@ -584,6 +632,8 @@ class Pipe implements ReadableStream, WritableStream { private $finished = false; private $dataBuffer = ''; + use ReadableStreamIterator; + static public function run($stages) { $pipe = Pipe::from( $stages ); diff --git a/rewrite-remote-wxr.php b/rewrite-remote-wxr.php index 79166db..fae7b20 100644 --- a/rewrite-remote-wxr.php +++ b/rewrite-remote-wxr.php @@ -68,23 +68,7 @@ } }; - -// $client = new WordPress\AsyncHttp\Client(); -// $client->enqueue( [ -// new Request( 'https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/php.ini' ), -// new Request( 'https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/php.ini' ), -// new Request( 'https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/phpcs.xml' ), -// new Request( 'https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/phpcs.xml?a' ), -// new Request( 'https://raw.githubusercontent.com/WordPress/blueprints/trunk/blueprints/stylish-press/site-content.wxr' ), -// ] ); - -// while ( $client->await_next_event() ) { -// var_dump($client->get_event()); -// } - -// die(); - -Pipe::run( [ +$pipe = Pipe::from( [ HttpClient::stream( [ new Request( 'https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/php.ini' ), new Request( 'https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/php.ini' ), @@ -93,9 +77,12 @@ new Request( 'https://raw.githubusercontent.com/WordPress/blueprints/trunk/blueprints/stylish-press/site-content.wxr' ), ] ), XML_Processor::stream( $rewrite_links_in_wxr_node ), - LocalFileWriter::stream( fn ($context) => __DIR__ . '/output/' . 
$context->get_resource_id() . '.chunk' ), + // LocalFileWriter::stream( fn ($context) => __DIR__ . '/output/' . $context->get_resource_id() . '.chunk' ), ] ); +foreach($pipe as $context) { + var_dump($context); +} From 75f12176187e96a7c189b9986a5c3967064939b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sun, 21 Jul 2024 01:28:34 +0200 Subject: [PATCH 10/72] Explore PipeContext --- pipes.php | 113 +++++++++++++++++++++++++++++++++++++---- rewrite-remote-wxr.php | 65 +++++++++--------------- 2 files changed, 127 insertions(+), 51 deletions(-) diff --git a/pipes.php b/pipes.php index c71befa..7e1b37c 100644 --- a/pipes.php +++ b/pipes.php @@ -430,6 +430,7 @@ class LocalFileWriter implements WritableStream, ReadableStream { private $filename_factory; private $last_written_chunk; private $buffer; + private $metadata; private $fp; use ReadableStreamIterator; @@ -449,6 +450,7 @@ public function write( string $data, ?StreamMetadata $metadata=null ): bool { if ( ! $this->fp ) { $filename_factory = $this->filename_factory; $filename = $filename_factory($metadata); + $this->metadata = new BasicStreamMetadata($filename, $filename); // @TODO: we'll need to close this. We could use a close() or cleanup() method here. 
$this->fp = fopen($filename, 'wb'); } @@ -487,7 +489,7 @@ public function consume_output(): ?string { public function get_metadata(): ?StreamMetadata { - return null; + return $this->metadata; } } @@ -627,15 +629,36 @@ public function get_error(): ?string { class Pipe implements ReadableStream, WritableStream { + private $stages_keys = []; private $stages = []; private $error = null; + private $context = null; private $finished = false; private $dataBuffer = ''; use ReadableStreamIterator; - static public function run($stages) + static public function get_output($stages) { + return self::run($stages, ['buffer_output' => true]); + } + + public function current(): mixed { + if(null === $this->iterator_output_cache) { + $this->iterator_output_cache = $this->consume_output(); + } + return $this->context; + // (object) [ + // 'bytes' => $this->iterator_output_cache, + // 'metadata' => $this->get_metadata(), + // ]; + } + + static public function run($stages, $options=array()) + { + $options = array_merge([ + 'buffer_output' => false, + ], $options); $pipe = Pipe::from( $stages ); while ( ! $pipe->is_finished() ) { @@ -643,8 +666,11 @@ static public function run($stages) // If no new data was produced, wait a bit before trying again usleep( 10000 ); // Sleep for 10ms } - } + if(!$options['buffer_output']) { + $pipe->consume_output(); + } + } return $pipe->consume_output(); } @@ -653,15 +679,16 @@ static public function from( $stages ) { throw new \InvalidArgumentException( 'Pipe must have at least one stage' ); } - for ( $i = 0; $i < count( $stages ) - 1; $i ++ ) { - if ( ! $stages[ $i ] instanceof ReadableStream ) { - throw new \InvalidArgumentException( 'All stages except the last one must be ReadableStreams, but ' . get_class( $stages[ $i ] ) . ' is not' ); + $stages_values = array_values($stages); + for ( $i = 0; $i < count( $stages_values ) - 1; $i ++ ) { + if ( ! 
$stages_values[ $i ] instanceof ReadableStream ) { + throw new \InvalidArgumentException( 'All stages except the last one must be ReadableStreams, but ' . get_class( $stages_values[ $i ] ) . ' is not' ); } } - for ( $i = 1; $i < count( $stages ); $i ++ ) { - if ( ! $stages[ $i ] instanceof WritableStream ) { - throw new \InvalidArgumentException( 'All stages except the first one must be WritableStream, but ' . get_class( $stages[ $i ] ) . ' is not' ); + for ( $i = 1; $i < count( $stages_values ); $i ++ ) { + if ( ! $stages_values[ $i ] instanceof WritableStream ) { + throw new \InvalidArgumentException( 'All stages except the first one must be WritableStream, but ' . get_class( $stages_values[ $i ] ) . ' is not' ); } } @@ -669,7 +696,8 @@ static public function from( $stages ) { } private function __construct( $stages ) { - $this->stages = $stages; + $this->stages_keys = array_keys($stages); + $this->stages = array_values($stages); } public function read(): bool { @@ -679,6 +707,7 @@ public function read(): bool { $anyDataPiped = false; $stages = $this->stages; + $this->context = new PipeContext(); for ( $i = 0; $i < count( $stages ) - 1; $i ++ ) { $stage = $stages[ $i ]; @@ -707,6 +736,8 @@ public function read(): bool { break; } + $this->context[$this->stages_keys[$i]] = $stage->get_metadata(); + $anyDataPiped = true; $nextStage = $stages[ $i + 1 ]; if ( ! 
$nextStage->write( $data, $stage->get_metadata() ) ) { @@ -716,9 +747,11 @@ public function read(): bool { } } - $last_stage = $stages[ count( $stages ) - 1 ]; + $last_stage_idx = count( $stages ) - 1; + $last_stage = $stages[ $last_stage_idx ]; if ( $last_stage instanceof ReadableStream && $last_stage->read() ) { $this->dataBuffer .= $last_stage->consume_output(); + $this->context[$this->stages_keys[$last_stage_idx]] = $last_stage->get_metadata(); if ( $last_stage->is_finished() ) { $this->finished = true; } @@ -734,6 +767,11 @@ public function read(): bool { return false; } + public function get_context() + { + return $this->context; + } + public function write( string $data, ?StreamMetadata $metadata = null ): bool { return $this->stages[0]->write( $data, $metadata ); } @@ -759,6 +797,33 @@ public function get_error(): ?string { } } +class PipeContext implements ArrayAccess { + private $context = []; + + public function offsetExists($offset): bool { + return isset($this->context[$offset]); + } + + public function offsetGet($offset): mixed { + return $this->context[$offset] ?? 
null; + } + + public function offsetSet($offset, $value): void { + $this->context[$offset] = $value; + } + + public function offsetUnset($offset): void { + unset($this->context[$offset]); + } + + public function skip() + { + + } + + +} + class BlockMarkupURLRewriteStream extends BlockMarkupURLVisitorStream { private $from_url; @@ -867,3 +932,29 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { return false; }; + + +function composeIterators(array $iterators): Generator { + if (empty($iterators)) { + throw new InvalidArgumentException("Iterator list cannot be empty"); + } + + // Internal recursive function to handle the composition + function generatorCompose(array $iterators, int $index): Generator { + if ($index >= count($iterators)) { + return; + } + + foreach ($iterators[$index] as $value) { + if ($index == count($iterators) - 1) { + yield $value; + } else { + foreach (generatorCompose($iterators, $index + 1) as $innerValue) { + yield $innerValue; + } + } + } + } + + return generatorCompose($iterators, 0); +} diff --git a/rewrite-remote-wxr.php b/rewrite-remote-wxr.php index fae7b20..9e4c0d7 100644 --- a/rewrite-remote-wxr.php +++ b/rewrite-remote-wxr.php @@ -53,7 +53,7 @@ $rewrite_links_in_wxr_node = function (WP_XML_Processor $processor) { if (is_wxr_content_node($processor)) { $text = $processor->get_modifiable_text(); - $updated_text = Pipe::run([ + $updated_text = Pipe::get_output([ new BlockMarkupURLRewriteStream( $text, [ @@ -68,8 +68,10 @@ } }; +// @TODO: Implement the commented out parts + $pipe = Pipe::from( [ - HttpClient::stream( [ + 'http' => HttpClient::stream( [ new Request( 'https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/php.ini' ), new Request( 'https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/php.ini' ), new Request( 'https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/phpcs.xml' ), @@ -77,48 +79,31 @@ new Request( 
'https://raw.githubusercontent.com/WordPress/blueprints/trunk/blueprints/stylish-press/site-content.wxr' ), ] ), XML_Processor::stream( $rewrite_links_in_wxr_node ), - // LocalFileWriter::stream( fn ($context) => __DIR__ . '/output/' . $context->get_resource_id() . '.chunk' ), + // function ($context) { + // if ( ! str_ends_with( $zip->filename, '.wxr' ) ) { + // $zip->skip_file(); + // } + // }, + 'file' => LocalFileWriter::stream( fn ($context) => __DIR__ . '/output/' . $context->get_resource_id() . '.chunk' ), ] ); -foreach($pipe as $context) { - var_dump($context); -} - +// var_dump($pipe); +foreach($pipe as $context) { + list( 'http' => $http, 'file' => $file ) = $context; + // print_r($http); + print_r($file); -// while ( $pipe->next() ) { -// list( 'http' => $http, 'zip' => $zip ) = $pipe->get_context(); - -// if ( ! str_ends_with( $zip->get_filename(), '.wxr' ) ) { -// $zip->skip_file(); -// continue; -// } + // if ( $context->is_failure() ) { + // echo 'Failed to download ' . $http->url . ': ' . $context->get_error_message(); + // continue; + // } -// switch( $zip->get_filename() ) { -// case 'site-content.wxr': -// $pipe->write( $xml->get_contents() ); -// break; -// } -// } + // // if ( ! str_ends_with( $zip->filename, '.wxr' ) ) { + // // $zip->skip_file(); + // // continue; + // // } -// Pipe::run( [ -// 'http' => new RequestStream( [ /* ... */ ] ), -// 'zip' => new ZipReaderStream( fn ($context) => { -// if(!str_ends_with($context['http']->url, '.zip')) { -// return $context['zip']->skip(); -// } -// } ), -// 'xml' => new XMLProcessorStream(fn ($context) => { -// if(!str_ends_with($context['zip']->filename, '.wxr')) { -// return $context['zip']->skip(); -// } - -// $xml_processor = $context['xml']->get_or_create_processor( $context['zip']->filename ); -// if(!WXR_Processor::is_content_node($xml_processor)) { -// continue; -// } - -// // Migrate URLs and downlaod assets -// }), -// ] ); + // echo 'Saved ' . $http->url . ' to ' . 
$file->file_path; +} From 9143dc04c2be3e815c8212a765f1c318afedb152 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sun, 21 Jul 2024 16:21:43 +0200 Subject: [PATCH 11/72] Implement context hierarchy and skipping upstream files --- pipes.php | 291 +++++++++++++++++++++++++++-------------- rewrite-remote-wxr.php | 48 ++++--- 2 files changed, 220 insertions(+), 119 deletions(-) diff --git a/pipes.php b/pipes.php index 7e1b37c..107d675 100644 --- a/pipes.php +++ b/pipes.php @@ -8,7 +8,8 @@ public function read(): bool; public function is_finished(): bool; public function consume_output(): ?string; public function get_error(): ?string; - public function get_metadata(): ?StreamMetadata; + public function get_context(): ?StreamedResourceContext; + public function on_last_read_file_skipped(); } trait ReadableStreamIterator @@ -22,7 +23,7 @@ public function current(): mixed { } return (object) [ 'bytes' => $this->iterator_output_cache, - 'metadata' => $this->get_metadata(), + 'metadata' => $this->get_context(), ]; } @@ -59,14 +60,24 @@ trait BaseReadableStream { protected $finished = false; protected $error = null; protected $buffer = ''; - protected $metadata = null; + protected $context = null; + protected $skipped_resource_id; public function read(): bool { if ( $this->finished || $this->error ) { return false; } - return $this->doRead(); + $result = $this->doRead(); + if( + $result && + $this->context && + $this->context->get_resource_id() === $this->skipped_resource_id + ) { + $this->consume_output(); + return false; + } + return $result; } abstract protected function doRead(): bool; @@ -75,9 +86,15 @@ public function is_finished(): bool { return $this->finished; } - public function get_metadata(): ?StreamMetadata + public function on_last_read_file_skipped() { + if ($this->context && $this->context->get_resource_id()) { + $this->skipped_resource_id = $this->context->get_resource_id(); + } + } + + public function get_context(): 
?StreamedResourceContext { - return $this->metadata; + return $this->context; } public function consume_output(): ?string { @@ -94,7 +111,7 @@ public function consume_output(): ?string { protected function set_error( string $error ) { $this->error = $error ?: 'unknown error'; $this->finished = true; - $this->metadata = null; + $this->context = null; } public function get_error(): ?string { @@ -103,7 +120,7 @@ public function get_error(): ?string { } interface WritableStream { - public function write( string $data, ?StreamMetadata $metadata=null ): bool; + public function write( string $data, ?StreamedResourceContext $context=null ): bool; public function get_error(): ?string; } @@ -120,21 +137,20 @@ trait BaseTransformStream { trait BaseWritableStream { protected $error = null; - protected $metadata = null; + protected $context = null; - public function write( string $data, ?StreamMetadata $metadata=null ): bool { + public function write( string $data, ?StreamedResourceContext $pipe_context=null ): bool { if ( $this->error ) { return false; } - $this->metadata = $metadata; - return $this->doWrite( $data, $metadata ); + return $this->doWrite( $data, $pipe_context ); } - abstract protected function doWrite( string $data, ?StreamMetadata $metadata ): bool; + abstract protected function doWrite( string $data, ?StreamedResourceContext $context ): bool; protected function set_error( string $error ) { - $this->metadata = null; + $this->context = null; $this->error = $error ?: 'unknown error'; $this->finished = true; } @@ -148,7 +164,7 @@ class BufferStream implements TransformStream { use BaseTransformStream; - protected function doWrite( string $data, ?StreamMetadata $metadata=null ): bool { + protected function doWrite( string $data, ?StreamedResourceContext $context=null ): bool { $this->buffer .= $data; return true; @@ -211,7 +227,7 @@ public function __construct( $node_visitor_callback ) { $this->node_visitor_callback = $node_visitor_callback; } - protected function 
doWrite( string $data, ?StreamMetadata $metadata=null ): bool { + protected function doWrite( string $data, ?StreamedResourceContext $context=null ): bool { $this->xml_processor->stream_append_xml( $data ); return true; @@ -238,7 +254,9 @@ protected function doRead(): bool { if ( $tokens_found > 0 ) { $this->buffer .= $processor->get_updated_xml(); - } else { + } + + if ( $tokens_found === 0 || ! $processor->paused_at_incomplete_token() ) { $this->buffer .= $processor->get_unprocessed_xml(); $this->finished = true; } @@ -258,15 +276,15 @@ public function __construct( $pipe_factory ) { $this->pipe_factory = $pipe_factory; } - protected function doWrite( string $data, ?StreamMetadata $metadata=null ): bool { + protected function doWrite( string $data, ?StreamedResourceContext $pipe_context=null ): bool { // -1 is the default stream ID used whenever we don't have any metadata - $stream_id = $metadata ? $metadata->get_resource_id() : -1; + $stream_id = $pipe_context ? $pipe_context->get_resource_id() : -1; if ( ! 
isset( $this->pipes[ $stream_id ] ) ) { $pipe_factory = $this->pipe_factory; $this->pipes[ $stream_id ] = $pipe_factory(); } - return $this->pipes[ $stream_id ]->write( $data, $metadata ); + return $this->pipes[ $stream_id ]->write( $data, $pipe_context ); } protected function doRead(): bool { @@ -301,7 +319,7 @@ protected function doRead(): bool { } $this->buffer .= $pipe->consume_output(); - $this->metadata = $pipe->get_metadata(); + $this->context = $pipe->get_context(); return true; } @@ -321,7 +339,8 @@ class RequestStream implements ReadableStream { private $client; private $requests = []; - private $requests_metadata = []; + private $child_contexts = []; + private $skipped_requests = []; public function __construct( $requests ) { $this->client = new Client(); @@ -329,7 +348,8 @@ public function __construct( $requests ) { $this->requests = $requests; foreach($requests as $request) { - $this->requests_metadata[$request->id] = new BasicStreamMetadata( + $this->child_contexts[$request->id] = new StreamedResourceContext( + $this, $request->id, $request->url ); @@ -344,7 +364,11 @@ protected function doRead(): bool { } $request = $this->client->get_request(); - $this->metadata = $this->requests_metadata[$request->id]; + if(array_key_exists($request->id, $this->skipped_requests)) { + return false; + } + + $this->context = $this->child_contexts[$request->id]; switch ( $this->client->get_event() ) { case Client::EVENT_BODY_CHUNK_AVAILABLE: $this->buffer .= $this->client->get_response_body_chunk(); @@ -362,7 +386,13 @@ protected function doRead(): bool { return false; } - + + public function on_last_read_file_skipped() + { + if ($this->get_context() && $this->get_context()->get_resource_id()) { + $this->skipped_requests[$this->get_context()->get_resource_id()] = true; + } + } } abstract class StringTransformerStream implements TransformStream { @@ -372,7 +402,7 @@ protected function doRead(): bool { return ! 
empty( $this->buffer ); } - protected function doWrite( string $data, ?StreamMetadata $metadata=null ): bool { + protected function doWrite( string $data, ?StreamedResourceContext $context=null ): bool { $this->buffer .= $this->transform( $data ); return true; @@ -381,6 +411,35 @@ protected function doWrite( string $data, ?StreamMetadata $metadata=null ): bool abstract protected function transform(string $data): ?string; } + +class CallbackStream implements TransformStream { + use BaseTransformStream; + + private $callback; + public function __construct($callback) { + $this->callback = $callback; + } + + protected function doRead(): bool { + return ! empty( $this->buffer ); + } + + protected function doWrite( string $chunk, ?StreamedResourceContext $pipe_context=null ): bool { + $callback = $this->callback; + $result = $callback( $chunk, $pipe_context ); + if(null === $result) { + // skip this chunk + } else if(!is_string($result)) { + $this->set_error("Invalid chunk emitted by CallbackStream's callback (type: ".gettype($result).")"); + return false; + } else { + $this->buffer .= $chunk; + } + return true; + } + +} + class UppercaseTransformer extends StringTransformerStream { protected function transform( string $data ): ?string { return strtoupper( $data ); @@ -413,13 +472,13 @@ protected function doRead(): bool { return ! 
empty( $this->buffer ); } - protected function doWrite( string $data, ?StreamMetadata $metadata=null ): bool { + protected function doWrite( string $data, ?StreamedResourceContext $context=null ): bool { $filter_callback = $this->filter_callback; - if ( $filter_callback( $metadata ) ) { + if ( $filter_callback( $context ) ) { $this->buffer .= $data; } else { $this->buffer = ''; - $this->metadata = null; + $this->context = null; } return true; } @@ -430,7 +489,7 @@ class LocalFileWriter implements WritableStream, ReadableStream { private $filename_factory; private $last_written_chunk; private $buffer; - private $metadata; + private $context; private $fp; use ReadableStreamIterator; @@ -446,11 +505,11 @@ public function __construct($filename_factory) $this->filename_factory = $filename_factory; } - public function write( string $data, ?StreamMetadata $metadata=null ): bool { + public function write( string $data, ?StreamedResourceContext $context=null ): bool { if ( ! $this->fp ) { $filename_factory = $this->filename_factory; - $filename = $filename_factory($metadata); - $this->metadata = new BasicStreamMetadata($filename, $filename); + $filename = $filename_factory($context); + $this->context = new StreamedResourceContext($this, $filename, $filename); // @TODO: we'll need to close this. We could use a close() or cleanup() method here. $this->fp = fopen($filename, 'wb'); } @@ -478,6 +537,11 @@ public function is_finished(): bool { return false; } + public function on_last_read_file_skipped() + { + // Nothing to do + } + public function consume_output(): ?string { if($this->buffer) { $chunk = $this->buffer; @@ -487,38 +551,9 @@ public function consume_output(): ?string { return null; } - public function get_metadata(): ?StreamMetadata - { - return $this->metadata; - } -} - -/** - * Extend this class when more metadata is needed. 
- */ -interface StreamMetadata { - public function get_resource_id(); - public function get_filename(); -} - -class BasicStreamMetadata implements StreamMetadata { - private $resource_id; - private $filename; - - public function __construct($resource_id, $filename=null) + public function get_context(): ?StreamedResourceContext { - $this->resource_id = $resource_id; - $this->filename = $filename; - } - - public function get_resource_id() - { - return $this->resource_id; - } - - public function get_filename() - { - return $this->filename; + return $this->context; } } @@ -536,18 +571,18 @@ public function __construct( $this->factory_function = $factory_function; } - public function write( string $data, ?StreamMetadata $metadata=null ): bool { + public function write( string $data, ?StreamedResourceContext $pipe_context=null ): bool { if ( $this->error ) { return false; } - $resource_id = $metadata ? $metadata->get_resource_id() : 'default'; + $resource_id = $pipe_context ? $pipe_context->get_resource_id() : 'default'; $stream_factory = $this->factory_function; if(!isset($this->stream_instances[$resource_id])) { $this->stream_instances[$resource_id] = $stream_factory(); } $stream = $this->stream_instances[$resource_id]; - $retval = $stream->write( $data, $metadata ); + $retval = $stream->write( $data, $pipe_context ); if ( ! $retval ) { $this->error = $stream->get_error(); } @@ -604,9 +639,9 @@ public function consume_output(): ?string { return $this->last_read_stream ? $this->last_read_stream->consume_output() : null; } - public function get_metadata(): ?StreamMetadata + public function get_context(): ?StreamedResourceContext { - return $this->last_read_stream ? $this->last_read_stream->get_metadata() : null; + return $this->last_read_stream ? 
$this->last_read_stream->get_context() : null; } public function is_finished(): bool { @@ -616,7 +651,7 @@ public function is_finished(): bool { protected $error = null; protected function set_error( string $error ) { - $this->metadata = null; + $this->context = null; $this->error = $error ?: 'unknown error'; $this->finished = true; } @@ -625,6 +660,11 @@ public function get_error(): ?string { return $this->error; } + public function on_last_read_file_skipped() { + if($this->get_context()) { + $this->get_context()->get_stream()->on_last_read_file_skipped(); + } + } } @@ -633,6 +673,7 @@ class Pipe implements ReadableStream, WritableStream { private $stages = []; private $error = null; private $context = null; + private $last_read_from_stage = null; private $finished = false; private $dataBuffer = ''; @@ -648,10 +689,6 @@ public function current(): mixed { $this->iterator_output_cache = $this->consume_output(); } return $this->context; - // (object) [ - // 'bytes' => $this->iterator_output_cache, - // 'metadata' => $this->get_metadata(), - // ]; } static public function run($stages, $options=array()) @@ -679,6 +716,14 @@ static public function from( $stages ) { throw new \InvalidArgumentException( 'Pipe must have at least one stage' ); } + // Shorthand syntax support, use a callback as one of + // the pipe components. + foreach($stages as $k => $v) { + if(is_callable($v)) { + $stages[$k] = new CallbackStream($v); + } + } + $stages_values = array_values($stages); for ( $i = 0; $i < count( $stages_values ) - 1; $i ++ ) { if ( ! 
$stages_values[ $i ] instanceof ReadableStream ) { @@ -700,6 +745,7 @@ private function __construct( $stages ) { $this->stages = array_values($stages); } + private $context_history = []; public function read(): bool { if($this->finished) { return false; @@ -707,9 +753,11 @@ public function read(): bool { $anyDataPiped = false; $stages = $this->stages; - $this->context = new PipeContext(); + $this->context = new StreamedResourceContext($this); + $this->last_read_from_stage = null; for ( $i = 0; $i < count( $stages ) - 1; $i ++ ) { $stage = $stages[ $i ]; + $this->last_read_from_stage = $i; $data = $stage->consume_output(); if ( null === $data ) { @@ -730,19 +778,21 @@ public function read(): bool { continue; } $data = $stage->consume_output(); + if ( null === $data ) { + break; + } } - if ( null === $data ) { - break; + $child_context = $stage->get_context(); + if (null !== $child_context) { + $this->context[$this->stages_keys[$i]] = $child_context; } - $this->context[$this->stages_keys[$i]] = $stage->get_metadata(); - $anyDataPiped = true; $nextStage = $stages[ $i + 1 ]; - if ( ! $nextStage->write( $data, $stage->get_metadata() ) ) { + if ( ! 
$nextStage->write( $data, $this->context ) ) { $this->error = $nextStage->get_error(); - $this->is_finished = true; + $this->finished = true; break; } } @@ -751,7 +801,7 @@ public function read(): bool { $last_stage = $stages[ $last_stage_idx ]; if ( $last_stage instanceof ReadableStream && $last_stage->read() ) { $this->dataBuffer .= $last_stage->consume_output(); - $this->context[$this->stages_keys[$last_stage_idx]] = $last_stage->get_metadata(); + $this->context[$this->stages_keys[$last_stage_idx]] = $last_stage->get_context(); if ( $last_stage->is_finished() ) { $this->finished = true; } @@ -767,13 +817,13 @@ public function read(): bool { return false; } - public function get_context() + public function get_context(): ?StreamedResourceContext { return $this->context; } - public function write( string $data, ?StreamMetadata $metadata = null ): bool { - return $this->stages[0]->write( $data, $metadata ); + public function write( string $data, ?StreamedResourceContext $pipe_context = null ): bool { + return $this->stages[0]->write( $data, $pipe_context ); } public function consume_output(): ?string { @@ -782,12 +832,7 @@ public function consume_output(): ?string { return $data; } - - public function get_metadata(): ?StreamMetadata - { - return $this->stages[ count( $this->stages ) - 1 ]->get_metadata(); - } - + public function is_finished(): bool { return $this->finished; } @@ -795,33 +840,79 @@ public function is_finished(): bool { public function get_error(): ?string { return $this->error; } + + public function on_last_read_file_skipped() { + if (null !== $this->last_read_from_stage) { + for ($i = $this->last_read_from_stage; $i >= 0; $i--) { + $this->stages[$i]->on_last_read_file_skipped(); + } + } + } } -class PipeContext implements ArrayAccess { - private $context = []; +class StreamedResourceContext implements ArrayAccess { + private $child_contexts = []; + private $data = []; + private $stream; + private $resource_id; + private $filename; + private 
$is_skipped; + + public function __construct(ReadableStream $stream, $resource_id = null, $filename=null) + { + $this->stream = $stream; + $this->resource_id = $resource_id; + $this->filename = $filename; + } public function offsetExists($offset): bool { - return isset($this->context[$offset]); + return isset($this->child_contexts[$offset]); } public function offsetGet($offset): mixed { - return $this->context[$offset] ?? null; + return $this->child_contexts[$offset] ?? null; } public function offsetSet($offset, $value): void { - $this->context[$offset] = $value; + $this->child_contexts[$offset] = $value; } public function offsetUnset($offset): void { - unset($this->context[$offset]); + unset($this->child_contexts[$offset]); } - public function skip() - { - + public function get_stream() { + return $this->stream; } + public function skip_file() { + $this->is_skipped = true; + $this->stream->on_last_read_file_skipped(); + } + public function get_resource_id() { + if($this->resource_id) { + return $this->resource_id; + } + foreach($this->child_contexts as $context) { + $resource_id = $context->get_resource_id(); + if($resource_id) { + return $resource_id; + } + } + } + + public function get_filename() { + if($this->filename) { + return $this->filename; + } + foreach($this->child_contexts as $context) { + $filename = $context->get_filename(); + if($filename) { + return $filename; + } + } + } } class BlockMarkupURLRewriteStream extends BlockMarkupURLVisitorStream diff --git a/rewrite-remote-wxr.php b/rewrite-remote-wxr.php index 9e4c0d7..0545265 100644 --- a/rewrite-remote-wxr.php +++ b/rewrite-remote-wxr.php @@ -70,29 +70,39 @@ // @TODO: Implement the commented out parts -$pipe = Pipe::from( [ - 'http' => HttpClient::stream( [ - new Request( 'https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/php.ini' ), - new Request( 'https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/php.ini' ), - new Request( 
'https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/phpcs.xml' ), - new Request( 'https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/phpcs.xml?a' ), - new Request( 'https://raw.githubusercontent.com/WordPress/blueprints/trunk/blueprints/stylish-press/site-content.wxr' ), - ] ), - XML_Processor::stream( $rewrite_links_in_wxr_node ), - // function ($context) { - // if ( ! str_ends_with( $zip->filename, '.wxr' ) ) { - // $zip->skip_file(); - // } - // }, +$pipe = Pipe::run([ + 'http' => HttpClient::stream([ + new Request('https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/php.ini'), + new Request('https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/php.ini'), + new Request('https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/phpcs.xml'), + new Request('https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/phpcs.xml?a'), + new Request('https://raw.githubusercontent.com/WordPress/blueprints/trunk/blueprints/stylish-press/site-content.wxr'), + new Request('https://raw.githubusercontent.com/wpaccessibility/a11y-theme-unit-test/master/a11y-theme-unit-test-data.xml'), + ]), + XML_Processor::stream($rewrite_links_in_wxr_node), + function ($chunk, $context) { + // $context['http'] is guaranteed to be present if there are no + // asynchronous streams between the HttpClient stream and here. + // + // Otherwise, the asynchronous operation may yield new chunks after the + // 'http' stream is finished. + if( ! str_ends_with( $context['http']->get_filename(), '.wxr' ) ) { + $context->skip_file(); + // Don't emit any data + return null; + } + + // Emit unchanged input data + return $chunk; + }, 'file' => LocalFileWriter::stream( fn ($context) => __DIR__ . '/output/' . $context->get_resource_id() . 
'.chunk' ), ] ); -// var_dump($pipe); -foreach($pipe as $context) { - list( 'http' => $http, 'file' => $file ) = $context; +// foreach($pipe as $context) { +// list( 'http' => $http, 'file' => $file ) = $context; // print_r($http); - print_r($file); + // print_r($file); // if ( $context->is_failure() ) { // echo 'Failed to download ' . $http->url . ': ' . $context->get_error_message(); @@ -105,5 +115,5 @@ // // } // echo 'Saved ' . $http->url . ' to ' . $file->file_path; -} +// } From 1a47f0850c9b07f9da37466d226a24cdaf0c38f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sun, 21 Jul 2024 16:22:30 +0200 Subject: [PATCH 12/72] Use the word "file", not "resource" --- pipes.php | 84 +++++++++++++++++++++--------------------- rewrite-remote-wxr.php | 2 +- 2 files changed, 43 insertions(+), 43 deletions(-) diff --git a/pipes.php b/pipes.php index 107d675..9cc2785 100644 --- a/pipes.php +++ b/pipes.php @@ -8,7 +8,7 @@ public function read(): bool; public function is_finished(): bool; public function consume_output(): ?string; public function get_error(): ?string; - public function get_context(): ?StreamedResourceContext; + public function get_context(): ?StreamedFileContext; public function on_last_read_file_skipped(); } @@ -61,7 +61,7 @@ trait BaseReadableStream { protected $error = null; protected $buffer = ''; protected $context = null; - protected $skipped_resource_id; + protected $skipped_file_id; public function read(): bool { if ( $this->finished || $this->error ) { @@ -72,7 +72,7 @@ public function read(): bool { if( $result && $this->context && - $this->context->get_resource_id() === $this->skipped_resource_id + $this->context->get_file_id() === $this->skipped_file_id ) { $this->consume_output(); return false; @@ -87,12 +87,12 @@ public function is_finished(): bool { } public function on_last_read_file_skipped() { - if ($this->context && $this->context->get_resource_id()) { - $this->skipped_resource_id = $this->context->get_resource_id(); 
+ if ($this->context && $this->context->get_file_id()) { + $this->skipped_file_id = $this->context->get_file_id(); } } - public function get_context(): ?StreamedResourceContext + public function get_context(): ?StreamedFileContext { return $this->context; } @@ -120,7 +120,7 @@ public function get_error(): ?string { } interface WritableStream { - public function write( string $data, ?StreamedResourceContext $context=null ): bool; + public function write( string $data, ?StreamedFileContext $context=null ): bool; public function get_error(): ?string; } @@ -139,7 +139,7 @@ trait BaseWritableStream { protected $error = null; protected $context = null; - public function write( string $data, ?StreamedResourceContext $pipe_context=null ): bool { + public function write( string $data, ?StreamedFileContext $pipe_context=null ): bool { if ( $this->error ) { return false; } @@ -147,7 +147,7 @@ public function write( string $data, ?StreamedResourceContext $pipe_context=null return $this->doWrite( $data, $pipe_context ); } - abstract protected function doWrite( string $data, ?StreamedResourceContext $context ): bool; + abstract protected function doWrite( string $data, ?StreamedFileContext $context ): bool; protected function set_error( string $error ) { $this->context = null; @@ -164,7 +164,7 @@ class BufferStream implements TransformStream { use BaseTransformStream; - protected function doWrite( string $data, ?StreamedResourceContext $context=null ): bool { + protected function doWrite( string $data, ?StreamedFileContext $context=null ): bool { $this->buffer .= $data; return true; @@ -227,7 +227,7 @@ public function __construct( $node_visitor_callback ) { $this->node_visitor_callback = $node_visitor_callback; } - protected function doWrite( string $data, ?StreamedResourceContext $context=null ): bool { + protected function doWrite( string $data, ?StreamedFileContext $context=null ): bool { $this->xml_processor->stream_append_xml( $data ); return true; @@ -276,9 +276,9 @@ 
public function __construct( $pipe_factory ) { $this->pipe_factory = $pipe_factory; } - protected function doWrite( string $data, ?StreamedResourceContext $pipe_context=null ): bool { + protected function doWrite( string $data, ?StreamedFileContext $pipe_context=null ): bool { // -1 is the default stream ID used whenever we don't have any metadata - $stream_id = $pipe_context ? $pipe_context->get_resource_id() : -1; + $stream_id = $pipe_context ? $pipe_context->get_file_id() : -1; if ( ! isset( $this->pipes[ $stream_id ] ) ) { $pipe_factory = $this->pipe_factory; $this->pipes[ $stream_id ] = $pipe_factory(); @@ -348,7 +348,7 @@ public function __construct( $requests ) { $this->requests = $requests; foreach($requests as $request) { - $this->child_contexts[$request->id] = new StreamedResourceContext( + $this->child_contexts[$request->id] = new StreamedFileContext( $this, $request->id, $request->url @@ -379,7 +379,7 @@ protected function doRead(): bool { $this->set_error( $request->error ?: 'unknown error' ); break; case Client::EVENT_FINISHED: - // @TODO: Mark this particular resource as finished without + // @TODO: Mark this particular file as finished without // closing the entire Client stream. break; } @@ -389,8 +389,8 @@ protected function doRead(): bool { public function on_last_read_file_skipped() { - if ($this->get_context() && $this->get_context()->get_resource_id()) { - $this->skipped_requests[$this->get_context()->get_resource_id()] = true; + if ($this->get_context() && $this->get_context()->get_file_id()) { + $this->skipped_requests[$this->get_context()->get_file_id()] = true; } } } @@ -402,7 +402,7 @@ protected function doRead(): bool { return ! 
empty( $this->buffer ); } - protected function doWrite( string $data, ?StreamedResourceContext $context=null ): bool { + protected function doWrite( string $data, ?StreamedFileContext $context=null ): bool { $this->buffer .= $this->transform( $data ); return true; @@ -424,7 +424,7 @@ protected function doRead(): bool { return ! empty( $this->buffer ); } - protected function doWrite( string $chunk, ?StreamedResourceContext $pipe_context=null ): bool { + protected function doWrite( string $chunk, ?StreamedFileContext $pipe_context=null ): bool { $callback = $this->callback; $result = $callback( $chunk, $pipe_context ); if(null === $result) { @@ -472,7 +472,7 @@ protected function doRead(): bool { return ! empty( $this->buffer ); } - protected function doWrite( string $data, ?StreamedResourceContext $context=null ): bool { + protected function doWrite( string $data, ?StreamedFileContext $context=null ): bool { $filter_callback = $this->filter_callback; if ( $filter_callback( $context ) ) { $this->buffer .= $data; @@ -505,11 +505,11 @@ public function __construct($filename_factory) $this->filename_factory = $filename_factory; } - public function write( string $data, ?StreamedResourceContext $context=null ): bool { + public function write( string $data, ?StreamedFileContext $context=null ): bool { if ( ! $this->fp ) { $filename_factory = $this->filename_factory; $filename = $filename_factory($context); - $this->context = new StreamedResourceContext($this, $filename, $filename); + $this->context = new StreamedFileContext($this, $filename, $filename); // @TODO: we'll need to close this. We could use a close() or cleanup() method here. 
$this->fp = fopen($filename, 'wb'); } @@ -551,7 +551,7 @@ public function consume_output(): ?string { return null; } - public function get_context(): ?StreamedResourceContext + public function get_context(): ?StreamedFileContext { return $this->context; } @@ -571,17 +571,17 @@ public function __construct( $this->factory_function = $factory_function; } - public function write( string $data, ?StreamedResourceContext $pipe_context=null ): bool { + public function write( string $data, ?StreamedFileContext $pipe_context=null ): bool { if ( $this->error ) { return false; } - $resource_id = $pipe_context ? $pipe_context->get_resource_id() : 'default'; + $file_id = $pipe_context ? $pipe_context->get_file_id() : 'default'; $stream_factory = $this->factory_function; - if(!isset($this->stream_instances[$resource_id])) { - $this->stream_instances[$resource_id] = $stream_factory(); + if(!isset($this->stream_instances[$file_id])) { + $this->stream_instances[$file_id] = $stream_factory(); } - $stream = $this->stream_instances[$resource_id]; + $stream = $this->stream_instances[$file_id]; $retval = $stream->write( $data, $pipe_context ); if ( ! $retval ) { $this->error = $stream->get_error(); @@ -639,7 +639,7 @@ public function consume_output(): ?string { return $this->last_read_stream ? $this->last_read_stream->consume_output() : null; } - public function get_context(): ?StreamedResourceContext + public function get_context(): ?StreamedFileContext { return $this->last_read_stream ? 
$this->last_read_stream->get_context() : null; } @@ -753,7 +753,7 @@ public function read(): bool { $anyDataPiped = false; $stages = $this->stages; - $this->context = new StreamedResourceContext($this); + $this->context = new StreamedFileContext($this); $this->last_read_from_stage = null; for ( $i = 0; $i < count( $stages ) - 1; $i ++ ) { $stage = $stages[ $i ]; @@ -817,12 +817,12 @@ public function read(): bool { return false; } - public function get_context(): ?StreamedResourceContext + public function get_context(): ?StreamedFileContext { return $this->context; } - public function write( string $data, ?StreamedResourceContext $pipe_context = null ): bool { + public function write( string $data, ?StreamedFileContext $pipe_context = null ): bool { return $this->stages[0]->write( $data, $pipe_context ); } @@ -850,18 +850,18 @@ public function on_last_read_file_skipped() { } } -class StreamedResourceContext implements ArrayAccess { +class StreamedFileContext implements ArrayAccess { private $child_contexts = []; private $data = []; private $stream; - private $resource_id; + private $file_id; private $filename; private $is_skipped; - public function __construct(ReadableStream $stream, $resource_id = null, $filename=null) + public function __construct(ReadableStream $stream, $file_id = null, $filename=null) { $this->stream = $stream; - $this->resource_id = $resource_id; + $this->file_id = $file_id; $this->filename = $filename; } @@ -890,14 +890,14 @@ public function skip_file() { $this->stream->on_last_read_file_skipped(); } - public function get_resource_id() { - if($this->resource_id) { - return $this->resource_id; + public function get_file_id() { + if($this->file_id) { + return $this->file_id; } foreach($this->child_contexts as $context) { - $resource_id = $context->get_resource_id(); - if($resource_id) { - return $resource_id; + $file_id = $context->get_file_id(); + if($file_id) { + return $file_id; } } } diff --git a/rewrite-remote-wxr.php 
b/rewrite-remote-wxr.php index 0545265..00cfe08 100644 --- a/rewrite-remote-wxr.php +++ b/rewrite-remote-wxr.php @@ -95,7 +95,7 @@ function ($chunk, $context) { // Emit unchanged input data return $chunk; }, - 'file' => LocalFileWriter::stream( fn ($context) => __DIR__ . '/output/' . $context->get_resource_id() . '.chunk' ), + 'file' => LocalFileWriter::stream( fn ($context) => __DIR__ . '/output/' . $context->get_file_id() . '.chunk' ), ] ); From e360d082f74f0b33318c58169bb351d241974df6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sun, 21 Jul 2024 16:23:06 +0200 Subject: [PATCH 13/72] Use consistent file_* naming for file-related methods --- pipes.php | 36 ++++++++++++++++++------------------ rewrite-remote-wxr.php | 4 ++-- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/pipes.php b/pipes.php index 9cc2785..7ba9730 100644 --- a/pipes.php +++ b/pipes.php @@ -486,7 +486,7 @@ protected function doWrite( string $data, ?StreamedFileContext $context=null ): class LocalFileWriter implements WritableStream, ReadableStream { private $error = null; - private $filename_factory; + private $file_name_factory; private $last_written_chunk; private $buffer; private $context; @@ -494,24 +494,24 @@ class LocalFileWriter implements WritableStream, ReadableStream { use ReadableStreamIterator; - static public function stream( $filename_factory ) { + static public function stream( $file_name_factory ) { return new Demultiplexer( - fn() => new self( $filename_factory ) + fn() => new self( $file_name_factory ) ); } - public function __construct($filename_factory) + public function __construct($file_name_factory) { - $this->filename_factory = $filename_factory; + $this->file_name_factory = $file_name_factory; } public function write( string $data, ?StreamedFileContext $context=null ): bool { if ( ! 
$this->fp ) { - $filename_factory = $this->filename_factory; - $filename = $filename_factory($context); - $this->context = new StreamedFileContext($this, $filename, $filename); + $file_name_factory = $this->file_name_factory; + $file_name = $file_name_factory($context); + $this->context = new StreamedFileContext($this, $file_name, $file_name); // @TODO: we'll need to close this. We could use a close() or cleanup() method here. - $this->fp = fopen($filename, 'wb'); + $this->fp = fopen($file_name, 'wb'); } $this->last_written_chunk = $data; @@ -855,14 +855,14 @@ class StreamedFileContext implements ArrayAccess { private $data = []; private $stream; private $file_id; - private $filename; + private $file_name; private $is_skipped; - public function __construct(ReadableStream $stream, $file_id = null, $filename=null) + public function __construct(ReadableStream $stream, $file_id = null, $file_name=null) { $this->stream = $stream; $this->file_id = $file_id; - $this->filename = $filename; + $this->file_name = $file_name; } public function offsetExists($offset): bool { @@ -902,14 +902,14 @@ public function get_file_id() { } } - public function get_filename() { - if($this->filename) { - return $this->filename; + public function get_file_name() { + if($this->file_name) { + return $this->file_name; } foreach($this->child_contexts as $context) { - $filename = $context->get_filename(); - if($filename) { - return $filename; + $file_name = $context->get_file_name(); + if($file_name) { + return $file_name; } } } diff --git a/rewrite-remote-wxr.php b/rewrite-remote-wxr.php index 00cfe08..a734d65 100644 --- a/rewrite-remote-wxr.php +++ b/rewrite-remote-wxr.php @@ -86,7 +86,7 @@ function ($chunk, $context) { // // Otherwise, the asynchronous operation may yield new chunks after the // 'http' stream is finished. - if( ! str_ends_with( $context['http']->get_filename(), '.wxr' ) ) { + if( ! 
str_ends_with( $context['http']->get_file_name(), '.wxr' ) ) { $context->skip_file(); // Don't emit any data return null; @@ -109,7 +109,7 @@ function ($chunk, $context) { // continue; // } - // // if ( ! str_ends_with( $zip->filename, '.wxr' ) ) { + // // if ( ! str_ends_with( $zip->file_name, '.wxr' ) ) { // // $zip->skip_file(); // // continue; // // } From ceceac52084c32791fda2a734abc5051c180bb47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sun, 21 Jul 2024 23:06:16 +0200 Subject: [PATCH 14/72] Explore Unix-like stdin, stderr, stdout-based piping approach. I'm hoping for a simpler code structure and clearer data flows. --- pipes-unix.php | 207 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 207 insertions(+) create mode 100644 pipes-unix.php diff --git a/pipes-unix.php b/pipes-unix.php new file mode 100644 index 0000000..2a641ea --- /dev/null +++ b/pipes-unix.php @@ -0,0 +1,207 @@ +stdin = $stdin ?? new UnixPipe(); + $process->stdout = $stdout ?? new UnixPipe(); + $process->stderr = $stderr ?? 
new UnixPipe(); + $process->pid = self::$last_pid++; + $process->init(); + self::$process_table[$process->pid] = $process; + return $process; + } + + static public function kill($pid, $code) { + self::$process_table[$pid]->kill($code); + } + + static public function reap($pid) { + self::$reaped_pids[] = $pid; + self::$process_table[$pid]->cleanup(); + unset(self::$process_table[$pid]); + } + + static public function is_reaped($pid) { + return in_array($pid, self::$reaped_pids); + } + +} + +abstract class Process { + public ?int $exit_code = null; + public UnixPipe $stdin; + public UnixPipe $stdout; + public UnixPipe $stderr; + public $pid; + + abstract public function tick($tick_context); + + public function kill($code) { + $this->exit_code = $code; + $this->stdin->close(); + $this->stdout->close(); + $this->stderr->close(); + } + + public function has_crashed() { + return $this->exit_code !== null && $this->exit_code !== 0; + } + + public function is_alive() { + return $this->exit_code === null; + } + + public function init() { + // initialize resources + } + + public function cleanup() { + // clean up resources + } +} + +class UnixPipe { + public string $buffer = ''; + public $metadata = null; + private bool $closed = false; + + public function read() { + $buffer = $this->buffer; + if(!$buffer && $this->closed) { + return false; + } + $this->buffer = ''; + return $buffer; + } + + public function get_metadata() { + return $this->metadata; + } + + public function write(string $data, $metadata=null) { + if($this->closed) { + return false; + } + $this->buffer .= $data; + $this->metadata = $metadata; + } + + public function is_eof() { + return '' === $this->buffer && $this->closed; + } + + public function close() { + $this->closed = true; + } +} + +class HelloWorld extends Process { + public function tick($tick_context) { + $this->stdout->write("Hello, world!", [ + 'file_id' => 1, + ]); + $this->stderr->write("Critical error has occured :("); + $this->kill(1); + } +} 
+ +class Uppercaser extends Process { + public function tick($tick_context) { + if($this->stdin->is_eof()) { + $this->stdout->write('Final chunk'); + $this->kill(0); + return; + } + + $data = $this->stdin->read(); + if ($data) { + $this->stdout->write(strtoupper($data)); + } + } +} + +class Composite extends Process { + public array $process_factories; + public $subprocesses = []; + private $reaped_pids = []; + + public function __construct($process_factories) { + $this->process_factories = $process_factories; + } + + public function init() { + $last_process = null; + $names = array_keys($this->process_factories); + $processes = array_values($this->process_factories); + for($i = 0; $i < count($this->process_factories); $i++) { + if(null === $last_process) { + $stdin = $this->stdin; + } else { + $stdin = $last_process->stdout; + } + if($i === count($this->process_factories) - 1) { + $stdout = $this->stdout; + } else { + $stdout = null; + } + $subprocess = ProcessManager::spawn( + $processes[$i], + $stdin, + $stdout + ); + $this->subprocesses[$names[$i]] = $subprocess; + $last_process = $subprocess; + } + } + + public function tick($tick_context) { + $this->stdout->metadata = null; + foreach ($this->subprocesses as $name => $process) { + $process->tick($tick_context); + if($process->has_crashed()) { + if (!ProcessManager::is_reaped($process->pid)) { + ProcessManager::reap($process->pid); + $this->stderr->write("Process $name has crashed with code {$process->exit_code}", [ + 'reaped' => true, + 'process' => $name, + 'exit_code' => $process->exit_code, + ]); + return; + } else { + continue; + } + } + $metadata = $process->stdout->get_metadata(); + if (null !== $metadata) { + $tick_context[$name] = $metadata; + } + } + $this->stdout->metadata = $tick_context; + if(!$process->is_alive()) { + $this->kill(0); + } + print_r($this->stdout); + } +} + +$process = ProcessManager::spawn(fn () => new Composite([ + 'hello' => fn() => new HelloWorld(), + 'upper' => fn() => new 
Uppercaser() +])); + +$process->tick([]); +echo $process->stdout->read(); +// var_dump($process->stdout->get_metadata()); +$process->tick([]); +var_dump($process->stdout->get_metadata()); +var_dump($process->stderr->read()); +$process->tick([]); +echo $process->stdout->read(); +// var_dump($process->stdout->is_eof()); +// var_dump($process->is_alive()); \ No newline at end of file From 855eb149a672c270b8ce5020b000eb99d614d873 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sun, 21 Jul 2024 23:08:06 +0200 Subject: [PATCH 15/72] Rename Composite to ShellCommandsChain to refer to 'cat | sort' --- pipes-unix.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipes-unix.php b/pipes-unix.php index 2a641ea..2d7982e 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -126,7 +126,7 @@ public function tick($tick_context) { } } -class Composite extends Process { +class ShellCommandsChain extends Process { public array $process_factories; public $subprocesses = []; private $reaped_pids = []; @@ -190,7 +190,7 @@ public function tick($tick_context) { } } -$process = ProcessManager::spawn(fn () => new Composite([ +$process = ProcessManager::spawn(fn () => new ShellCommandsChain([ 'hello' => fn() => new HelloWorld(), 'upper' => fn() => new Uppercaser() ])); From 6e4aae55645d337ff0cf2c363dd62402ba9199a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Mon, 22 Jul 2024 10:05:42 +0200 Subject: [PATCH 16/72] Experiment with propagating errors --- pipes.php | 165 ++++++++++++++++++++++++++++++----------- rewrite-remote-wxr.php | 59 ++++++++------- 2 files changed, 153 insertions(+), 71 deletions(-) diff --git a/pipes.php b/pipes.php index 7ba9730..423fd4a 100644 --- a/pipes.php +++ b/pipes.php @@ -64,6 +64,8 @@ trait BaseReadableStream { protected $skipped_file_id; public function read(): bool { + $this->context = null; + if ( $this->finished || $this->error ) { return false; } @@ -111,7 +113,6 @@ public function 
consume_output(): ?string { protected function set_error( string $error ) { $this->error = $error ?: 'unknown error'; $this->finished = true; - $this->context = null; } public function get_error(): ?string { @@ -279,6 +280,7 @@ public function __construct( $pipe_factory ) { protected function doWrite( string $data, ?StreamedFileContext $pipe_context=null ): bool { // -1 is the default stream ID used whenever we don't have any metadata $stream_id = $pipe_context ? $pipe_context->get_file_id() : -1; + if ( ! isset( $this->pipes[ $stream_id ] ) ) { $pipe_factory = $this->pipe_factory; $this->pipes[ $stream_id ] = $pipe_factory(); @@ -287,6 +289,22 @@ protected function doWrite( string $data, ?StreamedFileContext $pipe_context=nul return $this->pipes[ $stream_id ]->write( $data, $pipe_context ); } + public function on_upstream_error(StreamedFileContext $pipe_context) + { + $stream_id = $pipe_context ? $pipe_context->get_file_id() : -1; + if(!$pipe_context->get_error()) { + return; + } + // Clean up the child stream if needed + if (!isset($this->pipes[$stream_id])) { + return; + } + + $pipe = $this->pipes[$stream_id]; + $pipe->on_upstream_error($pipe_context); + $this->context = $pipe->get_context(); + } + protected function doRead(): bool { if(empty($this->next_read)) { $this->next_read = array_keys($this->pipes); @@ -341,6 +359,7 @@ class RequestStream implements ReadableStream { private $requests = []; private $child_contexts = []; private $skipped_requests = []; + private $errors = []; public function __construct( $requests ) { $this->client = new Client(); @@ -374,18 +393,42 @@ protected function doRead(): bool { $this->buffer .= $this->client->get_response_body_chunk(); return true; case Client::EVENT_FAILED: - // @TODO: Handling errors. - // We don't want to stop everything if one request fails. 
- $this->set_error( $request->error ?: 'unknown error' ); - break; + $this->context->skip_file(); + $this->set_child_error($request->error); + return false; case Client::EVENT_FINISHED: - // @TODO: Mark this particular file as finished without - // closing the entire Client stream. - break; + $this->set_child_finished(); + return false; } return false; } + + protected function set_child_finished() + { + $this->context->set_finished(); + + foreach( $this->child_contexts as $context ) { + if(!$context->is_finished()) { + return; + } + } + + $this->finished = true; + } + + protected function set_child_error($error) + { + $this->context->set_error($error); + + foreach( $this->child_contexts as $context ) { + if(!$context->get_error()) { + return; + } + } + + $this->set_error('All child requests failed'); + } public function on_last_read_file_skipped() { @@ -525,6 +568,7 @@ public function get_error(): ?string { // Temporary workaround to keep the Pipe class working public function read(): bool { + $this->context = null; if($this->last_written_chunk) { $this->buffer = $this->last_written_chunk; $this->last_written_chunk = null; @@ -572,21 +616,17 @@ public function __construct( } public function write( string $data, ?StreamedFileContext $pipe_context=null ): bool { - if ( $this->error ) { - return false; - } - $file_id = $pipe_context ? $pipe_context->get_file_id() : 'default'; $stream_factory = $this->factory_function; if(!isset($this->stream_instances[$file_id])) { $this->stream_instances[$file_id] = $stream_factory(); } $stream = $this->stream_instances[$file_id]; - $retval = $stream->write( $data, $pipe_context ); - if ( ! $retval ) { - $this->error = $stream->get_error(); - } - return $retval; + + // We don't check whether the write succeeded. + // The child streams handle their own errors. 
+ $stream->write( $data, $pipe_context ); + return true; } private $read_queue = []; @@ -612,22 +652,21 @@ public function read(): bool } $stream = array_shift($this->read_queue); - if ( $stream->is_finished() ) { - $index = array_search($stream, $this->stream_instances, true); - unset($this->stream_instances[$index]); - continue; - } - if ($stream->read()) { $this->last_read_stream = $stream; return true; } - if ( $stream->get_error() ) { - $this->error = $stream->get_error(); - $this->is_finished = true; + if ($stream->is_finished()) { + $index = array_search($stream, $this->stream_instances, true); + unset($this->stream_instances[$index]); - return false; + // Don't do anything when the child stream errors out. + // We may soon receive more files to demultiplex so we + // don't want to trash the entire Demultiplexer stream. + // + // Error details are available in the chunk context. + continue; } ++$processed_streams; @@ -648,16 +687,19 @@ public function is_finished(): bool { return count($this->stream_instances) === 0; } - protected $error = null; - - protected function set_error( string $error ) { - $this->context = null; - $this->error = $error ?: 'unknown error'; - $this->finished = true; - } - + /** + * Demultiplexer does not have an error state even if + * all of the child streams error out. This is because + * we don't know upfront whether more files will be streamed + * to the demultiplexer later. + * + * Demultiplexed streams have their own error states and + * the error details are available in the streamed file context. 
+ * + * @return null + */ public function get_error(): ?string { - return $this->error; + return null; } public function on_last_read_file_skipped() { @@ -668,7 +710,7 @@ public function on_last_read_file_skipped() { } -class Pipe implements ReadableStream, WritableStream { +class UnixPipe implements ReadableStream, WritableStream { private $stages_keys = []; private $stages = []; private $error = null; @@ -696,7 +738,7 @@ static public function run($stages, $options=array()) $options = array_merge([ 'buffer_output' => false, ], $options); - $pipe = Pipe::from( $stages ); + $pipe = UnixPipe::from( $stages ); while ( ! $pipe->is_finished() ) { if ( ! $pipe->read() ) { @@ -747,6 +789,8 @@ private function __construct( $stages ) { private $context_history = []; public function read(): bool { + $this->context = null; + if($this->finished) { return false; } @@ -755,6 +799,7 @@ public function read(): bool { $stages = $this->stages; $this->context = new StreamedFileContext($this); $this->last_read_from_stage = null; + $mode = 'pipe_data'; // or pipe_error for ( $i = 0; $i < count( $stages ) - 1; $i ++ ) { $stage = $stages[ $i ]; $this->last_read_from_stage = $i; @@ -764,18 +809,27 @@ public function read(): bool { if ( ! $stage->read() ) { if ( $stage->get_error() ) { $this->error = $stage->get_error(); - $this->is_finished = true; + $this->finished = true; return false; } - if ( $stage->is_finished() ) { + // @TODO pipe the error through the next stages + // so they can clean up resources. + // @TODO separately, output the error from the current pipe + // so that the caller can handle it. + if($stage->get_context() && $stage->get_context()->get_error()) { + // Propagate the error through the next stages + $mode = 'pipe_error'; + } else { + if ($stage->is_finished()) { + continue; + } + + // No data was produced by the stage, let's try again on the next read() call, + // and meanwhile let's see if the rest of the pipe will produce any data. 
continue; } - - // No data was produced by the stage, let's try again on the next read() call, - // and meanwhile let's see if the rest of the pipe will produce any data. - continue; } $data = $stage->consume_output(); if ( null === $data ) { @@ -857,6 +911,8 @@ class StreamedFileContext implements ArrayAccess { private $file_id; private $file_name; private $is_skipped; + private $is_finished; + private $error; public function __construct(ReadableStream $stream, $file_id = null, $file_name=null) { @@ -885,11 +941,30 @@ public function get_stream() { return $this->stream; } + public function set_error($error) { + $this->error = $error; + } + + public function get_error() { + return $this->error; + } + public function skip_file() { $this->is_skipped = true; + $this->set_finished(); $this->stream->on_last_read_file_skipped(); } + public function is_finished() + { + return $this->is_finished; + } + + public function set_finished() + { + $this->is_finished = true; + } + public function get_file_id() { if($this->file_id) { return $this->file_id; @@ -1049,3 +1124,5 @@ function generatorCompose(array $iterators, int $index): Generator { return generatorCompose($iterators, 0); } + + diff --git a/rewrite-remote-wxr.php b/rewrite-remote-wxr.php index a734d65..a5c5df7 100644 --- a/rewrite-remote-wxr.php +++ b/rewrite-remote-wxr.php @@ -53,7 +53,7 @@ $rewrite_links_in_wxr_node = function (WP_XML_Processor $processor) { if (is_wxr_content_node($processor)) { $text = $processor->get_modifiable_text(); - $updated_text = Pipe::get_output([ + $updated_text = UnixPipe::get_output([ new BlockMarkupURLRewriteStream( $text, [ @@ -70,33 +70,38 @@ // @TODO: Implement the commented out parts -$pipe = Pipe::run([ - 'http' => HttpClient::stream([ - new Request('https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/php.ini'), - new Request('https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/php.ini'), - new 
Request('https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/phpcs.xml'), - new Request('https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/phpcs.xml?a'), - new Request('https://raw.githubusercontent.com/WordPress/blueprints/trunk/blueprints/stylish-press/site-content.wxr'), - new Request('https://raw.githubusercontent.com/wpaccessibility/a11y-theme-unit-test/master/a11y-theme-unit-test-data.xml'), - ]), - XML_Processor::stream($rewrite_links_in_wxr_node), - function ($chunk, $context) { - // $context['http'] is guaranteed to be present if there are no - // asynchronous streams between the HttpClient stream and here. - // - // Otherwise, the asynchronous operation may yield new chunks after the - // 'http' stream is finished. - if( ! str_ends_with( $context['http']->get_file_name(), '.wxr' ) ) { - $context->skip_file(); - // Don't emit any data - return null; - } +$pipe = UnixPipe::run( + [ + 'http' => HttpClient::stream([ + new Request('https://raw.githubusercontent.com/WordPress/blueprints-library/trunk/php.ini'), + new Request('https://127.0.0.1:80'), + ]), + XML_Processor::stream($rewrite_links_in_wxr_node), + function ($chunk, $context) { + var_dump(get_class($context)); + // $context['http'] is guaranteed to be present if there are no + // asynchronous streams between the HttpClient stream and here. + // + // Otherwise, the asynchronous operation may yield new chunks after the + // 'http' stream is finished. + if( ! str_ends_with( $context['http']->get_file_name(), '.ini' ) ) { + $context->skip_file(); + // Don't emit any data + return null; + } + + // Emit unchanged input data + return $chunk; + }, + 'file' => LocalFileWriter::stream( fn ($context) => __DIR__ . '/output/' . $context->get_file_id() . 
'.chunk' ), + ], + [ + // 'error_behavior' => Pipe::BREAK_ON_ERROR | Pipe::CONTINUE_ON_ERROR, + 'on_error' => function ($error) { - // Emit unchanged input data - return $chunk; - }, - 'file' => LocalFileWriter::stream( fn ($context) => __DIR__ . '/output/' . $context->get_file_id() . '.chunk' ), -] ); + }, + ] +); // foreach($pipe as $context) { From 6858ca56667eed4bba8a09c62c12197b3b28a6df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Mon, 22 Jul 2024 16:18:42 +0200 Subject: [PATCH 17/72] Propagate both the bytes and the error details, use the pipe eof state to kill processes --- pipes-unix.php | 422 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 374 insertions(+), 48 deletions(-) diff --git a/pipes-unix.php b/pipes-unix.php index 2d7982e..209f9df 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -8,9 +8,9 @@ class ProcessManager { static public function spawn($factory, $stdin=null, $stdout=null, $stderr=null) { $process = $factory(); - $process->stdin = $stdin ?? new UnixPipe(); - $process->stdout = $stdout ?? new UnixPipe(); - $process->stderr = $stderr ?? new UnixPipe(); + $process->stdin = $stdin ?? new MultiChannelPipe(); + $process->stdout = $stdout ?? new MultiChannelPipe(); + $process->stderr = $stderr ?? 
new MultiChannelPipe(); $process->pid = self::$last_pid++; $process->init(); self::$process_table[$process->pid] = $process; @@ -35,12 +35,20 @@ static public function is_reaped($pid) { abstract class Process { public ?int $exit_code = null; - public UnixPipe $stdin; - public UnixPipe $stdout; - public UnixPipe $stderr; + public Pipe $stdin; + public Pipe $stdout; + public Pipe $stderr; public $pid; - abstract public function tick($tick_context); + public function tick($tick_context) { + if(!$this->is_alive()) { + return; + } + + return $this->do_tick($tick_context); + } + + abstract protected function do_tick($tick_context); public function kill($code) { $this->exit_code = $code; @@ -58,16 +66,58 @@ public function is_alive() { } public function init() { - // initialize resources } public function cleanup() { // clean up resources } + + protected function set_write_channel(string $name) + { + $this->stderr->set_channel_for_write($name); + $this->stdout->set_channel_for_write($name); + } + + protected function add_output_channel(string $name) + { + $this->stderr->add_channel($name); + $this->stdout->add_channel($name); + } + + protected function close_output_channel(string $name) + { + $this->stderr->close_channel($name); + $this->stdout->close_channel($name); + } } -class UnixPipe { - public string $buffer = ''; +abstract class TransformProcess extends Process { + protected function do_tick($tick_context) { + if($this->stdin->is_eof()) { + $this->kill(0); + return; + } + + $data = $this->stdin->read(); + if (false === $data) { + return; + } + $this->stdout->write($this->transform($data, $tick_context)); + } + + abstract protected function transform($data, $tick_context); + +} + +interface Pipe { + public function read(); + public function write(string $data, $metadata=null); + public function is_eof(); + public function close(); +} + +class UnixPipe implements Pipe { + public ?string $buffer = null; public $metadata = null; private bool $closed = false; @@ -76,7 
+126,7 @@ public function read() { if(!$buffer && $this->closed) { return false; } - $this->buffer = ''; + $this->buffer = null; return $buffer; } @@ -88,12 +138,15 @@ public function write(string $data, $metadata=null) { if($this->closed) { return false; } + if(null === $this->buffer) { + $this->buffer = ''; + } $this->buffer .= $data; $this->metadata = $metadata; } public function is_eof() { - return '' === $this->buffer && $this->closed; + return null === $this->buffer && $this->closed; } public function close() { @@ -101,8 +154,212 @@ public function close() { } } +/** + * Idea 1: Use multiple pipes to pass multi-band I/O data between processes. + */ +class MultiChannelPipe implements Pipe { + public $metadata; + private array $channels = []; + private ?string $last_read_channel = 'default'; + private ?string $current_channel = 'default'; + + public function __construct() + { + $this->add_channel('default'); + } + + public function add_channel(string $name, $pipe = null) { + $this->channels[$name] = $pipe ?? 
new UnixPipe(); + } + + public function read() { + if (empty($this->channels)) { + return false; + } + + $this->metadata = null; + $channels_to_check = $this->next_channels(); + foreach($channels_to_check as $channel_name) { + $data = $this->channels[$channel_name]->read(); + if ($data === false || $data === null) { + continue; + } + $this->last_read_channel = $this->current_channel = $channel_name; + $this->metadata = $this->channels[$channel_name]->get_metadata(); + return $data; + } + + return null; + } + + private function next_channels() { + $channels_queue = []; + $channel_names = array_keys($this->channels); + $last_read_channel_index = array_search($this->last_read_channel, $channel_names); + if(false === $last_read_channel_index) { + $last_read_channel_index = 0; + } else if($last_read_channel_index > count($channel_names)) { + $last_read_channel_index = count($channel_names) - 1; + } + + $this->last_read_channel = null; + for ($i = 1; $i <= count($channel_names); $i++) { + $key_index = ($last_read_channel_index + $i) % count($channel_names); + $channel_name = $channel_names[$key_index]; + if($this->channels[$channel_name]->is_eof()) { + unset($this->channels[$channel_name]); + continue; + } + $this->last_read_channel = $channel_name; + $channels_queue[] = $channel_name; + } + return $channels_queue; + } + + public function get_metadata() { + return $this->metadata; + } + + public function write(string $data, $metadata = null) { + if (!isset($this->channels[$this->current_channel])) { + return false; + } + + $this->channels[$this->current_channel]->write($data, $metadata); + } + + public function close_channel($channel_name) + { + $this->channels[$channel_name]->close(); + $this->current_channel = null; + } + + public function set_channel_for_write($name) + { + $this->current_channel = $name; + } + + public function has_channel($name) + { + return isset($this->channels[$name]); + } + + public function get_current_channel() + { + return 
$this->current_channel; + } + + public function get_channel_pipe($index) + { + return $this->channels[$index]; + } + + public function is_eof() { + foreach ($this->channels as $pipe) { + if (!$pipe->is_eof()) { + return false; + } + } + return true; + } + + public function close() { + foreach ($this->channels as $pipe) { + $pipe->close(); + } + } +} + +/** + * Idea 2: Use multiple child processes for + * + * We want to keep track of: + * * Stream ID – the sequential byte stream identifier. Multiple streams will produce + * file chunks in an arbitrary order and, when multiplexed, the chunks will be + * interleaved. + * * File ID – the file within that stream. A single stream may contain multiple files, + * but they will always be written sequentially. When multiplexed, one file will + * always be written completely before the next one is started. + * + * When a specific stream errors out, we need to communicate this + * downstream and so the consumer processes can handle the error. + * + * Therefore, we need a separate pipe for each stream ID. Do we also + * need a separate process? Not necessarily. Each process only cares + * about the open-ness or EOF-ness of its input and output pipes, + * not about the actual lifecycle of the other processes. + * + * However, we may want to correlate the same stream ID with stdout and + * stderr streams, in which case intertwining stream ID and process ID + * would be useful. But then we don't have a 1:1 mapping between + * what a data stream does and what a process does. + * + * Let's try these two approach and see where we get with it: + * + * 1. Each process has a multiplexed stdin, stdout, and stderr pipes. + * We do not use non-multiplexed pipes at all. Every process communicates + * "there will be more output to come" by keeping at least one output + * pipe open. Each process makes sure to react to sub-pipe state changes. 
+ * When a read() operation is called and a specific sub-pipe is EOF, + * that process cleans up its sub resources and closes the corresponding + * output sub-pipe. + * 2. Each process has a single input and output pipe. A process + * that produces multiple data stream fakes spawning one child + * process per data stream. The next process gets multiple input + * pipes, but no actual access to the child processes of the first + * process. Then, it may spawn its own child processes. Hm. But that + * just sounds a multi-pipe solution with extra steps. + */ +class FakeHttpClient extends Process +{ + protected const SIDE_EFFECTS = true; + + public function init() + { + $this->close_output_channel('default'); + } + + protected function do_tick($tick_context) + { + static $tick_nb = 0; + if (++$tick_nb === 1) { + $this->add_output_channel('stream_1'); + $this->set_write_channel('stream_1'); + $this->stdout->write("stream-1-chunk-1", [ + 'file_id' => 1, + ]); + + $this->add_output_channel('stream_2'); + $this->set_write_channel('stream_2'); + $this->stdout->write("stream-2-chunk-1!", [ + 'file_id' => 2, + ]); + } else if (++$tick_nb === 2) { + $this->set_write_channel('stream_3'); + $this->stdout->write("stream-3-chunk-1!"); + } else { + $this->set_write_channel('stream_1'); + $this->stdout->write("stream-1-chunk-2", [ + 'file_id' => 1, + ]); + $this->stdout->write("stream-1-chunk-3", [ + 'file_id' => 3, + ]); + + $this->add_output_channel('stream_3'); + $this->set_write_channel('stream_3'); + $this->stdout->write("stream-3-chunk-2!", [ + 'file_id' => 2, + ]); + + $this->kill(0); + } + } +} + + class HelloWorld extends Process { - public function tick($tick_context) { + protected function do_tick($tick_context) { $this->stdout->write("Hello, world!", [ 'file_id' => 1, ]); @@ -111,21 +368,62 @@ public function tick($tick_context) { } } -class Uppercaser extends Process { - public function tick($tick_context) { +class Uppercaser extends TransformProcess { + protected 
function transform($data, $tick_context) { + return strtoupper($data); + } +} + +class Demultiplexer extends Process { + private $process_factory = []; + public $subprocesses = []; + private $killed_subprocesses = []; + public function __construct($process_factory) { + $this->process_factory = $process_factory; + } + + protected function do_tick($tick_context) { if($this->stdin->is_eof()) { - $this->stdout->write('Final chunk'); $this->kill(0); return; } - $data = $this->stdin->read(); - if ($data) { - $this->stdout->write(strtoupper($data)); + $next_chunk = $this->stdin->read(); + if(false === $next_chunk) { + return; + } + + $input_channel = $this->stdin->get_current_channel(); + if(!isset($this->subprocesses[$input_channel])) { + $this->add_output_channel($input_channel); + $this->subprocesses[$input_channel] = ProcessManager::spawn( + $this->process_factory + ); + } + + $subprocess = $this->subprocesses[$input_channel]; + $subprocess->stdin->write( $next_chunk, $this->stdin->get_metadata() ); + $subprocess->tick($tick_context); + + $output = $subprocess->stdout->read(); + if(null !== $output && false !== $output) { + $this->set_write_channel($input_channel); + $this->stdout->write($output, $subprocess->stdout->get_metadata()); + } + + if (!$subprocess->is_alive()) { + if($subprocess->has_crashed()) { + $this->stderr->write("Subprocess $input_channel has crashed with code {$subprocess->exit_code}", [ + 'type' => 'crash', + 'process' => $subprocess, + ]); + } + $this->close_output_channel($input_channel); } } } + class ShellCommandsChain extends Process { public array $process_factories; public $subprocesses = []; @@ -145,63 +443,91 @@ public function init() { } else { $stdin = $last_process->stdout; } - if($i === count($this->process_factories) - 1) { - $stdout = $this->stdout; - } else { - $stdout = null; - } $subprocess = ProcessManager::spawn( $processes[$i], $stdin, - $stdout + null, + $this->stderr ); $this->subprocesses[$names[$i]] = $subprocess; 
$last_process = $subprocess; } } - public function tick($tick_context) { - $this->stdout->metadata = null; + protected function do_tick($tick_context) { foreach ($this->subprocesses as $name => $process) { - $process->tick($tick_context); + if ($process->is_alive()) { + $process->tick($tick_context); + } + + if(!$process->stdout->is_eof()) { + $metadata = $process->stdout->get_metadata(); + if (null !== $metadata) { + $tick_context[$name] = $metadata; + } + } + if($process->has_crashed()) { if (!ProcessManager::is_reaped($process->pid)) { ProcessManager::reap($process->pid); $this->stderr->write("Process $name has crashed with code {$process->exit_code}", [ + 'type' => 'crash', + 'process' => $process, 'reaped' => true, - 'process' => $name, - 'exit_code' => $process->exit_code, ]); return; - } else { - continue; } } - $metadata = $process->stdout->get_metadata(); - if (null !== $metadata) { - $tick_context[$name] = $metadata; - } } - $this->stdout->metadata = $tick_context; - if(!$process->is_alive()) { + + $data = $process->stdout->read(); + if(null !== $data && false !== $data) { + $this->stdout->write($data, $tick_context); + } + + if($process->stdout->is_eof()) { $this->kill(0); } - print_r($this->stdout); } } $process = ProcessManager::spawn(fn () => new ShellCommandsChain([ - 'hello' => fn() => new HelloWorld(), - 'upper' => fn() => new Uppercaser() + 'http' => fn() => new FakeHttpClient(), + // 'uc' => fn() => new Uppercaser(), + 'upper' => fn() => new Demultiplexer(fn() => new Uppercaser()) ])); -$process->tick([]); -echo $process->stdout->read(); +$i = 0; + +do { + $process->tick([]); + + $data = $process->stdout->read(); + if(is_string($data)) { + echo 'Data: ' . $data . "\n"; + } + + $error = $process->stderr->read(); + if ($error) { + echo 'Error: ' . $error . "\n"; + $meta = $process->stderr->get_metadata(); + if ($meta['type'] ?? '' === 'crash') { + $child_error = $meta['process']->stderr->read(); + if ($child_error) { + echo 'CRASH: ' . 
$meta['process']->stderr->read() . "\n"; + } + } + } +} while ($process->is_alive()); + +// $process->tick([]); +// var_dump($process->stdout->read()); + +// var_dump($process->stdout->get_metadata()); +// $process->tick([]); // var_dump($process->stdout->get_metadata()); -$process->tick([]); -var_dump($process->stdout->get_metadata()); -var_dump($process->stderr->read()); -$process->tick([]); -echo $process->stdout->read(); +// var_dump($process->stderr->read()); +// $process->tick([]); +// echo $process->stdout->read(); // var_dump($process->stdout->is_eof()); // var_dump($process->is_alive()); \ No newline at end of file From aab911b70c310046e79c5c30de6b3c9017eb1851 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Mon, 22 Jul 2024 16:24:30 +0200 Subject: [PATCH 18/72] Add a missing null check --- pipes-unix.php | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pipes-unix.php b/pipes-unix.php index 209f9df..c8ccdb6 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -99,7 +99,7 @@ protected function do_tick($tick_context) { } $data = $this->stdin->read(); - if (false === $data) { + if (null === $data || false === $data) { return; } $this->stdout->write($this->transform($data, $tick_context)); @@ -389,7 +389,7 @@ protected function do_tick($tick_context) { } $next_chunk = $this->stdin->read(); - if(false === $next_chunk) { + if(null === $next_chunk || false === $next_chunk) { return; } @@ -493,8 +493,8 @@ protected function do_tick($tick_context) { $process = ProcessManager::spawn(fn () => new ShellCommandsChain([ 'http' => fn() => new FakeHttpClient(), - // 'uc' => fn() => new Uppercaser(), - 'upper' => fn() => new Demultiplexer(fn() => new Uppercaser()) + 'uc' => fn() => new Uppercaser(), + // 'upper' => fn() => new Demultiplexer(fn() => new Uppercaser()) ])); $i = 0; From 1760251950a56a408e7cc0b32cb410ff27d8451b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Mon, 22 Jul 2024 18:31:52 +0200 
Subject: [PATCH 19/72] Ability to reassign stdin and stdout --- pipes-unix.php | 112 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 91 insertions(+), 21 deletions(-) diff --git a/pipes-unix.php b/pipes-unix.php index c8ccdb6..b3e215b 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -114,9 +114,10 @@ public function read(); public function write(string $data, $metadata=null); public function is_eof(); public function close(); + public function get_metadata(); } -class UnixPipe implements Pipe { +class BufferingPipe implements Pipe { public ?string $buffer = null; public $metadata = null; private bool $closed = false; @@ -154,6 +155,64 @@ public function close() { } } +class ResourcePipe implements Pipe { + public $resource; + private bool $closed = false; + + public function __construct($resource) { + $this->resource = $resource; + } + + public function read() { + if($this->closed) { + return false; + } + $data = fread($this->resource, 1024); + if(false === $data) { + $this->close(); + return false; + } + + if('' === $data) { + if(feof($this->resource)) { + $this->close(); + } + return null; + } + + return $data; + } + + public function write(string $data, $metadata=null) { + if($this->closed) { + return false; + } + fwrite($this->resource, $data); + } + + public function get_metadata() { + return null; + } + + public function is_eof() { + return $this->closed; + } + + public function close() { + if($this->closed) { + return; + } + fclose($this->resource); + $this->closed = true; + } +} + +class FilePipe extends ResourcePipe { + public function __construct($filename, $mode) { + parent::__construct(fopen($filename, $mode)); + } +} + /** * Idea 1: Use multiple pipes to pass multi-band I/O data between processes. */ @@ -169,7 +228,7 @@ public function __construct() } public function add_channel(string $name, $pipe = null) { - $this->channels[$name] = $pipe ?? new UnixPipe(); + $this->channels[$name] = $pipe ?? 
new BufferingPipe(); } public function read() { @@ -424,8 +483,10 @@ protected function do_tick($tick_context) { } -class ShellCommandsChain extends Process { +class ProcessChain extends Process { public array $process_factories; + private $first_subprocess; + private $last_subprocess; public $subprocesses = []; private $reaped_pids = []; @@ -438,23 +499,29 @@ public function init() { $names = array_keys($this->process_factories); $processes = array_values($this->process_factories); for($i = 0; $i < count($this->process_factories); $i++) { - if(null === $last_process) { - $stdin = $this->stdin; - } else { - $stdin = $last_process->stdout; - } $subprocess = ProcessManager::spawn( $processes[$i], - $stdin, + null !== $last_process ?$last_process->stdout : null, null, $this->stderr ); $this->subprocesses[$names[$i]] = $subprocess; $last_process = $subprocess; } + + $this->first_subprocess = $this->subprocesses[$names[0]]; + $this->last_subprocess = $this->subprocesses[$names[count($this->process_factories) - 1]]; } protected function do_tick($tick_context) { + $data = $this->stdin->read(); + if (null !== $data && false !== $data) { + $this->first_subprocess->stdin->write($data, $this->stdin->get_metadata()); + } + if($this->stdin->is_eof()) { + $this->first_subprocess->stdin->close(); + } + foreach ($this->subprocesses as $name => $process) { if ($process->is_alive()) { $process->tick($tick_context); @@ -480,32 +547,35 @@ protected function do_tick($tick_context) { } } - $data = $process->stdout->read(); + $data = $this->last_subprocess->stdout->read(); if(null !== $data && false !== $data) { $this->stdout->write($data, $tick_context); } - if($process->stdout->is_eof()) { + if($this->last_subprocess->stdout->is_eof()) { $this->kill(0); } } } -$process = ProcessManager::spawn(fn () => new ShellCommandsChain([ - 'http' => fn() => new FakeHttpClient(), - 'uc' => fn() => new Uppercaser(), - // 'upper' => fn() => new Demultiplexer(fn() => new Uppercaser()) -])); +$process 
= ProcessManager::spawn( + fn () => new ProcessChain([ + // 'http' => fn() => new FakeHttpClient(), + 'uc' => fn() => new Uppercaser(), + // 'upper' => fn() => new Demultiplexer(fn() => new Uppercaser()) + ]) +); +$process->stdin = new FilePipe('./file', 'r'); +$process->stdout = new FilePipe('php://stdout', 'w'); $i = 0; - do { $process->tick([]); - $data = $process->stdout->read(); - if(is_string($data)) { - echo 'Data: ' . $data . "\n"; - } + // $data = $process->stdout->read(); + // if(is_string($data)) { + // echo 'Data: ' . $data . "\n"; + // } $error = $process->stderr->read(); if ($error) { From 14950cd2d3cafa4c4fbe3a96e0ac0968b79f691f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Mon, 22 Jul 2024 20:54:23 +0200 Subject: [PATCH 20/72] Add streaming ZIP reader and the ability to skip files in the upstream streams --- export.wxr.zip | Bin 0 -> 414 bytes pipes-unix.php | 321 +++++++++++++++++++++++++++++++++++--- zip-stream-reader.php | 352 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 651 insertions(+), 22 deletions(-) create mode 100644 export.wxr.zip create mode 100644 zip-stream-reader.php diff --git a/export.wxr.zip b/export.wxr.zip new file mode 100644 index 0000000000000000000000000000000000000000..fc6e7e78f6d3fc8251cd107b3175c43bebc4484b GIT binary patch literal 414 zcmWIWW@Zs#U|`^2XkYRzf~U30Ngv3w17a=)8HUt~g8ZTqz4D5p&=5`r=6TEJr7Z#C z(h6<{MwV}k3=Cksn@{R^p540H^W1rD&$HSaz0P`{^Y`@Kc-rIa$&=RuHO}jw^4xgQ zefrjm?5`g^Vqyq@Tlsn^s+HV8E0go{N>cMm^eS?5z;;@$oRM& E0P{F??f?J) literal 0 HcmV?d00001 diff --git a/pipes-unix.php b/pipes-unix.php index b3e215b..4d0f5b6 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -1,5 +1,8 @@ is_alive()) { return; } - return $this->do_tick($tick_context); + return $this->do_tick($tick_context ?? 
[]); } abstract protected function do_tick($tick_context); @@ -72,12 +75,27 @@ public function cleanup() { // clean up resources } + public function skip_file($file_id) { + // Needs to be implemented by subclasses + return false; + } + protected function set_write_channel(string $name) { $this->stderr->set_channel_for_write($name); $this->stdout->set_channel_for_write($name); } + protected function ensure_output_channel(string $name) + { + if(!$this->stderr->has_channel($name)) { + $this->stderr->add_channel($name); + } + if(!$this->stdout->has_channel($name)) { + $this->stdout->add_channel($name); + } + } + protected function add_output_channel(string $name) { $this->stderr->add_channel($name); @@ -102,7 +120,11 @@ protected function do_tick($tick_context) { if (null === $data || false === $data) { return; } - $this->stdout->write($this->transform($data, $tick_context)); + $transformed = $this->transform($data, $tick_context); + if (null === $transformed || false === $transformed) { + return; + } + $this->stdout->write($transformed, $this->stdin->get_metadata()); } abstract protected function transform($data, $tick_context); @@ -117,11 +139,16 @@ public function close(); public function get_metadata(); } -class BufferingPipe implements Pipe { +class BufferPipe implements Pipe { public ?string $buffer = null; public $metadata = null; private bool $closed = false; + public function __construct($buffer = null) + { + $this->buffer = $buffer; + } + public function read() { $buffer = $this->buffer; if(!$buffer && $this->closed) { @@ -228,7 +255,7 @@ public function __construct() } public function add_channel(string $name, $pipe = null) { - $this->channels[$name] = $pipe ?? new BufferingPipe(); + $this->channels[$name] = $pipe ?? 
new BufferPipe(); } public function read() { @@ -433,10 +460,23 @@ protected function transform($data, $tick_context) { } } +class CallbackProcess extends TransformProcess { + private $callback; + public function __construct($callback) { + $this->callback = $callback; + } + + protected function transform($data, $tick_context) { + $callback = $this->callback; + return $callback($data, $tick_context, $this); + } +} + class Demultiplexer extends Process { private $process_factory = []; public $subprocesses = []; private $killed_subprocesses = []; + private $last_subprocess = []; public function __construct($process_factory) { $this->process_factory = $process_factory; } @@ -463,6 +503,7 @@ protected function do_tick($tick_context) { $subprocess = $this->subprocesses[$input_channel]; $subprocess->stdin->write( $next_chunk, $this->stdin->get_metadata() ); $subprocess->tick($tick_context); + $this->last_subprocess = $subprocess; $output = $subprocess->stdout->read(); if(null !== $output && false !== $output) { @@ -480,8 +521,103 @@ protected function do_tick($tick_context) { $this->close_output_channel($input_channel); } } + + public function skip_file($file_id) + { + if(!$this->last_subprocess) { + return false; + } + return $this->last_subprocess->skip_file($file_id); + } } +require __DIR__ . 
'/zip-stream-reader.php'; + +class ZipReaderProcess extends Process { + + private $reader; + private $last_skipped_file = null; + + public function init() { + $this->reader = new ZipStreamReader(''); + } + + public function skip_file($file_id) + { + $this->last_skipped_file = $file_id; + } + + protected function do_tick($tick_context) { + if($this->stdin->is_eof()) { + $this->kill(0); + return; + } + + $bytes = $this->stdin->read(); + if(null === $bytes || false === $bytes) { + return; + } + + $this->reader->append_bytes($bytes); + while ($this->reader->next()) { + switch($this->reader->get_state()) { + case ZipStreamReader::STATE_FILE_ENTRY: + $file_path = $this->reader->get_file_path(); + if($this->last_skipped_file === $file_path) { + break; + } + $this->ensure_output_channel($file_path); + $this->set_write_channel($file_path); + $this->stdout->write($this->reader->get_file_body_chunk(), [ + 'file_id' => $file_path + ]); + break; + } + } + } +} + +class TickContext implements ArrayAccess { + private $data; + private $process; + + public function offsetExists($offset): bool { + $this->get_metadata(); + return isset($this->data[$offset]); + } + + public function offsetGet($offset): mixed { + $this->get_metadata(); + return $this->data[$offset] ?? 
null; + } + + public function offsetSet($offset, $value): void { + $this->data[$offset] = $value; + } + + public function offsetUnset($offset): void { + unset($this->data[$offset]); + } + + public function __construct($process) + { + $this->process = $process; + } + + public function get_metadata() + { + if(null === $this->data) { + $this->data = $this->process->stdout->get_metadata(); + } + return $this->data; + } + + public function skip_file($file_id) + { + return $this->process->skip_file($file_id); + } + +} class ProcessChain extends Process { public array $process_factories; @@ -528,10 +664,7 @@ protected function do_tick($tick_context) { } if(!$process->stdout->is_eof()) { - $metadata = $process->stdout->get_metadata(); - if (null !== $metadata) { - $tick_context[$name] = $metadata; - } + $tick_context[$name] = new TickContext($process); } if($process->has_crashed()) { @@ -558,25 +691,169 @@ protected function do_tick($tick_context) { } } + +class HttpClientProcess extends Process { + private $client; + private $requests = []; + private $child_contexts = []; + private $skipped_requests = []; + private $errors = []; + + public function __construct( $requests ) { + $this->client = new Client(); + $this->client->enqueue( $requests ); + } + + protected function do_tick($tick_context) + { + if ( ! $this->client->await_next_event() ) { + var_dump('nope'); + $this->kill(0); + return false; + } + + $request = $this->client->get_request(); + $output_channel = 'request_' . $request->id; + $this->ensure_output_channel($output_channel); + + var_dump($this->client->get_event()); + + switch ( $this->client->get_event() ) { + case Client::EVENT_BODY_CHUNK_AVAILABLE: + $this->set_write_channel($output_channel); + $this->stdout->write($this->client->get_response_body_chunk(), [ + 'request' => $request + ]); + break; + case Client::EVENT_FAILED: + $this->stderr->write('Request failed: ' . 
$request->error, [ + 'request' => $request + ]); + $this->close_output_channel($output_channel); + break; + case Client::EVENT_FINISHED: + $this->close_output_channel($output_channel); + break; + } + } + +} + + +class XMLProcess extends TransformProcess { + private $xml_processor; + private $node_visitor_callback; + + public function __construct( $node_visitor_callback ) { + $this->xml_processor = new WP_XML_Processor( '', [], WP_XML_Processor::IN_PROLOG_CONTEXT ); + $this->node_visitor_callback = $node_visitor_callback; + } + + protected function transform($data, $tick_context) + { + $processor = $this->xml_processor; + if ( $processor->get_last_error() ) { + $this->kill(1); + $this->stderr->write( $processor->get_last_error() ); + return false; + } + + $processor->stream_append_xml( $data ); + + $tokens_found = 0; + while ( $processor->next_token() ) { + ++ $tokens_found; + $node_visitor_callback = $this->node_visitor_callback; + $node_visitor_callback( $processor ); + } + + $buffer = ''; + if ( $tokens_found > 0 ) { + $buffer .= $processor->get_updated_xml(); + } else if ( $tokens_found === 0 || ! $processor->paused_at_incomplete_token() ) { + $buffer .= $processor->get_unprocessed_xml(); + } + + return $buffer; + } + +} + + +function is_wxr_content_node( WP_XML_Processor $processor ) { + if ( ! in_array( 'item', $processor->get_breadcrumbs() ) ) { + return false; + } + if ( + ! in_array( 'excerpt:encoded', $processor->get_breadcrumbs() ) + && ! in_array( 'content:encoded', $processor->get_breadcrumbs() ) + && ! in_array( 'wp:attachment_url', $processor->get_breadcrumbs() ) + && ! in_array( 'guid', $processor->get_breadcrumbs() ) + && ! in_array( 'link', $processor->get_breadcrumbs() ) + && ! in_array( 'wp:comment_content', $processor->get_breadcrumbs() ) + // Meta values are not suppoerted yet. 
We'll need to support + // WordPress core options that may be saved as JSON, PHP Deserialization, and XML, + // and then provide extension points for plugins authors support + // their own options. + // !in_array('wp:postmeta', $processor->get_breadcrumbs()) + ) { + return false; + } + + switch ( $processor->get_token_type() ) { + case '#text': + case '#cdata-section': + return true; + } + + return false; +}; + +$rewrite_links_in_wxr_node = function (WP_XML_Processor $processor) { + if (is_wxr_content_node($processor)) { + $text = $processor->get_modifiable_text(); + $updated_text = 'Hey there, what\'s up?'; + if ($updated_text !== $text) { + $processor->set_modifiable_text($updated_text); + } + } +}; + +require __DIR__ . '/bootstrap.php'; + + + + + $process = ProcessManager::spawn( - fn () => new ProcessChain([ - // 'http' => fn() => new FakeHttpClient(), - 'uc' => fn() => new Uppercaser(), - // 'upper' => fn() => new Demultiplexer(fn() => new Uppercaser()) + fn() => new ProcessChain([ + 'http' => fn() => new HttpClientProcess([ + new Request('http://127.0.0.1:9864/export.wxr.zip'), + ]), + 'zip' => fn() => new ZipReaderProcess(), + 'skip' => fn() => new CallbackProcess(function ($data, $context, $process) { + if($context['zip']['file_id'] === 'content.xml') { + $context['zip']->skip_file('content.xml'); + return null; + } + return $data; + }), + // 'xml' => fn() => new XMLProcess($rewrite_links_in_wxr_node), + // 'uc' => fn() => new Uppercaser(), + 'xml' => fn() => new Demultiplexer(fn() => new XMLProcess($rewrite_links_in_wxr_node)) ]) ); -$process->stdin = new FilePipe('./file', 'r'); +// $process->stdin = new BufferPipe('hello, world'); +// $process->stdin = new FilePipe('./test.zip', 'r'); $process->stdout = new FilePipe('php://stdout', 'w'); $i = 0; do { - $process->tick([]); - - // $data = $process->stdout->read(); - // if(is_string($data)) { - // echo 'Data: ' . $data . 
"\n"; - // } + $process->tick(); + log_process_chain_errors($process); +} while ($process->is_alive()); +function log_process_chain_errors($process) { $error = $process->stderr->read(); if ($error) { echo 'Error: ' . $error . "\n"; @@ -587,8 +864,8 @@ protected function do_tick($tick_context) { echo 'CRASH: ' . $meta['process']->stderr->read() . "\n"; } } - } -} while ($process->is_alive()); + } +} // $process->tick([]); // var_dump($process->stdout->read()); diff --git a/zip-stream-reader.php b/zip-stream-reader.php new file mode 100644 index 0000000..893d088 --- /dev/null +++ b/zip-stream-reader.php @@ -0,0 +1,352 @@ +zip = $bytes; + } + + public function append_bytes($bytes) + { + $this->zip = substr($this->zip, $this->bytes_parsed_so_far) . $bytes; + $this->bytes_parsed_so_far = 0; + } + + public function paused_at_incomplete_token() { + return $this->paused_incomplete_input; + } + + public function get_state() + { + return $this->state; + } + + public function get_header() + { + return $this->header; + } + + public function get_file_path() + { + if(!$this->header) { + return null; + } + + return $this->header['path']; + } + + public function get_file_body_chunk() + { + return $this->file_body_chunk; + } + + public function get_error_message() + { + return $this->error_message; + } + + public function next() + { + do { + if(self::STATE_SCAN === $this->state) { + if(false === $this->scan()) { + return false; + } + } + + switch ($this->state) { + case self::STATE_ERROR: + case self::STATE_COMPLETE: + return false; + + case self::STATE_FILE_ENTRY: + if (false === $this->read_file_entry()) { + return false; + } + break; + + case self::STATE_CENTRAL_DIRECTORY_ENTRY: + if (false === $this->read_central_directory_entry()) { + return false; + } + break; + + case self::STATE_END_CENTRAL_DIRECTORY_ENTRY: + if (false === $this->read_end_central_directory_entry()) { + return false; + } + break; + + default: + return false; + } + } while (self::STATE_SCAN === $this->state); 
+ + return true; + } + + private function read_central_directory_entry() + { + if ($this->header && !empty($this->header['path'])) { + $this->header = null; + $this->state = self::STATE_SCAN; + return; + } + + if (!$this->header) { + $data = $this->consume_bytes(42); + if ($data === false) { + $this->paused_incomplete_input = true; + return false; + } + $this->header = unpack( + 'vversionCreated/vversionNeeded/vgeneralPurpose/vcompressionMethod/vlastModifiedTime/vlastModifiedDate/Vcrc/VcompressedSize/VuncompressedSize/vpathLength/vextraLength/vfileCommentLength/vdiskNumber/vinternalAttributes/VexternalAttributes/VfirstByteAt', + $data + ); + } + + if($this->header) { + $n = $this->header['pathLength'] + $this->header['extraLength'] + $this->header['fileCommentLength']; + if (strlen($this->zip) < $this->bytes_parsed_so_far + $n) { + $this->paused_incomplete_input = true; + return false; + } + + $this->header['path'] = $this->consume_bytes($this->header['pathLength']); + $this->header['extra'] = $this->consume_bytes($this->header['extraLength']); + $this->header['fileComment'] = $this->consume_bytes($this->header['fileCommentLength']); + if(!$this->header['path']) { + $this->set_error('Empty path in central directory entry'); + } + } + } + + private function read_end_central_directory_entry() + { + if ($this->header && ( !empty($this->header['comment']) || 0 === $this->header['commentLength'] )) { + $this->header = null; + $this->state = self::STATE_SCAN; + return; + } + + if(!$this->header) { + $data = $this->consume_bytes(18); + if ($data === false) { + $this->paused_incomplete_input = true; + return false; + } + $this->header = unpack( + 'vdiskNumber/vcentralDirectoryStartDisk/vnumberCentralDirectoryRecordsOnThisDisk/vnumberCentralDirectoryRecords/VcentralDirectorySize/VcentralDirectoryOffset/vcommentLength', + $data + ); + } + + if($this->header && empty($this->header['comment']) && $this->header['commentLength'] > 0) { + $comment = 
$this->consume_bytes($this->header['commentLength']); + if(false === $comment) { + $this->paused_incomplete_input = true; + return false; + } + $this->header['comment'] = $comment; + } + } + + private function scan() { + $signature = $this->consume_bytes(4); + if ($signature === false) { + $this->paused_incomplete_input = true; + return false; + } + $signature = unpack('V', $signature)[1]; + switch($signature) { + case self::SIGNATURE_FILE: + $this->state = self::STATE_FILE_ENTRY; + break; + case self::SIGNATURE_CENTRAL_DIRECTORY: + $this->state = self::STATE_CENTRAL_DIRECTORY_ENTRY; + break; + case self::SIGNATURE_CENTRAL_DIRECTORY_END: + $this->state = self::STATE_END_CENTRAL_DIRECTORY_ENTRY; + break; + default: + $this->set_error('Invalid signature ' . $signature); + return false; + } + } + + /** + * Reads a file entry from a zip file. + * + * The file entry is structured as follows: + * + * ``` + * Offset Bytes Description + * 0 4 Local file header signature = 0x04034b50 (PK♥♦ or "PK\3\4") + * 4 2 Version needed to extract (minimum) + * 6 2 General purpose bit flag + * 8 2 Compression method; e.g. 
none = 0, DEFLATE = 8 (or "\0x08\0x00") + * 10 2 File last modification time + * 12 2 File last modification date + * 14 4 CRC-32 of uncompressed data + * 18 4 Compressed size (or 0xffffffff for ZIP64) + * 22 4 Uncompressed size (or 0xffffffff for ZIP64) + * 26 2 File name length (n) + * 28 2 Extra field length (m) + * 30 n File name + * 30+n m Extra field + * ``` + * + * @param resource $stream + */ + private function read_file_entry() + { + if (null === $this->header) { + $data = $this->consume_bytes(26); + if ($data === false) { + $this->paused_incomplete_input = true; + return false; + } + $this->header = unpack( + 'vversionNeeded/vgeneralPurpose/vcompressionMethod/vlastModifiedTime/vlastModifiedDate/Vcrc/VcompressedSize/VuncompressedSize/vpathLength/vextraLength', + $data + ); + $this->file_compressed_bytes_read_so_far = 0; + } + + if($this->header && empty($this->header['path'])) { + $n = $this->header['pathLength'] + $this->header['extraLength']; + if(strlen($this->zip) < $this->bytes_parsed_so_far + $n) { + $this->paused_incomplete_input = true; + return false; + } + + $this->header['path'] = $this->consume_bytes($this->header['pathLength']); + $this->header['extra'] = $this->consume_bytes($this->header['extraLength']); + if($this->header['compressionMethod'] === self::COMPRESSION_DEFLATE) { + $this->inflate_handle = inflate_init(ZLIB_ENCODING_RAW); + } + } + + if(false === $this->read_file_entry_body_chunk()) { + return false; + } + } + + private function read_file_entry_body_chunk() { + $this->file_body_chunk = null; + + $file_body_bytes_left = $this->header['compressedSize'] - $this->file_compressed_bytes_read_so_far; + if($file_body_bytes_left === 0) { + $this->header = null; + $this->inflate_handle = null; + $this->file_compressed_bytes_read_so_far = 0; + $this->state = self::STATE_SCAN; + return; + } + + if(strlen($this->zip) === $this->bytes_parsed_so_far) { + $this->paused_incomplete_input = true; + return false; + } + + $chunk_size = min(8096, 
$file_body_bytes_left); + $compressed_bytes = substr($this->zip, $this->bytes_parsed_so_far, $chunk_size); + $this->bytes_parsed_so_far += strlen($compressed_bytes); + $this->file_compressed_bytes_read_so_far += strlen($compressed_bytes); + + if ($this->header['compressionMethod'] === self::COMPRESSION_DEFLATE) { + $uncompressed_bytes = inflate_add($this->inflate_handle, $compressed_bytes); + if ( $uncompressed_bytes === false || inflate_get_status( $this->inflate_handle ) === false ) { + $this->set_error('Failed to inflate'); + return false; + } + } else { + $uncompressed_bytes = $compressed_bytes; + } + + $this->file_body_chunk = $uncompressed_bytes; + } + + private function set_error($message) { + $this->state = self::STATE_ERROR; + $this->error_message = $message; + $this->paused_incomplete_input = false; + } + + private function consume_bytes($n) { + if(strlen($this->zip) < $this->bytes_parsed_so_far + $n) { + return false; + } + + $bytes = substr($this->zip, $this->bytes_parsed_so_far, $n); + $this->bytes_parsed_so_far += $n; + return $bytes; + } + +} + +if (RUN_ZIP_SMOKE_TEST) { + $fp = fopen('./test.zip', 'r'); + $reader = new ZipStreamReader(fread($fp, 2048)); + while (true) { + while ($reader->next()) { + $header = $reader->get_header(); + echo "Reader state: " . $reader->get_state() . " "; + switch ($reader->get_state()) { + case ZipStreamReader::STATE_FILE_ENTRY: + echo $header['path']; + break; + + case ZipStreamReader::STATE_CENTRAL_DIRECTORY_ENTRY: + echo $header['path']; + break; + + case ZipStreamReader::STATE_END_CENTRAL_DIRECTORY_ENTRY: + echo 'End of central directory'; + break; + + case ZipStreamReader::STATE_COMPLETE: + echo 'Complete'; + break; + } + echo "\n"; + } + if ($reader->paused_at_incomplete_token()) { + if (feof($fp)) { + break; + } + $reader->append_bytes(fread($fp, 1024)); + } + if (ZipStreamReader::STATE_ERROR === $reader->get_state()) { + echo 'Error: ' . $reader->get_error_message() . 
"\n"; + break; + } + } + fclose($fp); +} From 827b28dcd0aaf02a3dfca3a2b0c5c65f632f646f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Mon, 22 Jul 2024 21:00:12 +0200 Subject: [PATCH 21/72] Make Demultiplexer work with multiple zip files --- pipes-unix.php | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pipes-unix.php b/pipes-unix.php index 4d0f5b6..5fb8853 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -124,6 +124,8 @@ protected function do_tick($tick_context) { if (null === $transformed || false === $transformed) { return; } + $this->ensure_output_channel($this->stdin->get_current_channel()); + $this->set_write_channel($this->stdin->get_current_channel()); $this->stdout->write($transformed, $this->stdin->get_metadata()); } @@ -822,9 +824,6 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { require __DIR__ . '/bootstrap.php'; - - - $process = ProcessManager::spawn( fn() => new ProcessChain([ 'http' => fn() => new HttpClientProcess([ @@ -838,9 +837,8 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { } return $data; }), - // 'xml' => fn() => new XMLProcess($rewrite_links_in_wxr_node), - // 'uc' => fn() => new Uppercaser(), - 'xml' => fn() => new Demultiplexer(fn() => new XMLProcess($rewrite_links_in_wxr_node)) + 'xml' => fn() => new Demultiplexer(fn() => new XMLProcess($rewrite_links_in_wxr_node)), + 'uc' => fn() => new Uppercaser(), ]) ); // $process->stdin = new BufferPipe('hello, world'); From fb0a44e15e10b5eef46bda27e07e2adef87b33cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Mon, 22 Jul 2024 21:34:56 +0200 Subject: [PATCH 22/72] Use ::stream methods to declare a pipe --- pipes-unix.php | 48 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/pipes-unix.php b/pipes-unix.php index 5fb8853..18c622b 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -457,6 +457,9 @@ protected function 
do_tick($tick_context) { } class Uppercaser extends TransformProcess { + static public function stream() { + return fn() => new static(); + } protected function transform($data, $tick_context) { return strtoupper($data); } @@ -464,7 +467,12 @@ protected function transform($data, $tick_context) { class CallbackProcess extends TransformProcess { private $callback; - public function __construct($callback) { + + static public function stream($callback) { + return fn () => new CallbackProcess($callback); + } + + private function __construct($callback) { $this->callback = $callback; } @@ -540,6 +548,10 @@ class ZipReaderProcess extends Process { private $reader; private $last_skipped_file = null; + static public function stream() { + return fn () => new Demultiplexer(fn() => new ZipReaderProcess()); + } + public function init() { $this->reader = new ZipStreamReader(''); } @@ -635,10 +647,15 @@ public function __construct($process_factories) { public function init() { $last_process = null; $names = array_keys($this->process_factories); + foreach($names as $k => $name) { + $names[$k] = $name . 
''; + } + $processes = array_values($this->process_factories); for($i = 0; $i < count($this->process_factories); $i++) { + $factory = $processes[$i]; $subprocess = ProcessManager::spawn( - $processes[$i], + $factory, null !== $last_process ?$last_process->stdout : null, null, $this->stderr @@ -701,7 +718,11 @@ class HttpClientProcess extends Process { private $skipped_requests = []; private $errors = []; - public function __construct( $requests ) { + static public function stream($requests) { + return fn () => new HttpClientProcess($requests); + } + + private function __construct( $requests ) { $this->client = new Client(); $this->client->enqueue( $requests ); } @@ -746,7 +767,13 @@ class XMLProcess extends TransformProcess { private $xml_processor; private $node_visitor_callback; - public function __construct( $node_visitor_callback ) { + static public function stream($node_visitor_callback) { + return fn () => new Demultiplexer(fn () => + new XMLProcess($node_visitor_callback) + ); + } + + private function __construct( $node_visitor_callback ) { $this->xml_processor = new WP_XML_Processor( '', [], WP_XML_Processor::IN_PROLOG_CONTEXT ); $this->node_visitor_callback = $node_visitor_callback; } @@ -826,23 +853,22 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { $process = ProcessManager::spawn( fn() => new ProcessChain([ - 'http' => fn() => new HttpClientProcess([ + HttpClientProcess::stream([ new Request('http://127.0.0.1:9864/export.wxr.zip'), ]), - 'zip' => fn() => new ZipReaderProcess(), - 'skip' => fn() => new CallbackProcess(function ($data, $context, $process) { + + 'zip' => ZipReaderProcess::stream(), + CallbackProcess::stream(function ($data, $context, $process) { if($context['zip']['file_id'] === 'content.xml') { $context['zip']->skip_file('content.xml'); return null; } return $data; }), - 'xml' => fn() => new Demultiplexer(fn() => new XMLProcess($rewrite_links_in_wxr_node)), - 'uc' => fn() => new Uppercaser(), + 
XMLProcess::stream($rewrite_links_in_wxr_node), + Uppercaser::stream(), ]) ); -// $process->stdin = new BufferPipe('hello, world'); -// $process->stdin = new FilePipe('./test.zip', 'r'); $process->stdout = new FilePipe('php://stdout', 'w'); $i = 0; From 71b9be730a472822b41aee9254a7061a5344decd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Mon, 22 Jul 2024 21:37:49 +0200 Subject: [PATCH 23/72] Accept process instance in ProcessManager::spawn() --- pipes-unix.php | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipes-unix.php b/pipes-unix.php index 18c622b..1c7d706 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -9,8 +9,8 @@ class ProcessManager { static private $process_table = []; static private $reaped_pids = []; - static public function spawn($factory, $stdin=null, $stdout=null, $stderr=null) { - $process = $factory(); + static public function spawn($factory_or_process, $stdin=null, $stdout=null, $stderr=null) { + $process = $factory_or_process instanceof Process ? $factory_or_process : $factory_or_process(); $process->stdin = $stdin ?? new MultiChannelPipe(); $process->stdout = $stdout ?? new MultiChannelPipe(); $process->stderr = $stderr ?? new MultiChannelPipe(); @@ -852,7 +852,7 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { $process = ProcessManager::spawn( - fn() => new ProcessChain([ + new ProcessChain([ HttpClientProcess::stream([ new Request('http://127.0.0.1:9864/export.wxr.zip'), ]), From a3f93af808f86076a36c6e31fc69fda10b042a52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Mon, 22 Jul 2024 21:40:19 +0200 Subject: [PATCH 24/72] Add $process->run(); method to exhaust the entire stdin stream. Probably a bad idea in a general case as it could get stuck in an infinite loop if the input is async and we need to context switch to load more data. 
--- pipes-unix.php | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/pipes-unix.php b/pipes-unix.php index 1c7d706..fbc93c1 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -43,6 +43,15 @@ abstract class Process { public Pipe $stderr; public $pid; + public function run() + { + do { + $this->tick(); + // @TODO: Implement error handling + log_process_chain_errors($this); + } while ($this->is_alive()); + } + public function tick($tick_context=null) { if(!$this->is_alive()) { return; @@ -870,12 +879,7 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { ]) ); $process->stdout = new FilePipe('php://stdout', 'w'); - -$i = 0; -do { - $process->tick(); - log_process_chain_errors($process); -} while ($process->is_alive()); +$process->run(); function log_process_chain_errors($process) { $error = $process->stderr->read(); From c5163363a85ce092f114ef4403e3595fb31dcec6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Mon, 22 Jul 2024 21:55:30 +0200 Subject: [PATCH 25/72] Add a long list of todos --- pipes-unix.php | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/pipes-unix.php b/pipes-unix.php index fbc93c1..937287b 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -1,5 +1,41 @@ is_eof()` and then + * whether `stdin->read()` is valid. Can we simplify this boilerplate somehow? + * * Explore a shared "Streamable" interface for all stream processors (HTML, XML, ZIP, HTTP, etc.) + * * Get rid of ProcessManager + * * Get rid of stderr. We don't need it to be a stream. A single $error field + bubbling should do. 
+ * * Remove these methods: set_write_channel, ensure_output_channel, add_output_channel, close_output_channel + * * Explore semantic updates to metadata: + * * Exposing metadata on a stream instance instead of a pipe + * * Not writing bytes to a pipe but writing a new Chunk($bytes, $metadata) object to tightly couple the two + * * Demultiplexing modes: per input channel, per $metadata['file_id']. + * * Figure out interop between Pipe and MultiChannelPipe – they are not interchangeable. Maybe + * we could use metadata to pass the channel name, and the regular pipe would ignore it? + * Maybe a MultiChannelPipe would just have special semantics for that metadata field? + * And it would keep track of eofs etc using a set of internal Pipe instances? + * * Calling get_metadata() without calling read() first returns the last metadata. This + * bit me a few times when I was in a context where I could not call read() first because, + * e.g. another process was about to do that. Maybe this is a good thing, as it forces us + * to split a pipe in two whenever an intermediate read is involved, e.g. Process A wouldn't + * just connect its stdin to a subprocess A.1, but it would read from stdin, read metadata, + * do processing, and only then write to A.1 stdin. Still, better error reporting wouldn't hurt. + * * Declare `bool` return type everywhere where it's missing. We may even remove it later for PHP BC, + * but let's still add it for a moment just to make sure we're not missing any typed return. + * * Should Process::tick() return a boolean? Or is it fine if it doesn't return anything? + * * Pipe::read() returns a string on success, false on failure, or null if there were no writes + * since the last read and we'd just return an empty string. This three-state semantics is useful, + * but it's painful to always check for false and null, and then it may not interop well with + * PHP streams where fread() never returns null. Let's think this through some more. 
+ */ + use WordPress\AsyncHttp\Client; use WordPress\AsyncHttp\Request; From a57c2aaf4c713c886a770218a3c2746446642de3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Mon, 22 Jul 2024 23:52:33 +0200 Subject: [PATCH 26/72] Remove set_write_channel and other similar methods --- pipes-unix.php | 367 +++++++++++++++++++++++-------------------------- 1 file changed, 172 insertions(+), 195 deletions(-) diff --git a/pipes-unix.php b/pipes-unix.php index 937287b..aeeda53 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -11,8 +11,9 @@ * whether `stdin->read()` is valid. Can we simplify this boilerplate somehow? * * Explore a shared "Streamable" interface for all stream processors (HTML, XML, ZIP, HTTP, etc.) * * Get rid of ProcessManager - * * Get rid of stderr. We don't need it to be a stream. A single $error field + bubbling should do. - * * Remove these methods: set_write_channel, ensure_output_channel, add_output_channel, close_output_channel + * * ✅ Get rid of stderr. We don't need it to be a stream. A single $error field + bubbling should do. + * Let's keep stderr after all. + * * ✅ Remove these methods: set_write_channel, ensure_output_channel, add_output_channel, close_output_channel * * Explore semantic updates to metadata: * * Exposing metadata on a stream instance instead of a pipe * * Not writing bytes to a pipe but writing a new Chunk($bytes, $metadata) object to tightly couple the two @@ -36,6 +37,71 @@ * PHP streams where fread() never returns null. Let's think this through some more. */ +/** + * ## Demultiplexing modes: per input channel, per $metadata['file_id']. + * + * We want to keep track of: + * * Stream ID – the sequential byte stream identifier. Multiple streams will produce + * file chunks in an arbitrary order and, when multiplexed, the chunks will be + * interleaved. + * * File ID – the file within that stream. A single stream may contain multiple files, + * but they will always be written sequentially. 
When multiplexed, one file will + * always be written completely before the next one is started. + * + * When a specific stream errors out, we need to communicate this + * downstream so the consumer processes can handle the error. + * + * Therefore, we need a separate pipe for each stream ID. Do we also + * need a separate process? Not necessarily. Each process only cares + * about the open-ness or EOF-ness of its input and output pipes, + * not about the actual lifecycle of the other processes. + * + * However, we may want to correlate the same stream ID with stdout and + * stderr streams, in which case intertwining stream ID and process ID + * would be useful. But then we don't have a 1:1 mapping between + * what a data stream does and what a process does. + * + * Let's try these two approaches and see where we get with it: + * + * 1. Each process has multiplexed stdin, stdout, and stderr pipes. + * We do not use non-multiplexed pipes at all. Every process communicates + * "there will be more output to come" by keeping at least one output + * pipe open. Each process makes sure to react to sub-pipe state changes. + * When a read() operation is called and a specific sub-pipe is EOF, + * that process cleans up its sub resources and closes the corresponding + * output sub-pipe. + * 2. Each process has a single input and output pipe. A process + * that produces multiple data streams fakes spawning one child + * process per data stream. The next process gets multiple input + * pipes, but no actual access to the child processes of the first + * process. Then, it may spawn its own child processes. Hm. But that + * just sounds like a multi-pipe solution with extra steps. + */ + + +/** + * ## Get rid of stderr. We don't need it to be a stream. A single $error field + bubbling should do. + * + * Maybe stderr is fine after all? I'm no longer convinced about inventing a separate mechanism + * for error propagation. 
We'd have to implement a lot of the same features that stderr already + * has. + * + * Advantages of using stderr for propagating errors: + * + * * We can bubble up multiple errors from a single process. + * * They have metadata attached and are traceable to a specific process. + * * Piping to stderr doesn't imply the entire process has crashed, which we + * wouldn't want in case of, say, Demultiplexer. + * * We clearly know when the errors are done, as stderr is a stream and we know + * when it's EOF. + * * We can put any pipe in place of stderr, e.g. a generic logger pipe + * + * Disadvantages: + * + * * Pipes have more features than error propagation uses, e.g. we rarely care + * for is_eof() on stderr, but we still have to close that errors pipe. + */ + use WordPress\AsyncHttp\Client; use WordPress\AsyncHttp\Request; @@ -49,7 +115,7 @@ static public function spawn($factory_or_process, $stdin=null, $stdout=null, $st $process = $factory_or_process instanceof Process ? $factory_or_process : $factory_or_process(); $process->stdin = $stdin ?? new MultiChannelPipe(); $process->stdout = $stdout ?? new MultiChannelPipe(); - $process->stderr = $stderr ?? new MultiChannelPipe(); + $process->stderr = $stderr ?? 
new BufferPipe(); $process->pid = self::$last_pid++; $process->init(); self::$process_table[$process->pid] = $process; @@ -125,33 +191,6 @@ public function skip_file($file_id) { return false; } - protected function set_write_channel(string $name) - { - $this->stderr->set_channel_for_write($name); - $this->stdout->set_channel_for_write($name); - } - - protected function ensure_output_channel(string $name) - { - if(!$this->stderr->has_channel($name)) { - $this->stderr->add_channel($name); - } - if(!$this->stdout->has_channel($name)) { - $this->stdout->add_channel($name); - } - } - - protected function add_output_channel(string $name) - { - $this->stderr->add_channel($name); - $this->stdout->add_channel($name); - } - - protected function close_output_channel(string $name) - { - $this->stderr->close_channel($name); - $this->stdout->close_channel($name); - } } abstract class TransformProcess extends Process { @@ -161,17 +200,21 @@ protected function do_tick($tick_context) { return; } - $data = $this->stdin->read(); - if (null === $data || false === $data) { - return; - } - $transformed = $this->transform($data, $tick_context); - if (null === $transformed || false === $transformed) { - return; + while (true) { + $data = $this->stdin->read(); + if (null === $data || false === $data) { + break; + } + $transformed = $this->transform($data, $tick_context); + if (null === $transformed || false === $transformed) { + break; + } + if (!$this->stdout->has_channel($this->stdin->get_current_channel())) { + $this->stdout->add_channel($this->stdin->get_current_channel()); + } + $this->stdout->set_write_channel($this->stdin->get_current_channel()); + $this->stdout->write($transformed, $this->stdin->get_metadata()); } - $this->ensure_output_channel($this->stdin->get_current_channel()); - $this->set_write_channel($this->stdin->get_current_channel()); - $this->stdout->write($transformed, $this->stdin->get_metadata()); } abstract protected function transform($data, $tick_context); @@ 
-302,7 +345,11 @@ public function __construct() } public function add_channel(string $name, $pipe = null) { + if(isset($this->channels[$name])) { + return false; + } $this->channels[$name] = $pipe ?? new BufferPipe(); + return true; } public function read() { @@ -359,15 +406,16 @@ public function write(string $data, $metadata = null) { } $this->channels[$this->current_channel]->write($data, $metadata); + return true; } public function close_channel($channel_name) { - $this->channels[$channel_name]->close(); $this->current_channel = null; + return $this->channels[$channel_name]->close(); } - public function set_channel_for_write($name) + public function set_write_channel($name) { $this->current_channel = $name; } @@ -403,103 +451,6 @@ public function close() { } } -/** - * Idea 2: Use multiple child processes for - * - * We want to keep track of: - * * Stream ID – the sequential byte stream identifier. Multiple streams will produce - * file chunks in an arbitrary order and, when multiplexed, the chunks will be - * interleaved. - * * File ID – the file within that stream. A single stream may contain multiple files, - * but they will always be written sequentially. When multiplexed, one file will - * always be written completely before the next one is started. - * - * When a specific stream errors out, we need to communicate this - * downstream and so the consumer processes can handle the error. - * - * Therefore, we need a separate pipe for each stream ID. Do we also - * need a separate process? Not necessarily. Each process only cares - * about the open-ness or EOF-ness of its input and output pipes, - * not about the actual lifecycle of the other processes. - * - * However, we may want to correlate the same stream ID with stdout and - * stderr streams, in which case intertwining stream ID and process ID - * would be useful. But then we don't have a 1:1 mapping between - * what a data stream does and what a process does. 
- * - * Let's try these two approach and see where we get with it: - * - * 1. Each process has a multiplexed stdin, stdout, and stderr pipes. - * We do not use non-multiplexed pipes at all. Every process communicates - * "there will be more output to come" by keeping at least one output - * pipe open. Each process makes sure to react to sub-pipe state changes. - * When a read() operation is called and a specific sub-pipe is EOF, - * that process cleans up its sub resources and closes the corresponding - * output sub-pipe. - * 2. Each process has a single input and output pipe. A process - * that produces multiple data stream fakes spawning one child - * process per data stream. The next process gets multiple input - * pipes, but no actual access to the child processes of the first - * process. Then, it may spawn its own child processes. Hm. But that - * just sounds a multi-pipe solution with extra steps. - */ -class FakeHttpClient extends Process -{ - protected const SIDE_EFFECTS = true; - - public function init() - { - $this->close_output_channel('default'); - } - - protected function do_tick($tick_context) - { - static $tick_nb = 0; - if (++$tick_nb === 1) { - $this->add_output_channel('stream_1'); - $this->set_write_channel('stream_1'); - $this->stdout->write("stream-1-chunk-1", [ - 'file_id' => 1, - ]); - - $this->add_output_channel('stream_2'); - $this->set_write_channel('stream_2'); - $this->stdout->write("stream-2-chunk-1!", [ - 'file_id' => 2, - ]); - } else if (++$tick_nb === 2) { - $this->set_write_channel('stream_3'); - $this->stdout->write("stream-3-chunk-1!"); - } else { - $this->set_write_channel('stream_1'); - $this->stdout->write("stream-1-chunk-2", [ - 'file_id' => 1, - ]); - $this->stdout->write("stream-1-chunk-3", [ - 'file_id' => 3, - ]); - - $this->add_output_channel('stream_3'); - $this->set_write_channel('stream_3'); - $this->stdout->write("stream-3-chunk-2!", [ - 'file_id' => 2, - ]); - - $this->kill(0); - } - } -} - - -class HelloWorld 
extends Process { - protected function do_tick($tick_context) { - $this->stdout->write("Hello, world!", [ - 'file_id' => 1, - ]); - $this->stderr->write("Critical error has occured :("); - $this->kill(1); - } -} class Uppercaser extends TransformProcess { static public function stream() { @@ -542,38 +493,43 @@ protected function do_tick($tick_context) { return; } - $next_chunk = $this->stdin->read(); - if(null === $next_chunk || false === $next_chunk) { - return; - } + while (true) { + $next_chunk = $this->stdin->read(); + if (null === $next_chunk || false === $next_chunk) { + break; + } - $input_channel = $this->stdin->get_current_channel(); - if(!isset($this->subprocesses[$input_channel])) { - $this->add_output_channel($input_channel); - $this->subprocesses[$input_channel] = ProcessManager::spawn( - $this->process_factory - ); - } + $input_channel = $this->stdin->get_current_channel(); + if (!isset($this->subprocesses[$input_channel])) { + $this->stdout->add_channel($input_channel); + $this->subprocesses[$input_channel] = ProcessManager::spawn( + $this->process_factory + ); + } - $subprocess = $this->subprocesses[$input_channel]; - $subprocess->stdin->write( $next_chunk, $this->stdin->get_metadata() ); - $subprocess->tick($tick_context); - $this->last_subprocess = $subprocess; + $subprocess = $this->subprocesses[$input_channel]; + $subprocess->stdin->write($next_chunk, $this->stdin->get_metadata()); + $subprocess->tick($tick_context); + $this->last_subprocess = $subprocess; - $output = $subprocess->stdout->read(); - if(null !== $output && false !== $output) { - $this->set_write_channel($input_channel); - $this->stdout->write($output, $subprocess->stdout->get_metadata()); - } + $output = $subprocess->stdout->read(); + if (null !== $output && false !== $output) { + $this->stdout->set_write_channel($input_channel); + $this->stdout->write($output, $subprocess->stdout->get_metadata()); + } - if (!$subprocess->is_alive()) { - if($subprocess->has_crashed()) { - 
$this->stderr->write("Subprocess $input_channel has crashed with code {$subprocess->exit_code}", [ - 'type' => 'crash', - 'process' => $subprocess, - ]); + if (!$subprocess->is_alive()) { + if ($subprocess->has_crashed()) { + $this->stderr->write( + "Subprocess $input_channel has crashed with code {$subprocess->exit_code}", + [ + 'type' => 'crash', + 'process' => $subprocess, + ] + ); + } + $this->stdout->close_channel($input_channel); } - $this->close_output_channel($input_channel); } } @@ -612,25 +568,29 @@ protected function do_tick($tick_context) { return; } - $bytes = $this->stdin->read(); - if(null === $bytes || false === $bytes) { - return; - } + while (true) { + $bytes = $this->stdin->read(); + if (null === $bytes || false === $bytes) { + break; + } - $this->reader->append_bytes($bytes); - while ($this->reader->next()) { - switch($this->reader->get_state()) { - case ZipStreamReader::STATE_FILE_ENTRY: - $file_path = $this->reader->get_file_path(); - if($this->last_skipped_file === $file_path) { + $this->reader->append_bytes($bytes); + while ($this->reader->next()) { + switch ($this->reader->get_state()) { + case ZipStreamReader::STATE_FILE_ENTRY: + $file_path = $this->reader->get_file_path(); + if ($this->last_skipped_file === $file_path) { + break; + } + if (!$this->stdout->has_channel($file_path)) { + $this->stdout->add_channel($file_path); + } + $this->stdout->set_write_channel($file_path); + $this->stdout->write($this->reader->get_file_body_chunk(), [ + 'file_id' => $file_path + ]); break; - } - $this->ensure_output_channel($file_path); - $this->set_write_channel($file_path); - $this->stdout->write($this->reader->get_file_body_chunk(), [ - 'file_id' => $file_path - ]); - break; + } } } } @@ -701,9 +661,7 @@ public function init() { $factory = $processes[$i]; $subprocess = ProcessManager::spawn( $factory, - null !== $last_process ?$last_process->stdout : null, - null, - $this->stderr + null !== $last_process ?$last_process->stdout : null ); 
$this->subprocesses[$names[$i]] = $subprocess; $last_process = $subprocess; @@ -741,6 +699,19 @@ protected function do_tick($tick_context) { ]); return; } + continue; + } + + while (true) { + $err = $process->stderr->read(); + if (null === $err || false === $err) { + break; + } + $this->stderr->write($err, [ + 'type' => 'error', + 'process' => $process, + ...$process->stderr->get_metadata(), + ]); } } @@ -775,32 +746,31 @@ private function __construct( $requests ) { protected function do_tick($tick_context) { if ( ! $this->client->await_next_event() ) { - var_dump('nope'); $this->kill(0); return false; } $request = $this->client->get_request(); $output_channel = 'request_' . $request->id; - $this->ensure_output_channel($output_channel); - - var_dump($this->client->get_event()); + if (!$this->stdout->has_channel($output_channel)) { + $this->stdout->add_channel($output_channel); + } + $this->stdout->set_write_channel($output_channel); switch ( $this->client->get_event() ) { case Client::EVENT_BODY_CHUNK_AVAILABLE: - $this->set_write_channel($output_channel); $this->stdout->write($this->client->get_response_body_chunk(), [ 'request' => $request ]); break; - case Client::EVENT_FAILED: + case Client::EVENT_FAILED: $this->stderr->write('Request failed: ' . 
$request->error, [ 'request' => $request ]); - $this->close_output_channel($output_channel); + $this->stdout->close_channel($output_channel); break; case Client::EVENT_FINISHED: - $this->close_output_channel($output_channel); + $this->stdout->close_channel($output_channel); break; } } @@ -900,6 +870,8 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { new ProcessChain([ HttpClientProcess::stream([ new Request('http://127.0.0.1:9864/export.wxr.zip'), + // Bad request, will fail: + new Request('http://127.0.0.1:9865'), ]), 'zip' => ZipReaderProcess::stream(), @@ -912,12 +884,17 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { }), XMLProcess::stream($rewrite_links_in_wxr_node), Uppercaser::stream(), - ]) + ]), ); $process->stdout = new FilePipe('php://stdout', 'w'); +$process->stderr = new FilePipe('php://stderr', 'w'); $process->run(); function log_process_chain_errors($process) { + if(!($process->stderr instanceof BufferPipe)) { + return; + } + $error = $process->stderr->read(); if ($error) { echo 'Error: ' . $error . "\n"; From e14f7bb59215017e1c338c152031de0fa6e7828e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 23 Jul 2024 00:07:11 +0200 Subject: [PATCH 27/72] Remove ProcessManager --- pipes-unix.php | 130 +++++++++++++++++++++++-------------------------- 1 file changed, 62 insertions(+), 68 deletions(-) diff --git a/pipes-unix.php b/pipes-unix.php index aeeda53..522e5a4 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -10,7 +10,7 @@ * * The process `do_tick` method typically checks for `stdin->is_eof()` and then * whether `stdin->read()` is valid. Can we simplify this boilerplate somehow? * * Explore a shared "Streamable" interface for all stream processors (HTML, XML, ZIP, HTTP, etc.) - * * Get rid of ProcessManager + * * ✅ Get rid of ProcessManager * * ✅ Get rid of stderr. We don't need it to be a stream. A single $error field + bubbling should do. * Let's keep stderr after all. 
* * ✅ Remove these methods: set_write_channel, ensure_output_channel, add_output_channel, close_output_channel @@ -105,45 +105,20 @@ use WordPress\AsyncHttp\Client; use WordPress\AsyncHttp\Request; -class ProcessManager { - - static private $last_pid = 1; - static private $process_table = []; - static private $reaped_pids = []; - - static public function spawn($factory_or_process, $stdin=null, $stdout=null, $stderr=null) { - $process = $factory_or_process instanceof Process ? $factory_or_process : $factory_or_process(); - $process->stdin = $stdin ?? new MultiChannelPipe(); - $process->stdout = $stdout ?? new MultiChannelPipe(); - $process->stderr = $stderr ?? new BufferPipe(); - $process->pid = self::$last_pid++; - $process->init(); - self::$process_table[$process->pid] = $process; - return $process; - } - - static public function kill($pid, $code) { - self::$process_table[$pid]->kill($code); - } - - static public function reap($pid) { - self::$reaped_pids[] = $pid; - self::$process_table[$pid]->cleanup(); - unset(self::$process_table[$pid]); - } - - static public function is_reaped($pid) { - return in_array($pid, self::$reaped_pids); - } - -} - abstract class Process { - public ?int $exit_code = null; + private ?int $exit_code = null; + private bool $is_reaped = false; public Pipe $stdin; public Pipe $stdout; public Pipe $stderr; - public $pid; + + public function __construct($stdin=null, $stdout=null, $stderr=null) + { + $this->stdin = $stdin ?? new MultiChannelPipe(); + $this->stdout = $stdout ?? new MultiChannelPipe(); + $this->stderr = $stderr ?? 
new BufferPipe(); + $this->init(); + } public function run() { @@ -171,6 +146,21 @@ public function kill($code) { $this->stderr->close(); } + public function reap() + { + if($this->is_alive()) { + return false; + } + $this->is_reaped = true; + $this->cleanup(); + return true; + } + + public function is_reaped() + { + return $this->is_reaped; + } + public function has_crashed() { return $this->exit_code !== null && $this->exit_code !== 0; } @@ -179,10 +169,10 @@ public function is_alive() { return $this->exit_code === null; } - public function init() { + protected function init() { } - public function cleanup() { + protected function cleanup() { // clean up resources } @@ -470,6 +460,7 @@ static public function stream($callback) { private function __construct($callback) { $this->callback = $callback; + parent::__construct(); } protected function transform($data, $tick_context) { @@ -485,6 +476,7 @@ class Demultiplexer extends Process { private $last_subprocess = []; public function __construct($process_factory) { $this->process_factory = $process_factory; + parent::__construct(); } protected function do_tick($tick_context) { @@ -502,9 +494,8 @@ protected function do_tick($tick_context) { $input_channel = $this->stdin->get_current_channel(); if (!isset($this->subprocesses[$input_channel])) { $this->stdout->add_channel($input_channel); - $this->subprocesses[$input_channel] = ProcessManager::spawn( - $this->process_factory - ); + $factory = $this->process_factory; + $this->subprocesses[$input_channel] = $factory(); } $subprocess = $this->subprocesses[$input_channel]; @@ -553,7 +544,7 @@ static public function stream() { return fn () => new Demultiplexer(fn() => new ZipReaderProcess()); } - public function init() { + protected function init() { $this->reader = new ZipStreamReader(''); } @@ -647,9 +638,10 @@ class ProcessChain extends Process { public function __construct($process_factories) { $this->process_factories = $process_factories; + parent::__construct(); } - 
public function init() { + protected function init() { $last_process = null; $names = array_keys($this->process_factories); foreach($names as $k => $name) { @@ -659,10 +651,10 @@ public function init() { $processes = array_values($this->process_factories); for($i = 0; $i < count($this->process_factories); $i++) { $factory = $processes[$i]; - $subprocess = ProcessManager::spawn( - $factory, - null !== $last_process ?$last_process->stdout : null - ); + $subprocess = $factory(); + if(null !== $last_process) { + $subprocess->stdin = $last_process->stdout; + } $this->subprocesses[$names[$i]] = $subprocess; $last_process = $subprocess; } @@ -690,8 +682,8 @@ protected function do_tick($tick_context) { } if($process->has_crashed()) { - if (!ProcessManager::is_reaped($process->pid)) { - ProcessManager::reap($process->pid); + if (!$process->is_reaped()) { + $process->reap(); $this->stderr->write("Process $name has crashed with code {$process->exit_code}", [ 'type' => 'crash', 'process' => $process, @@ -741,6 +733,8 @@ static public function stream($requests) { private function __construct( $requests ) { $this->client = new Client(); $this->client->enqueue( $requests ); + + parent::__construct(); } protected function do_tick($tick_context) @@ -791,6 +785,7 @@ static public function stream($node_visitor_callback) { private function __construct( $node_visitor_callback ) { $this->xml_processor = new WP_XML_Processor( '', [], WP_XML_Processor::IN_PROLOG_CONTEXT ); $this->node_visitor_callback = $node_visitor_callback; + parent::__construct(); } protected function transform($data, $tick_context) @@ -866,31 +861,30 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { require __DIR__ . 
'/bootstrap.php'; -$process = ProcessManager::spawn( - new ProcessChain([ - HttpClientProcess::stream([ - new Request('http://127.0.0.1:9864/export.wxr.zip'), - // Bad request, will fail: - new Request('http://127.0.0.1:9865'), - ]), - - 'zip' => ZipReaderProcess::stream(), - CallbackProcess::stream(function ($data, $context, $process) { - if($context['zip']['file_id'] === 'content.xml') { - $context['zip']->skip_file('content.xml'); - return null; - } - return $data; - }), - XMLProcess::stream($rewrite_links_in_wxr_node), - Uppercaser::stream(), +$process = new ProcessChain([ + HttpClientProcess::stream([ + new Request('http://127.0.0.1:9864/export.wxr.zip'), + // Bad request, will fail: + new Request('http://127.0.0.1:9865'), ]), -); + + 'zip' => ZipReaderProcess::stream(), + CallbackProcess::stream(function ($data, $context, $process) { + if ($context['zip']['file_id'] === 'content.xml') { + $context['zip']->skip_file('content.xml'); + return null; + } + return $data; + }), + XMLProcess::stream($rewrite_links_in_wxr_node), + Uppercaser::stream(), +]); $process->stdout = new FilePipe('php://stdout', 'w'); $process->stderr = new FilePipe('php://stderr', 'w'); $process->run(); function log_process_chain_errors($process) { + return; if(!($process->stderr instanceof BufferPipe)) { return; } From b8804f495d05cf04b02b80eb930fb4d25506b2b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 23 Jul 2024 00:21:57 +0200 Subject: [PATCH 28/72] =?UTF-8?q?Simplify=20channel=20management=20?= =?UTF-8?q?=E2=80=93=20use=20the=20'channel'=20metadata=20parameter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pipes-unix.php | 96 +++++++++++++++++++++++--------------------------- 1 file changed, 44 insertions(+), 52 deletions(-) diff --git a/pipes-unix.php b/pipes-unix.php index 522e5a4..c000677 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -117,7 +117,6 @@ public function __construct($stdin=null, 
$stdout=null, $stderr=null) $this->stdin = $stdin ?? new MultiChannelPipe(); $this->stdout = $stdout ?? new MultiChannelPipe(); $this->stderr = $stderr ?? new BufferPipe(); - $this->init(); } public function run() @@ -169,9 +168,6 @@ public function is_alive() { return $this->exit_code === null; } - protected function init() { - } - protected function cleanup() { // clean up resources } @@ -199,10 +195,6 @@ protected function do_tick($tick_context) { if (null === $transformed || false === $transformed) { break; } - if (!$this->stdout->has_channel($this->stdin->get_current_channel())) { - $this->stdout->add_channel($this->stdin->get_current_channel()); - } - $this->stdout->set_write_channel($this->stdin->get_current_channel()); $this->stdout->write($transformed, $this->stdin->get_metadata()); } } @@ -327,7 +319,6 @@ class MultiChannelPipe implements Pipe { public $metadata; private array $channels = []; private ?string $last_read_channel = 'default'; - private ?string $current_channel = 'default'; public function __construct() { @@ -354,7 +345,7 @@ public function read() { if ($data === false || $data === null) { continue; } - $this->last_read_channel = $this->current_channel = $channel_name; + $this->last_read_channel = $channel_name; $this->metadata = $this->channels[$channel_name]->get_metadata(); return $data; } @@ -391,33 +382,33 @@ public function get_metadata() { } public function write(string $data, $metadata = null) { - if (!isset($this->channels[$this->current_channel])) { - return false; - } + $current_channel = 'default'; - $this->channels[$this->current_channel]->write($data, $metadata); - return true; - } + if(is_array($metadata) && isset($metadata['channel'])) { + $current_channel = $metadata['channel']; + } - public function close_channel($channel_name) - { - $this->current_channel = null; - return $this->channels[$channel_name]->close(); - } + if (!isset($this->channels[$current_channel])) { + $this->channels[$current_channel] = new BufferPipe(); + 
} - public function set_write_channel($name) - { - $this->current_channel = $name; + return $this->channels[$current_channel]->write($data, $metadata); } - public function has_channel($name) + public function ensure_channel($channel_name) { - return isset($this->channels[$name]); + if (isset($this->channels[$channel_name])) { + return false; + } + $this->channels[$channel_name] = new BufferPipe(); } - public function get_current_channel() + public function close_channel($channel_name) { - return $this->current_channel; + if (!isset($this->channels[$channel_name])) { + return false; + } + return $this->channels[$channel_name]->close(); } public function get_channel_pipe($index) @@ -491,9 +482,9 @@ protected function do_tick($tick_context) { break; } - $input_channel = $this->stdin->get_current_channel(); + $metadata = $this->stdin->get_metadata(); + $input_channel = is_array($metadata) && !empty( $metadata['channel'] ) ? $metadata['channel'] : 'default'; if (!isset($this->subprocesses[$input_channel])) { - $this->stdout->add_channel($input_channel); $factory = $this->process_factory; $this->subprocesses[$input_channel] = $factory(); } @@ -505,8 +496,10 @@ protected function do_tick($tick_context) { $output = $subprocess->stdout->read(); if (null !== $output && false !== $output) { - $this->stdout->set_write_channel($input_channel); - $this->stdout->write($output, $subprocess->stdout->get_metadata()); + $this->stdout->write($output, array_merge( + $subprocess->stdout->get_metadata() ?? 
[], + ['channel' => $input_channel] + )); } if (!$subprocess->is_alive()) { @@ -544,7 +537,8 @@ static public function stream() { return fn () => new Demultiplexer(fn() => new ZipReaderProcess()); } - protected function init() { + protected function __construct() { + parent::__construct(); $this->reader = new ZipStreamReader(''); } @@ -573,12 +567,9 @@ protected function do_tick($tick_context) { if ($this->last_skipped_file === $file_path) { break; } - if (!$this->stdout->has_channel($file_path)) { - $this->stdout->add_channel($file_path); - } - $this->stdout->set_write_channel($file_path); $this->stdout->write($this->reader->get_file_body_chunk(), [ - 'file_id' => $file_path + 'file_id' => $file_path, + 'channel' => $file_path, ]); break; } @@ -630,26 +621,22 @@ public function skip_file($file_id) } class ProcessChain extends Process { - public array $process_factories; private $first_subprocess; private $last_subprocess; public $subprocesses = []; private $reaped_pids = []; public function __construct($process_factories) { - $this->process_factories = $process_factories; parent::__construct(); - } - protected function init() { $last_process = null; - $names = array_keys($this->process_factories); + $names = array_keys($process_factories); foreach($names as $k => $name) { $names[$k] = $name . 
''; } - $processes = array_values($this->process_factories); - for($i = 0; $i < count($this->process_factories); $i++) { + $processes = array_values($process_factories); + for($i = 0; $i < count($process_factories); $i++) { $factory = $processes[$i]; $subprocess = $factory(); if(null !== $last_process) { @@ -660,7 +647,7 @@ protected function init() { } $this->first_subprocess = $this->subprocesses[$names[0]]; - $this->last_subprocess = $this->subprocesses[$names[count($this->process_factories) - 1]]; + $this->last_subprocess = $this->subprocesses[$names[count($process_factories) - 1]]; } protected function do_tick($tick_context) { @@ -734,7 +721,16 @@ private function __construct( $requests ) { $this->client = new Client(); $this->client->enqueue( $requests ); - parent::__construct(); + parent::__construct(); + + // Pre-open all output channels to ensure the stdout stream + // stays open until all the requests conclude. Otherwise, + // we could have a window of time when some requests are done, + // others haven't started outputting yet, and the stdout stream + // is considered EOF. + foreach($requests as $request) { + $this->stdout->ensure_channel('request_' . $request->id); + } } protected function do_tick($tick_context) @@ -746,14 +742,10 @@ protected function do_tick($tick_context) $request = $this->client->get_request(); $output_channel = 'request_' . 
$request->id; - if (!$this->stdout->has_channel($output_channel)) { - $this->stdout->add_channel($output_channel); - } - $this->stdout->set_write_channel($output_channel); - switch ( $this->client->get_event() ) { case Client::EVENT_BODY_CHUNK_AVAILABLE: $this->stdout->write($this->client->get_response_body_chunk(), [ + 'channel' => $output_channel, 'request' => $request ]); break; From fb518072d20362a47ac4d42be31074d822be31e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 23 Jul 2024 01:19:34 +0200 Subject: [PATCH 29/72] Process the entire stdin and stdout at each stage --- pipes-unix.php | 91 ++++++++++++++++++++++++++++---------------------- 1 file changed, 52 insertions(+), 39 deletions(-) diff --git a/pipes-unix.php b/pipes-unix.php index c000677..b5f3c05 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -317,14 +317,10 @@ public function __construct($filename, $mode) { */ class MultiChannelPipe implements Pipe { public $metadata; + private $used = false; private array $channels = []; private ?string $last_read_channel = 'default'; - public function __construct() - { - $this->add_channel('default'); - } - public function add_channel(string $name, $pipe = null) { if(isset($this->channels[$name])) { return false; @@ -382,6 +378,7 @@ public function get_metadata() { } public function write(string $data, $metadata = null) { + $this->used = true; $current_channel = 'default'; if(is_array($metadata) && isset($metadata['channel'])) { @@ -403,6 +400,14 @@ public function ensure_channel($channel_name) $this->channels[$channel_name] = new BufferPipe(); } + public function is_channel_eof($channel_name) + { + if (!isset($this->channels[$channel_name])) { + return false; + } + return $this->channels[$channel_name]->is_eof(); + } + public function close_channel($channel_name) { if (!isset($this->channels[$channel_name])) { @@ -417,6 +422,9 @@ public function get_channel_pipe($index) } public function is_eof() { + if(!$this->used) { + return 
false; + } foreach ($this->channels as $pipe) { if (!$pipe->is_eof()) { return false; @@ -426,6 +434,7 @@ public function is_eof() { } public function close() { + $this->used = true; foreach ($this->channels as $pipe) { $pipe->close(); } @@ -479,7 +488,7 @@ protected function do_tick($tick_context) { while (true) { $next_chunk = $this->stdin->read(); if (null === $next_chunk || false === $next_chunk) { - break; + return; } $metadata = $this->stdin->get_metadata(); @@ -490,16 +499,23 @@ protected function do_tick($tick_context) { } $subprocess = $this->subprocesses[$input_channel]; - $subprocess->stdin->write($next_chunk, $this->stdin->get_metadata()); + $subprocess->stdin->write($next_chunk, $metadata); $subprocess->tick($tick_context); $this->last_subprocess = $subprocess; - $output = $subprocess->stdout->read(); - if (null !== $output && false !== $output) { - $this->stdout->write($output, array_merge( + while (true) { + $output = $subprocess->stdout->read(); + if (null === $output || false === $output) { + break; + } + $chunk_metadata = array_merge( + ['channel' => $input_channel], $subprocess->stdout->get_metadata() ?? [], - ['channel' => $input_channel] - )); + ); + $this->stdout->write($output, $chunk_metadata); + if($subprocess->stdout->is_channel_eof($chunk_metadata['channel'])) { + $this->stdout->close_channel($chunk_metadata['channel']); + } } if (!$subprocess->is_alive()) { @@ -512,7 +528,6 @@ protected function do_tick($tick_context) { ] ); } - $this->stdout->close_channel($input_channel); } } } @@ -559,6 +574,9 @@ protected function do_tick($tick_context) { break; } + $input_metadata = $this->stdin->get_metadata(); + $input_channel = is_array($input_metadata) && !empty($input_metadata['channel']) ? 
$input_metadata['channel'] : 'default'; + $this->reader->append_bytes($bytes); while ($this->reader->next()) { switch ($this->reader->get_state()) { @@ -569,7 +587,7 @@ protected function do_tick($tick_context) { } $this->stdout->write($this->reader->get_file_body_chunk(), [ 'file_id' => $file_path, - 'channel' => $file_path, + 'channel' => $file_path //$input_channel, ]); break; } @@ -651,10 +669,14 @@ public function __construct($process_factories) { } protected function do_tick($tick_context) { - $data = $this->stdin->read(); - if (null !== $data && false !== $data) { + while(true) { + $data = $this->stdin->read(); + if (null === $data || false === $data) { + break; + } $this->first_subprocess->stdin->write($data, $this->stdin->get_metadata()); } + if($this->stdin->is_eof()) { $this->first_subprocess->stdin->close(); } @@ -694,8 +716,11 @@ protected function do_tick($tick_context) { } } - $data = $this->last_subprocess->stdout->read(); - if(null !== $data && false !== $data) { + while (true) { + $data = $this->last_subprocess->stdout->read(); + if (null === $data || false === $data) { + break; + } $this->stdout->write($data, $tick_context); } @@ -857,18 +882,18 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { HttpClientProcess::stream([ new Request('http://127.0.0.1:9864/export.wxr.zip'), // Bad request, will fail: - new Request('http://127.0.0.1:9865'), + // new Request('http://127.0.0.1:9865'), ]), 'zip' => ZipReaderProcess::stream(), - CallbackProcess::stream(function ($data, $context, $process) { - if ($context['zip']['file_id'] === 'content.xml') { - $context['zip']->skip_file('content.xml'); - return null; - } - return $data; - }), - XMLProcess::stream($rewrite_links_in_wxr_node), + // CallbackProcess::stream(function ($data, $context, $process) { + // if ($context['zip']['file_id'] === 'content.xml') { + // $context['zip']->skip_file('content.xml'); + // return null; + // } + // return $data; + // }), + 'xml' => 
XMLProcess::stream($rewrite_links_in_wxr_node), Uppercaser::stream(), ]); $process->stdout = new FilePipe('php://stdout', 'w'); @@ -893,15 +918,3 @@ function log_process_chain_errors($process) { } } } - -// $process->tick([]); -// var_dump($process->stdout->read()); - -// var_dump($process->stdout->get_metadata()); -// $process->tick([]); -// var_dump($process->stdout->get_metadata()); -// var_dump($process->stderr->read()); -// $process->tick([]); -// echo $process->stdout->read(); -// var_dump($process->stdout->is_eof()); -// var_dump($process->is_alive()); \ No newline at end of file From cc80840adbcafffe440d8aee34f571527f1e0b61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 23 Jul 2024 01:26:20 +0200 Subject: [PATCH 30/72] Use fresh stream metadata on each tick --- pipes-unix.php | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/pipes-unix.php b/pipes-unix.php index b5f3c05..557d890 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -625,9 +625,7 @@ public function __construct($process) public function get_metadata() { - if(null === $this->data) { - $this->data = $this->process->stdout->get_metadata(); - } + $this->data = $this->process->stdout->get_metadata(); return $this->data; } @@ -882,17 +880,17 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { HttpClientProcess::stream([ new Request('http://127.0.0.1:9864/export.wxr.zip'), // Bad request, will fail: - // new Request('http://127.0.0.1:9865'), + new Request('http://127.0.0.1:9865'), ]), 'zip' => ZipReaderProcess::stream(), - // CallbackProcess::stream(function ($data, $context, $process) { - // if ($context['zip']['file_id'] === 'content.xml') { - // $context['zip']->skip_file('content.xml'); - // return null; - // } - // return $data; - // }), + CallbackProcess::stream(function ($data, $context, $process) { + if ($context['zip']['file_id'] === 'export.wxr') { + $context['zip']->skip_file('export.wxr'); + return null; + } + 
return $data; + }), 'xml' => XMLProcess::stream($rewrite_links_in_wxr_node), Uppercaser::stream(), ]); From ba472440be7e90b5a2ee1b1053b973cf3a7d5348 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 23 Jul 2024 01:27:26 +0200 Subject: [PATCH 31/72] MultiChannelPipe: overwrite metadata on write --- pipes-unix.php | 1 + 1 file changed, 1 insertion(+) diff --git a/pipes-unix.php b/pipes-unix.php index 557d890..0ef1724 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -389,6 +389,7 @@ public function write(string $data, $metadata = null) { $this->channels[$current_channel] = new BufferPipe(); } + $this->metadata = $metadata; return $this->channels[$current_channel]->write($data, $metadata); } From bf1145e97f837d75b5e129a582f83da23d202b95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 23 Jul 2024 01:32:54 +0200 Subject: [PATCH 32/72] Document the channel_id choice used in ZipReaderProcess --- pipes-unix.php | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/pipes-unix.php b/pipes-unix.php index 0ef1724..f6f8962 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -588,7 +588,21 @@ protected function do_tick($tick_context) { } $this->stdout->write($this->reader->get_file_body_chunk(), [ 'file_id' => $file_path, - 'channel' => $file_path //$input_channel, + // We don't want any single chunk to contain mixed bytes from + // multiple files. + // + // Therefore, we must either: + // + // * Use a separate channel for each file to have distinct + // buckets that don't mix. + // * Use a single channel and ensure the unzipped file is fully + // written and consumed before we start writing the next file. + // + // The second option requires more implementation complexity and also + // requires checking whether the output pipe has been read completely + // which is very specific to a BufferPipe. The first option seems simpler + // so let's go with that. 
+ 'channel' => $file_path, ]); break; } From 9e22a230db741a280cdc16ba39490402891f910d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 23 Jul 2024 11:54:27 +0200 Subject: [PATCH 33/72] Process all downstream chunks before pulling more upstream chunks --- pipes-unix.php | 472 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 309 insertions(+), 163 deletions(-) diff --git a/pipes-unix.php b/pipes-unix.php index f6f8962..5b364a5 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -15,8 +15,13 @@ * Let's keep stderr after all. * * ✅ Remove these methods: set_write_channel, ensure_output_channel, add_output_channel, close_output_channel * * Explore semantic updates to metadata: - * * Exposing metadata on a stream instance instead of a pipe + * * Exposing metadata on a stream instance instead of a pipe. + * ^ With the new "execution stack" model, this seems like a great approach. + * $context['zip'] wouldn't be an abstract metadata array, but the actual ZipStreamReader instance + * with all the methods and properties available. * * Not writing bytes to a pipe but writing a new Chunk($bytes, $metadata) object to tightly couple the two + * ^ the problem with this is that methods like `skip_file()` affect the currently processed file and we + * must call them at the right time * * Demultiplexing modes: per input channel, per $metadata['file_id']. * * Figure out interop Pipe and MultiChannelPipe – they are not interchangeable. Maybe * we could use metadata to pass the channel name, and the regular pipe would ignore it? @@ -30,7 +35,9 @@ * do processing, ant only then write to A.1 stdin. Still, a better error reporting wouldn't hurt. * * Declare `bool` return type everywhere where it's missing. We may even remove it later for PHP BC, * but let's still add it for a moment just to make sure we're not missing any typed return. - * * Should Process::tick() return a boolean? Or is it fine if it doesn't return anything? 
+ * * ✅ Should Process::tick() return a boolean? Or is it fine if it doesn't return anything? + * It now returns either "true", which means "I've produced output", or "false", which means + * "I haven't produced output". * * Pipe::read() returns a string on success, false on failure, or null if there were no writes * since the last read and we'd just return an empty string. This three-state semantics is useful, * but it's painful to always check for false and null, and then it may not interop well with @@ -123,14 +130,12 @@ public function run() { do { $this->tick(); - // @TODO: Implement error handling - log_process_chain_errors($this); } while ($this->is_alive()); } public function tick($tick_context=null) { if(!$this->is_alive()) { - return; + return false; } return $this->do_tick($tick_context ?? []); @@ -183,20 +188,21 @@ abstract class TransformProcess extends Process { protected function do_tick($tick_context) { if($this->stdin->is_eof()) { $this->kill(0); - return; + return false; } - while (true) { - $data = $this->stdin->read(); - if (null === $data || false === $data) { - break; - } - $transformed = $this->transform($data, $tick_context); - if (null === $transformed || false === $transformed) { - break; - } - $this->stdout->write($transformed, $this->stdin->get_metadata()); + $data = $this->stdin->read(); + if (null === $data || false === $data) { + return false; } + + $transformed = $this->transform($data, $tick_context); + if (null === $transformed || false === $transformed) { + return false; + } + + $this->stdout->write($transformed, $this->stdin->get_metadata()); + return true; } abstract protected function transform($data, $tick_context); @@ -474,63 +480,82 @@ class Demultiplexer extends Process { private $process_factory = []; public $subprocesses = []; private $killed_subprocesses = []; - private $last_subprocess = []; + private $demux_queue = []; + private $last_subprocess; + private $last_input_channel; + public function 
__construct($process_factory) { $this->process_factory = $process_factory; parent::__construct(); } protected function do_tick($tick_context) { - if($this->stdin->is_eof()) { + if(true === $this->tick_last_subprocess()) { + return true; + } + + if($this->stdin->is_eof() || $this->stdout->is_eof()) { $this->kill(0); - return; + return false; } - while (true) { - $next_chunk = $this->stdin->read(); - if (null === $next_chunk || false === $next_chunk) { - return; - } + $next_chunk = $this->stdin->read(); + if (null === $next_chunk || false === $next_chunk) { + return false; + } - $metadata = $this->stdin->get_metadata(); - $input_channel = is_array($metadata) && !empty( $metadata['channel'] ) ? $metadata['channel'] : 'default'; - if (!isset($this->subprocesses[$input_channel])) { - $factory = $this->process_factory; - $this->subprocesses[$input_channel] = $factory(); - } + $metadata = $this->stdin->get_metadata(); + $input_channel = is_array($metadata) && !empty( $metadata['channel'] ) ? $metadata['channel'] : 'default'; + $this->last_input_channel = $input_channel; + if (!isset($this->subprocesses[$input_channel])) { + $factory = $this->process_factory; + $this->subprocesses[$input_channel] = $factory(); + } - $subprocess = $this->subprocesses[$input_channel]; - $subprocess->stdin->write($next_chunk, $metadata); - $subprocess->tick($tick_context); - $this->last_subprocess = $subprocess; + $subprocess = $this->subprocesses[$input_channel]; + $subprocess->stdin->write($next_chunk, $metadata); + $this->last_subprocess = $subprocess; - while (true) { - $output = $subprocess->stdout->read(); - if (null === $output || false === $output) { - break; - } - $chunk_metadata = array_merge( - ['channel' => $input_channel], - $subprocess->stdout->get_metadata() ?? 
[], - ); - $this->stdout->write($output, $chunk_metadata); - if($subprocess->stdout->is_channel_eof($chunk_metadata['channel'])) { - $this->stdout->close_channel($chunk_metadata['channel']); - } + return $this->tick_last_subprocess(); + } + + private function tick_last_subprocess() + { + $subprocess = $this->last_subprocess; + if(!$subprocess) { + return false; + } + + if(false === $subprocess->tick()) { + return false; + } + + $output = $subprocess->stdout->read(); + if (null !== $output && false !== $output) { + $chunk_metadata = array_merge( + ['channel' => $this->last_input_channel], + $subprocess->stdout->get_metadata() ?? [], + ); + $this->stdout->write($output, $chunk_metadata); + if ($subprocess->stdout->is_channel_eof($chunk_metadata['channel'])) { + $this->stdout->close_channel($chunk_metadata['channel']); } + return true; + } - if (!$subprocess->is_alive()) { - if ($subprocess->has_crashed()) { - $this->stderr->write( - "Subprocess $input_channel has crashed with code {$subprocess->exit_code}", - [ - 'type' => 'crash', - 'process' => $subprocess, - ] - ); - } + if (!$subprocess->is_alive()) { + if ($subprocess->has_crashed()) { + $this->stderr->write( + "Subprocess $this->last_input_channel has crashed with code {$subprocess->exit_code}", + [ + 'type' => 'crash', + 'process' => $subprocess, + ] + ); } } + + return false; } public function skip_file($file_id) @@ -564,56 +589,62 @@ public function skip_file($file_id) } protected function do_tick($tick_context) { + if(true === $this->process_buffered_data()) { + return true; + } + if($this->stdin->is_eof()) { $this->kill(0); - return; + return false; } - while (true) { - $bytes = $this->stdin->read(); - if (null === $bytes || false === $bytes) { - break; - } + $bytes = $this->stdin->read(); + if (null === $bytes || false === $bytes) { + return false; + } - $input_metadata = $this->stdin->get_metadata(); - $input_channel = is_array($input_metadata) && !empty($input_metadata['channel']) ? 
$input_metadata['channel'] : 'default'; - - $this->reader->append_bytes($bytes); - while ($this->reader->next()) { - switch ($this->reader->get_state()) { - case ZipStreamReader::STATE_FILE_ENTRY: - $file_path = $this->reader->get_file_path(); - if ($this->last_skipped_file === $file_path) { - break; - } - $this->stdout->write($this->reader->get_file_body_chunk(), [ - 'file_id' => $file_path, - // We don't want any single chunk to contain mixed bytes from - // multiple files. - // - // Therefore, we must either: - // - // * Use a separate channel for each file to have distinct - // buckets that don't mix. - // * Use a single channel and ensure the unzipped file is fully - // written and consumed before we start writing the next file. - // - // The second option requires more implementation complexity and also - // requires checking whether the output pipe has been read completely - // which is very specific to a BufferPipe. The first option seems simpler - // so let's go with that. - 'channel' => $file_path, - ]); - break; - } + $this->reader->append_bytes($bytes); + return $this->process_buffered_data(); + } + + protected function process_buffered_data() + { + while ($this->reader->next()) { + switch ($this->reader->get_state()) { + case ZipStreamReader::STATE_FILE_ENTRY: + $file_path = $this->reader->get_file_path(); + if ($this->last_skipped_file === $file_path) { + // break; + } + $this->stdout->write($this->reader->get_file_body_chunk(), [ + 'file_id' => $file_path, + // We don't want any single chunk to contain mixed bytes from + // multiple files. + // + // Therefore, we must either: + // + // * Use a separate channel for each file to have distinct + // buckets that don't mix. + // * Use a single channel and ensure the unzipped file is fully + // written and consumed before we start writing the next file. 
+ // + // The second option requires more implementation complexity and also + // requires checking whether the output pipe has been read completely + // which is very specific to a BufferPipe. The first option seems simpler + // so let's go with that. + 'channel' => $file_path, + ]); + return true; } } + + return false; } } class TickContext implements ArrayAccess { private $data; - private $process; + public $process; public function offsetExists($offset): bool { $this->get_metadata(); @@ -655,15 +686,18 @@ class ProcessChain extends Process { private $first_subprocess; private $last_subprocess; public $subprocesses = []; + public $subprocesses_names = []; private $reaped_pids = []; + private $execution_stack = []; + private $tick_context = []; public function __construct($process_factories) { parent::__construct(); $last_process = null; - $names = array_keys($process_factories); - foreach($names as $k => $name) { - $names[$k] = $name . ''; + $this->subprocesses_names = array_keys($process_factories); + foreach($this->subprocesses_names as $k => $name) { + $this->subprocesses_names[$k] = $name . ''; } $processes = array_values($process_factories); @@ -673,15 +707,44 @@ public function __construct($process_factories) { if(null !== $last_process) { $subprocess->stdin = $last_process->stdout; } - $this->subprocesses[$names[$i]] = $subprocess; + $this->subprocesses[$this->subprocesses_names[$i]] = $subprocess; $last_process = $subprocess; } - $this->first_subprocess = $this->subprocesses[$names[0]]; - $this->last_subprocess = $this->subprocesses[$names[count($process_factories) - 1]]; + $this->first_subprocess = $this->subprocesses[$this->subprocesses_names[0]]; + $this->last_subprocess = $this->subprocesses[$this->subprocesses_names[count($process_factories) - 1]]; } + /** + * ## Process chain tick + * + * Pushes data through a chain of subprocesses. Every downstream data chunk + * is fully processed before asking for more chunks upstream. 
+ * + * For example, suppose we: + * + * * Send 3 HTTP requests, and each of them produces a ZIP file + * * Each ZIP file has 3 XML files inside + * * Each XML file is rewritten using the XML_Processor + * + * Once the HTTP client has produced the first ZIP file, we start processing it. + * The ZIP decoder may already have enough data to unzip three files, but we only + * produce the first chunk of the first file and pass it to the XML processor. + * Then we handle the second chunk of the first file, and so on, until the first + * file is fully processed. Only then we move to the second file. + * + * Then, once the ZIP decoder exhausted the data for the first ZIP file, we move + * to the second ZIP file, and so on. + * + * This way we can maintain a predictable $context variable that carries upstream + * metadata and exposes methods like skip_file(). + */ protected function do_tick($tick_context) { + if($this->last_subprocess->stdout->is_eof()) { + $this->kill(0); + return false; + } + while(true) { $data = $this->stdin->read(); if (null === $data || false === $data) { @@ -694,28 +757,78 @@ protected function do_tick($tick_context) { $this->first_subprocess->stdin->close(); } - foreach ($this->subprocesses as $name => $process) { - if ($process->is_alive()) { - $process->tick($tick_context); + if(empty($this->execution_stack)) { + array_push($this->execution_stack, $this->first_subprocess); + } + + while (count($this->execution_stack)) { + // Unpeel the context stack until we find a process that + // produces output. 
+ $process = $this->pop_process(); + if ($process->stdout->is_eof()) { + continue; } - if(!$process->stdout->is_eof()) { - $tick_context[$name] = new TickContext($process); + if(true !== $this->tick_subprocess($process)) { + continue; } - if($process->has_crashed()) { - if (!$process->is_reaped()) { - $process->reap(); - $this->stderr->write("Process $name has crashed with code {$process->exit_code}", [ - 'type' => 'crash', - 'process' => $process, - 'reaped' => true, - ]); - return; + // We've got output from the process, yay! Let's + // propagate it downstream. + $this->push_process($process); + + for ($i = count($this->execution_stack); $i < count($this->subprocesses_names); $i++) { + $next_process = $this->subprocesses[$this->subprocesses_names[$i]]; + if (true !== $this->tick_subprocess($next_process)) { + break; } - continue; + $this->push_process($next_process); + } + + // When the last process in the chain produces output, + // we write it to the stdout pipe and bale. + $data = $this->last_subprocess->stdout->read(); + if (null === $data || false === $data) { + break; } + $this->stdout->write($data, $this->tick_context); + return true; + } + + // We produced no output and the upstream pipe is EOF. + // We're done. 
+ if(!$this->first_subprocess->is_alive()) { + $this->kill(0); + } + + return false; + } + + private function pop_process() + { + $name = $this->subprocesses_names[count($this->execution_stack) - 1]; + unset($this->tick_context[$name]); + return array_pop($this->execution_stack); + } + + private function push_process($process) + { + array_push($this->execution_stack, $process); + $name = $this->subprocesses_names[count($this->execution_stack) - 1]; + $this->tick_context[$name] = new TickContext($process); + } + + private function tick_subprocess($process) + { + $produced_output = $process->tick($this->tick_context); + $this->handle_errors($process); + return $produced_output; + } + + private function handle_errors($process) + { + if(!$process->has_crashed()) { while (true) { $err = $process->stderr->read(); if (null === $err || false === $err) { @@ -729,16 +842,16 @@ protected function do_tick($tick_context) { } } - while (true) { - $data = $this->last_subprocess->stdout->read(); - if (null === $data || false === $data) { - break; + if($process->has_crashed()) { + if (!$process->is_reaped()) { + $process->reap(); + $name = $this->subprocesses_names[array_search($process, $this->subprocesses)]; + $this->stderr->write("Process $name has crashed with code {$process->exit_code}", [ + 'type' => 'crash', + 'process' => $process, + 'reaped' => true, + ]); } - $this->stdout->write($data, $tick_context); - } - - if($this->last_subprocess->stdout->is_eof()) { - $this->kill(0); } } } @@ -773,36 +886,38 @@ private function __construct( $requests ) { protected function do_tick($tick_context) { - if ( ! $this->client->await_next_event() ) { - $this->kill(0); - return false; - } + while($this->client->await_next_event()) { + $request = $this->client->get_request(); + $output_channel = 'request_' . 
$request->id; + switch ($this->client->get_event()) { + case Client::EVENT_BODY_CHUNK_AVAILABLE: + $this->stdout->write($this->client->get_response_body_chunk(), [ + 'channel' => $output_channel, + 'request' => $request + ]); + return true; - $request = $this->client->get_request(); - $output_channel = 'request_' . $request->id; - switch ( $this->client->get_event() ) { - case Client::EVENT_BODY_CHUNK_AVAILABLE: - $this->stdout->write($this->client->get_response_body_chunk(), [ - 'channel' => $output_channel, - 'request' => $request - ]); - break; - case Client::EVENT_FAILED: - $this->stderr->write('Request failed: ' . $request->error, [ - 'request' => $request - ]); - $this->stdout->close_channel($output_channel); - break; - case Client::EVENT_FINISHED: - $this->stdout->close_channel($output_channel); - break; - } + case Client::EVENT_FAILED: + $this->stderr->write('Request failed: ' . $request->error, [ + 'request' => $request + ]); + $this->stdout->close_channel($output_channel); + break; + + case Client::EVENT_FINISHED: + $this->stdout->close_channel($output_channel); + break; + } + } + + $this->kill(0); + return false; } } -class XMLProcess extends TransformProcess { +class XMLProcess extends Process { private $xml_processor; private $node_visitor_callback; @@ -818,34 +933,65 @@ private function __construct( $node_visitor_callback ) { parent::__construct(); } - protected function transform($data, $tick_context) + protected function do_tick($tick_context) { + if(true === $this->process_buffered_data()) { + return true; + } + + if($this->stdin->is_eof()) { + $this->kill(0); + return false; + } + + $bytes = $this->stdin->read(); + if (null === $bytes || false === $bytes) { + return false; + } + + $this->xml_processor->stream_append_xml($bytes); + return $this->process_buffered_data(); + } + + private function process_buffered_data() { - $processor = $this->xml_processor; - if ( $processor->get_last_error() ) { + 
if($this->xml_processor->paused_at_incomplete_token()) { + return false; + } + + if ( $this->xml_processor->get_last_error() ) { $this->kill(1); - $this->stderr->write( $processor->get_last_error() ); + $this->stderr->write( $this->xml_processor->get_last_error() ); return false; } - $processor->stream_append_xml( $data ); - - $tokens_found = 0; - while ( $processor->next_token() ) { + $tokens_found = 0; + while ( $this->xml_processor->next_token() ) { ++ $tokens_found; $node_visitor_callback = $this->node_visitor_callback; - $node_visitor_callback( $processor ); + $node_visitor_callback( $this->xml_processor ); } $buffer = ''; if ( $tokens_found > 0 ) { - $buffer .= $processor->get_updated_xml(); - } else if ( $tokens_found === 0 || ! $processor->paused_at_incomplete_token() ) { - $buffer .= $processor->get_unprocessed_xml(); + $buffer .= $this->xml_processor->get_updated_xml(); + } else if ( + $tokens_found === 0 && + ! $this->xml_processor->paused_at_incomplete_token() && + $this->xml_processor->get_current_depth() === 0 + ) { + // We've reached the end of the document, let's finish up. + $buffer .= $this->xml_processor->get_unprocessed_xml(); + $this->kill(0); } - return $buffer; - } + if(!strlen($buffer)) { + return false; + } + + $this->stdout->write($buffer); + return true; + } } From bf37654bf32ef7224c19670311b9f40afbbb3629 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 23 Jul 2024 12:01:54 +0200 Subject: [PATCH 34/72] Update the note about Pipe and MultiChannelPipe --- pipes-unix.php | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pipes-unix.php b/pipes-unix.php index 5b364a5..2acf60f 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -27,6 +27,9 @@ * we could use metadata to pass the channel name, and the regular pipe would ignore it? * Maybe a MultiChannelPipe would just have special semantics for that metadata field? * And it would keep track of eofs etc using a set of internal Pipe instances? 
+ * ^ Now that each chunk is moved downstream as soon as it's produced, we don't need + * to keep multiple buffers around. The only remaining advantage of a MultiChannelPipe + * is tracking EOF for each channel separately. * * Calling get_metadata() without calling read() first returns the last metadata. This * bit me a few times when I was in a context where I could not call read() first because, * e.g. another process was about to do that. Maybe this is a good thing, as it forces us From e856fbe8a7026299ac3533a58562bb2311ba5419 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 23 Jul 2024 12:06:52 +0200 Subject: [PATCH 35/72] Add more thoughts about the streamable interface --- pipes-unix.php | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pipes-unix.php b/pipes-unix.php index 2acf60f..12a5ade 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -10,6 +10,10 @@ * * The process `do_tick` method typically checks for `stdin->is_eof()` and then * whether `stdin->read()` is valid. Can we simplify this boilerplate somehow? * * Explore a shared "Streamable" interface for all stream processors (HTML, XML, ZIP, HTTP, etc.) + * ^ Would the "Process" have the same interface? A `tick()` seems isomorphic to + * "append_bytes()" call followed by "next()". There's a semantic difference in that + * "append_bytes()" pushes the data, while "tick()" pulls the data, but perhaps the push model + * would work better for asynchronous piping. * * ✅ Get rid of ProcessManager * * ✅ Get rid of stderr. We don't need it to be a stream. A single $error field + bubbling should do. * Let's keep stderr after all. 
From 9057f186816041a758c16b31c326b53b237f7ed4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 23 Jul 2024 12:08:36 +0200 Subject: [PATCH 36/72] Add more thoughts about demultiplexing --- pipes-unix.php | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipes-unix.php b/pipes-unix.php index 12a5ade..4ace4ea 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -26,7 +26,8 @@ * * Not writing bytes to a pipe but writing a new Chunk($bytes, $metadata) object to tightly couple the two * ^ the problem with this is that methods like `skip_file()` affect the currently processed file and we * must call them at the right time - * * Demultiplexing modes: per input channel, per $metadata['file_id']. + * * Demultiplexing modes: per "sequence_id" (e.g. ZIPping a sequence of files), per "file_id" + * (e.g. XML rewriting each file separately, regardless of the chunks order) * * Figure out interop Pipe and MultiChannelPipe – they are not interchangeable. Maybe * we could use metadata to pass the channel name, and the regular pipe would ignore it? * Maybe a MultiChannelPipe would just have special semantics for that metadata field? From 485ab61610fafa033eaeb3fc8850a6443a77c2ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 23 Jul 2024 13:03:23 +0200 Subject: [PATCH 37/72] Make $pipe->read() return true on success and add a $pipe->consume_bytes() method --- pipes-unix.php | 117 ++++++++++++++++++++++++++++++------------------- 1 file changed, 72 insertions(+), 45 deletions(-) diff --git a/pipes-unix.php b/pipes-unix.php index 4ace4ea..87054d1 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -46,10 +46,12 @@ * * ✅ Should Process::tick() return a boolean? Or is it fine if it doesn't return anything? * It now returns either "true", which means "I've produced output", or "false", which means * "I haven't produced output". 
- * * Pipe::read() returns a string on success, false on failure, or null if there were no writes - * since the last read and we'd just return an empty string. This three-state semantics is useful, - * but it's painful to always check for false and null, and then it may not interop well with - * PHP streams where fread() never returns null. Let's think this through some more. + * * ✅ Pipe::read() returns a string on success, false on failure, or null if there were no writes + * since the last read and we'd just return an empty string. This three-state semantics is useful, + * but it's painful to always check for false and null, and then it may not interop well with + * PHP streams where fread() never returns null. Let's think this through some more. + * ^ Pipe::read() now returns true, false, or null. When it returns true, the data is available + * for being consumed via $pipe->consume_bytes(). */ /** @@ -199,12 +201,11 @@ protected function do_tick($tick_context) { return false; } - $data = $this->stdin->read(); - if (null === $data || false === $data) { + if(true !== $this->stdin->read()) { return false; } - $transformed = $this->transform($data, $tick_context); + $transformed = $this->transform($this->stdin->consume_bytes(), $tick_context); if (null === $transformed || false === $transformed) { return false; } @@ -222,6 +223,7 @@ public function read(); public function write(string $data, $metadata=null); public function is_eof(); public function close(); + public function consume_bytes(); public function get_metadata(); } @@ -236,12 +238,20 @@ public function __construct($buffer = null) } public function read() { - $buffer = $this->buffer; - if(!$buffer && $this->closed) { + if(!$this->buffer && $this->closed) { return false; } + if(null === $this->buffer) { + return null; + } + return true; + } + + public function consume_bytes() + { + $bytes = $this->buffer; $this->buffer = null; - return $buffer; + return $bytes; } public function get_metadata() { @@ -271,6 
+281,7 @@ public function close() { class ResourcePipe implements Pipe { public $resource; private bool $closed = false; + private $bytes; public function __construct($resource) { $this->resource = $resource; @@ -293,7 +304,18 @@ public function read() { return null; } - return $data; + if($this->bytes === null) { + $this->bytes = ''; + } + $this->bytes .= $data; + return true; + } + + public function consume_bytes() + { + $bytes = $this->bytes; + $this->bytes = null; + return $bytes; } public function write(string $data, $metadata=null) { @@ -330,7 +352,6 @@ public function __construct($filename, $mode) { * Idea 1: Use multiple pipes to pass multi-band I/O data between processes. */ class MultiChannelPipe implements Pipe { - public $metadata; private $used = false; private array $channels = []; private ?string $last_read_channel = 'default'; @@ -348,21 +369,33 @@ public function read() { return false; } - $this->metadata = null; $channels_to_check = $this->next_channels(); foreach($channels_to_check as $channel_name) { - $data = $this->channels[$channel_name]->read(); - if ($data === false || $data === null) { + if(true !== $this->channels[$channel_name]->read()) { continue; } $this->last_read_channel = $channel_name; - $this->metadata = $this->channels[$channel_name]->get_metadata(); - return $data; + return true; } return null; } + public function consume_bytes() + { + if(!$this->last_read_channel || !isset($this->channels[$this->last_read_channel])) { + return null; + } + return $this->channels[$this->last_read_channel]->consume_bytes(); + } + + public function get_metadata() { + if(!$this->last_read_channel || !isset($this->channels[$this->last_read_channel])) { + return null; + } + return $this->channels[$this->last_read_channel]->get_metadata(); + } + private function next_channels() { $channels_queue = []; $channel_names = array_keys($this->channels); @@ -387,9 +420,6 @@ private function next_channels() { return $channels_queue; } - public function 
get_metadata() { - return $this->metadata; - } public function write(string $data, $metadata = null) { $this->used = true; @@ -403,7 +433,7 @@ public function write(string $data, $metadata = null) { $this->channels[$current_channel] = new BufferPipe(); } - $this->metadata = $metadata; + $this->last_read_channel = $current_channel; return $this->channels[$current_channel]->write($data, $metadata); } @@ -507,11 +537,11 @@ protected function do_tick($tick_context) { return false; } - $next_chunk = $this->stdin->read(); - if (null === $next_chunk || false === $next_chunk) { + if (true !== $this->stdin->read()) { return false; } + $next_chunk = $this->stdin->consume_bytes(); $metadata = $this->stdin->get_metadata(); $input_channel = is_array($metadata) && !empty( $metadata['channel'] ) ? $metadata['channel'] : 'default'; $this->last_input_channel = $input_channel; @@ -538,8 +568,8 @@ private function tick_last_subprocess() return false; } - $output = $subprocess->stdout->read(); - if (null !== $output && false !== $output) { + if (true === $subprocess->stdout->read()) { + $output = $subprocess->stdout->consume_bytes(); $chunk_metadata = array_merge( ['channel' => $this->last_input_channel], $subprocess->stdout->get_metadata() ?? 
[], @@ -606,12 +636,11 @@ protected function do_tick($tick_context) { return false; } - $bytes = $this->stdin->read(); - if (null === $bytes || false === $bytes) { + if(true !== $this->stdin->read()) { return false; } - $this->reader->append_bytes($bytes); + $this->reader->append_bytes($this->stdin->consume_bytes()); return $this->process_buffered_data(); } @@ -754,11 +783,13 @@ protected function do_tick($tick_context) { } while(true) { - $data = $this->stdin->read(); - if (null === $data || false === $data) { + if(true !== $this->stdin->read()) { break; } - $this->first_subprocess->stdin->write($data, $this->stdin->get_metadata()); + $this->first_subprocess->stdin->write( + $this->stdin->consume_bytes(), + $this->stdin->get_metadata() + ); } if($this->stdin->is_eof()) { @@ -795,12 +826,13 @@ protected function do_tick($tick_context) { // When the last process in the chain produces output, // we write it to the stdout pipe and bale. - $data = $this->last_subprocess->stdout->read(); - if (null === $data || false === $data) { + if(true !== $this->last_subprocess->stdout->read()) { break; } - - $this->stdout->write($data, $this->tick_context); + $this->stdout->write( + $this->last_subprocess->stdout->consume_bytes(), + $this->tick_context + ); return true; } @@ -837,15 +869,11 @@ private function tick_subprocess($process) private function handle_errors($process) { if(!$process->has_crashed()) { - while (true) { - $err = $process->stderr->read(); - if (null === $err || false === $err) { - break; - } - $this->stderr->write($err, [ + while ($process->stderr->read()) { + $this->stderr->write($process->stderr->consume_bytes(), [ 'type' => 'error', 'process' => $process, - ...$process->stderr->get_metadata(), + ...($process->stderr->get_metadata() ?? 
[]), ]); } } @@ -951,12 +979,11 @@ protected function do_tick($tick_context) { return false; } - $bytes = $this->stdin->read(); - if (null === $bytes || false === $bytes) { + if(true !== $this->stdin->read()) { return false; } - $this->xml_processor->stream_append_xml($bytes); + $this->xml_processor->stream_append_xml($this->stdin->consume_bytes()); return $this->process_buffered_data(); } From da29e85422f7a9498cf58ff0f8c9f8d9f5ea9154 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 24 Jul 2024 00:43:13 +0200 Subject: [PATCH 38/72] Add return type declarations --- pipes-unix.php | 211 +++++++++++++++++++++++-------------------------- 1 file changed, 99 insertions(+), 112 deletions(-) diff --git a/pipes-unix.php b/pipes-unix.php index 87054d1..b57c5be 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -7,13 +7,17 @@ * I only used it to make the development easier, I got confused with the other attempt in * `pipes.php` and this kept me on track. However, keeping these names will likely confuse others. * * Make Process implement the Iterator interface - * * The process `do_tick` method typically checks for `stdin->is_eof()` and then - * whether `stdin->read()` is valid. Can we simplify this boilerplate somehow? - * * Explore a shared "Streamable" interface for all stream processors (HTML, XML, ZIP, HTTP, etc.) - * ^ Would the "Process" have the same interface? A `tick()` seems isomorphic to - * "append_bytes()" call followed by "next()". There's a semantic difference in that - * "append_bytes()" pushes the data, while "tick()" pulls the data, but perhaps the push model - * would work better for asynchronous piping. + * * ✅ The process `do_tick` method typically checks for `stdin->is_eof()` and then + * whether `stdin->read()` is valid. Can we simplify this boilerplate somehow? + * ^ the BufferProcessor interface solves that problem. + * * ✅ Explore a shared "Streamable" interface for all stream processors (HTML, XML, ZIP, HTTP, etc.) 
+ * ^ Would the "Process" have the same interface? A `tick()` seems isomorphic to + * "append_bytes()" call followed by "next()". There's a semantic difference in that + * "append_bytes()" pushes the data, while "tick()" pulls the data, but perhaps the push model + * would work better for asynchronous piping. + * ^^ A single interface for everything doesn't seem to cut it, but the BufferProcessor interface + * with `read()` and `write($bytes, $metadata)` methods seems to be a good fit for XML, ZIP, HTTP. + * It resembles a Pipe interface, too. I wonder if these "Process" classes could be pipes themselves. * * ✅ Get rid of ProcessManager * * ✅ Get rid of stderr. We don't need it to be a stream. A single $error field + bubbling should do. * Let's keep stderr after all. @@ -35,14 +39,8 @@ * ^ Now that each chunk is moved downstream as soon as it's produced, we don't need * to keep multiple buffers around. The only remaining advantage of a MultiChannelPipe * is tracking EOF for each channel separately. - * * Calling get_metadata() without calling read() first returns the last metadata. This - * bit me a few times when I was in a context where I could not call read() first because, - * e.g. another process was about to do that. Maybe this is a good thing, as it forces us - * to split a pipe in two whenever an intermediate read is involved, e.g. Process A wouldn't - * just connect it's stdin to a subprocess A.1, but it would read from stdin, read metadata, - * do processing, ant only then write to A.1 stdin. Still, a better error reporting wouldn't hurt. - * * Declare `bool` return type everywhere where it's missing. We may even remove it later for PHP BC, - * but let's still add it for a moment just to make sure we're not missing any typed return. + * * ✅ Declare `bool` return type everywhere where it's missing. We may even remove it later for PHP BC, + * but let's still add it for a moment just to make sure we're not missing any typed return. 
* * ✅ Should Process::tick() return a boolean? Or is it fine if it doesn't return anything? * It now returns either "true", which means "I've produced output", or "false", which means * "I haven't produced output". @@ -52,6 +50,15 @@ * PHP streams where fread() never returns null. Let's think this through some more. * ^ Pipe::read() now returns true, false, or null. When it returns true, the data is available * for being consumed via $pipe->consume_bytes(). + * + * Maybe not do these? + * + * * Calling get_metadata() without calling read() first returns the last metadata. This + * bit me a few times when I was in a context where I could not call read() first because, + * e.g. another process was about to do that. Maybe this is a good thing, as it forces us + * to split a pipe in two whenever an intermediate read is involved, e.g. Process A wouldn't + * just connect it's stdin to a subprocess A.1, but it would read from stdin, read metadata, + * do processing, ant only then write to A.1 stdin. Still, a better error reporting wouldn't hurt. */ /** @@ -138,12 +145,12 @@ public function __construct($stdin=null, $stdout=null, $stderr=null) public function run() { - do { + while ($this->is_alive()) { $this->tick(); - } while ($this->is_alive()); + } } - public function tick($tick_context=null) { + public function tick($tick_context=null): bool { if(!$this->is_alive()) { return false; } @@ -151,7 +158,7 @@ public function tick($tick_context=null) { return $this->do_tick($tick_context ?? 
[]); } - abstract protected function do_tick($tick_context); + abstract protected function do_tick($tick_context): bool; public function kill($code) { $this->exit_code = $code; @@ -160,7 +167,7 @@ public function kill($code) { $this->stderr->close(); } - public function reap() + public function reap(): bool { if($this->is_alive()) { return false; @@ -170,16 +177,16 @@ public function reap() return true; } - public function is_reaped() + public function is_reaped(): bool { return $this->is_reaped; } - public function has_crashed() { + public function has_crashed(): bool { return $this->exit_code !== null && $this->exit_code !== 0; } - public function is_alive() { + public function is_alive(): bool { return $this->exit_code === null; } @@ -194,14 +201,39 @@ public function skip_file($file_id) { } -abstract class TransformProcess extends Process { - protected function do_tick($tick_context) { - if($this->stdin->is_eof()) { - $this->kill(0); +abstract class BufferProcessor extends Process +{ + protected function do_tick($tick_context): bool + { + if(true === $this->read()) { + return true; + } + + if (!$this->stdin->read()) { + if ($this->stdin->is_eof()) { + $this->kill(0); + } return false; } - if(true !== $this->stdin->read()) { + $this->write( + $this->stdin->consume_bytes(), + $this->stdin->get_metadata() + ); + + return $this->read(); + } + + abstract protected function write($input_chunk, $metadata); + abstract protected function read(): bool; +} + +abstract class TransformProcess extends Process { + protected function do_tick($tick_context): bool { + if(!$this->stdin->read()) { + if($this->stdin->is_eof()) { + $this->kill(0); + } return false; } @@ -219,9 +251,9 @@ abstract protected function transform($data, $tick_context); } interface Pipe { - public function read(); - public function write(string $data, $metadata=null); - public function is_eof(); + public function read(): ?bool; + public function write(string $data, $metadata=null): bool; + public function 
is_eof(): bool; public function close(); public function consume_bytes(); public function get_metadata(); @@ -237,7 +269,7 @@ public function __construct($buffer = null) $this->buffer = $buffer; } - public function read() { + public function read(): ?bool { if(!$this->buffer && $this->closed) { return false; } @@ -258,7 +290,7 @@ public function get_metadata() { return $this->metadata; } - public function write(string $data, $metadata=null) { + public function write(string $data, $metadata=null): bool { if($this->closed) { return false; } @@ -267,9 +299,10 @@ public function write(string $data, $metadata=null) { } $this->buffer .= $data; $this->metadata = $metadata; + return true; } - public function is_eof() { + public function is_eof(): bool { return null === $this->buffer && $this->closed; } @@ -287,7 +320,7 @@ public function __construct($resource) { $this->resource = $resource; } - public function read() { + public function read(): ?bool { if($this->closed) { return false; } @@ -318,18 +351,19 @@ public function consume_bytes() return $bytes; } - public function write(string $data, $metadata=null) { + public function write(string $data, $metadata=null): bool { if($this->closed) { return false; } fwrite($this->resource, $data); + return true; } public function get_metadata() { return null; } - public function is_eof() { + public function is_eof(): bool { return $this->closed; } @@ -356,15 +390,7 @@ class MultiChannelPipe implements Pipe { private array $channels = []; private ?string $last_read_channel = 'default'; - public function add_channel(string $name, $pipe = null) { - if(isset($this->channels[$name])) { - return false; - } - $this->channels[$name] = $pipe ?? 
new BufferPipe(); - return true; - } - - public function read() { + public function read(): ?bool { if (empty($this->channels)) { return false; } @@ -420,8 +446,7 @@ private function next_channels() { return $channels_queue; } - - public function write(string $data, $metadata = null) { + public function write(string $data, $metadata = null): bool { $this->used = true; $current_channel = 'default'; @@ -461,12 +486,7 @@ public function close_channel($channel_name) return $this->channels[$channel_name]->close(); } - public function get_channel_pipe($index) - { - return $this->channels[$index]; - } - - public function is_eof() { + public function is_eof(): bool { if(!$this->used) { return false; } @@ -486,6 +506,17 @@ public function close() { } } +/** + * Idea 2: Use a single pipe that keeps track of the `channel` and `file_id` metadata. + */ +// class MultiplexedPipe implements Pipe +// { +// public function write(string $data, $metadata = null) { +// $this->channel = $metadata['channel'] ?? 'default'; +// $this->file_id = $metadata['file_id'] ?? 
'default'; +// } +// } + class Uppercaser extends TransformProcess { static public function stream() { @@ -514,7 +545,7 @@ protected function transform($data, $tick_context) { } } -class Demultiplexer extends Process { +class Demultiplexer extends BufferProcessor { private $process_factory = []; public $subprocesses = []; private $killed_subprocesses = []; @@ -527,22 +558,7 @@ public function __construct($process_factory) { parent::__construct(); } - protected function do_tick($tick_context) { - if(true === $this->tick_last_subprocess()) { - return true; - } - - if($this->stdin->is_eof() || $this->stdout->is_eof()) { - $this->kill(0); - return false; - } - - if (true !== $this->stdin->read()) { - return false; - } - - $next_chunk = $this->stdin->consume_bytes(); - $metadata = $this->stdin->get_metadata(); + protected function write($next_chunk, $metadata) { $input_channel = is_array($metadata) && !empty( $metadata['channel'] ) ? $metadata['channel'] : 'default'; $this->last_input_channel = $input_channel; if (!isset($this->subprocesses[$input_channel])) { @@ -553,11 +569,9 @@ protected function do_tick($tick_context) { $subprocess = $this->subprocesses[$input_channel]; $subprocess->stdin->write($next_chunk, $metadata); $this->last_subprocess = $subprocess; - - return $this->tick_last_subprocess(); } - private function tick_last_subprocess() + protected function read(): bool { $subprocess = $this->last_subprocess; if(!$subprocess) { @@ -607,7 +621,7 @@ public function skip_file($file_id) require __DIR__ . 
'/zip-stream-reader.php'; -class ZipReaderProcess extends Process { +class ZipReaderProcess extends BufferProcessor { private $reader; private $last_skipped_file = null; @@ -626,25 +640,11 @@ public function skip_file($file_id) $this->last_skipped_file = $file_id; } - protected function do_tick($tick_context) { - if(true === $this->process_buffered_data()) { - return true; - } - - if($this->stdin->is_eof()) { - $this->kill(0); - return false; - } - - if(true !== $this->stdin->read()) { - return false; - } - - $this->reader->append_bytes($this->stdin->consume_bytes()); - return $this->process_buffered_data(); + protected function write($bytes, $metadata) { + $this->reader->append_bytes($bytes); } - protected function process_buffered_data() + protected function read(): bool { while ($this->reader->next()) { switch ($this->reader->get_state()) { @@ -776,7 +776,7 @@ public function __construct($process_factories) { * This way we can maintain a predictable $context variable that carries upstream * metadata and exposes methods like skip_file(). 
*/ - protected function do_tick($tick_context) { + protected function do_tick($tick_context): bool { if($this->last_subprocess->stdout->is_eof()) { $this->kill(0); return false; @@ -920,7 +920,7 @@ private function __construct( $requests ) { } } - protected function do_tick($tick_context) + protected function do_tick($tick_context): bool { while($this->client->await_next_event()) { $request = $this->client->get_request(); @@ -953,7 +953,7 @@ protected function do_tick($tick_context) } -class XMLProcess extends Process { +class XMLProcess extends BufferProcessor { private $xml_processor; private $node_visitor_callback; @@ -969,25 +969,12 @@ private function __construct( $node_visitor_callback ) { parent::__construct(); } - protected function do_tick($tick_context) { - if(true === $this->process_buffered_data()) { - return true; - } - - if($this->stdin->is_eof()) { - $this->kill(0); - return false; - } - - if(true !== $this->stdin->read()) { - return false; - } - - $this->xml_processor->stream_append_xml($this->stdin->consume_bytes()); - return $this->process_buffered_data(); + protected function write($bytes, $metadata) + { + $this->xml_processor->stream_append_xml($bytes); } - private function process_buffered_data() + protected function read(): bool { if($this->xml_processor->paused_at_incomplete_token()) { return false; From f557abc4ed045cf6950952a73bb10aa47e0463f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 24 Jul 2024 01:06:28 +0200 Subject: [PATCH 39/72] Ramble more in the todo comment --- pipes-unix.php | 62 +++++++++++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 26 deletions(-) diff --git a/pipes-unix.php b/pipes-unix.php index b57c5be..928985d 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -2,7 +2,36 @@ /** * @TODO: - * + * + * * Explore semantic updates to metadata: + * * Exposing metadata on a stream instance instead of a pipe. 
+ * ^ With the new "execution stack" model, this seems like a great approach. + * $context['zip'] wouldn't be an abstract metadata array, but the actual ZipStreamReader instance + * with all the methods and properties available. + * * Not writing bytes to a pipe but writing a new Chunk($bytes, $metadata) object to tightly couple the two + * ^ the problem with this is that methods like `skip_file()` affect the currently processed file and we + * must call them at the right time + * * Demultiplexing modes: per "sequence_id" (e.g. ZIPping a sequence of files), per "file_id" + * (e.g. XML rewriting each file separately, regardless of the chunks order) + * * Figure out interop Pipe and MultiChannelPipe – they are not interchangeable. Maybe + * we could use metadata to pass the channel name, and the regular pipe would ignore it? + * Maybe a MultiChannelPipe would just have special semantics for that metadata field? + * And it would keep track of eofs etc using a set of internal Pipe instances? + * ^ Now that each chunk is moved downstream as soon as it's produced, we don't need + * to keep multiple buffers around. The only remaining advantage of a MultiChannelPipe + * is tracking EOF for each channel separately. + * ^ Do we need separate "pipes" at all? + * * The process chain semantics assumes every output chunk will be fully processed + * before the next one is produced. + * * Writing to a pipe before consuming its contents is an undefiend behavior similarly + * as with processes. + * * "pipes" are buffers and "processes" are buffers. + * * I can close a single channel in a pipe without closing the entire pipe or the next + * process. + * * A separate Pipe class encapsulates the writing and consumption logic. It wouldn't be + * handy to force that on every process. + * * But still, could we have a ProcessPipe class? And a PipeProcess class? + * * * Find a naming scheme that doesn't suggest we're working with actual Unix processes and pipes. 
* I only used it to make the development easier, I got confused with the other attempt in * `pipes.php` and this kept me on track. However, keeping these names will likely confuse others. @@ -22,23 +51,6 @@ * * ✅ Get rid of stderr. We don't need it to be a stream. A single $error field + bubbling should do. * Let's keep stderr after all. * * ✅ Remove these methods: set_write_channel, ensure_output_channel, add_output_channel, close_output_channel - * * Explore semantic updates to metadata: - * * Exposing metadata on a stream instance instead of a pipe. - * ^ With the new "execution stack" model, this seems like a great approach. - * $context['zip'] wouldn't be an abstract metadata array, but the actual ZipStreamReader instance - * with all the methods and properties available. - * * Not writing bytes to a pipe but writing a new Chunk($bytes, $metadata) object to tightly couple the two - * ^ the problem with this is that methods like `skip_file()` affect the currently processed file and we - * must call them at the right time - * * Demultiplexing modes: per "sequence_id" (e.g. ZIPping a sequence of files), per "file_id" - * (e.g. XML rewriting each file separately, regardless of the chunks order) - * * Figure out interop Pipe and MultiChannelPipe – they are not interchangeable. Maybe - * we could use metadata to pass the channel name, and the regular pipe would ignore it? - * Maybe a MultiChannelPipe would just have special semantics for that metadata field? - * And it would keep track of eofs etc using a set of internal Pipe instances? - * ^ Now that each chunk is moved downstream as soon as it's produced, we don't need - * to keep multiple buffers around. The only remaining advantage of a MultiChannelPipe - * is tracking EOF for each channel separately. * * ✅ Declare `bool` return type everywhere where it's missing. We may even remove it later for PHP BC, * but let's still add it for a moment just to make sure we're not missing any typed return. 
* * ✅ Should Process::tick() return a boolean? Or is it fine if it doesn't return anything? @@ -868,14 +880,12 @@ private function tick_subprocess($process) private function handle_errors($process) { - if(!$process->has_crashed()) { - while ($process->stderr->read()) { - $this->stderr->write($process->stderr->consume_bytes(), [ - 'type' => 'error', - 'process' => $process, - ...($process->stderr->get_metadata() ?? []), - ]); - } + while ($process->stderr->read()) { + $this->stderr->write($process->stderr->consume_bytes(), [ + 'type' => 'error', + 'process' => $process, + ...($process->stderr->get_metadata() ?? []), + ]); } if($process->has_crashed()) { From 765bcfc5c52ed03994fde205489ae60f75bd9efd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 24 Jul 2024 01:25:23 +0200 Subject: [PATCH 40/72] Rely on simple BufferedPipes, don't use MultiplexedPipe --- pipes-unix.php | 107 ++++++++++++------------------------------------- 1 file changed, 26 insertions(+), 81 deletions(-) diff --git a/pipes-unix.php b/pipes-unix.php index 928985d..e6fa57d 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -31,6 +31,9 @@ * * A separate Pipe class encapsulates the writing and consumption logic. It wouldn't be * handy to force that on every process. * * But still, could we have a ProcessPipe class? And a PipeProcess class? + * * Every Process needs a way to receive more data, emit its data, and emit errors. + * Currently we assume a tick() call that does $stdin->read(). We could have a public + * Process::write() method * * * Find a naming scheme that doesn't suggest we're working with actual Unix processes and pipes. * I only used it to make the development easier, I got confused with the other attempt in @@ -150,8 +153,8 @@ abstract class Process { public function __construct($stdin=null, $stdout=null, $stderr=null) { - $this->stdin = $stdin ?? new MultiChannelPipe(); - $this->stdout = $stdout ?? new MultiChannelPipe(); + $this->stdin = $stdin ?? 
new BufferPipe(); + $this->stdout = $stdout ?? new BufferPipe(); $this->stderr = $stderr ?? new BufferPipe(); } @@ -395,13 +398,19 @@ public function __construct($filename, $mode) { } /** - * Idea 1: Use multiple pipes to pass multi-band I/O data between processes. + * This isn't used anymore. Yay! It could be just removed, + * but it looks useful so let's keep it around for a while. */ -class MultiChannelPipe implements Pipe { +class MultiplexingPipe implements Pipe { private $used = false; private array $channels = []; private ?string $last_read_channel = 'default'; + public function __construct(array $pipes = []) + { + $this->channels = $pipes; + } + public function read(): ?bool { if (empty($this->channels)) { return false; @@ -409,7 +418,7 @@ public function read(): ?bool { $channels_to_check = $this->next_channels(); foreach($channels_to_check as $channel_name) { - if(true !== $this->channels[$channel_name]->read()) { + if(!$this->channels[$channel_name]->read()) { continue; } $this->last_read_channel = $channel_name; @@ -474,30 +483,6 @@ public function write(string $data, $metadata = null): bool { return $this->channels[$current_channel]->write($data, $metadata); } - public function ensure_channel($channel_name) - { - if (isset($this->channels[$channel_name])) { - return false; - } - $this->channels[$channel_name] = new BufferPipe(); - } - - public function is_channel_eof($channel_name) - { - if (!isset($this->channels[$channel_name])) { - return false; - } - return $this->channels[$channel_name]->is_eof(); - } - - public function close_channel($channel_name) - { - if (!isset($this->channels[$channel_name])) { - return false; - } - return $this->channels[$channel_name]->close(); - } - public function is_eof(): bool { if(!$this->used) { return false; @@ -518,17 +503,6 @@ public function close() { } } -/** - * Idea 2: Use a single pipe that keeps track of the `channel` and `file_id` metadata. 
- */ -// class MultiplexedPipe implements Pipe -// { -// public function write(string $data, $metadata = null) { -// $this->channel = $metadata['channel'] ?? 'default'; -// $this->file_id = $metadata['file_id'] ?? 'default'; -// } -// } - class Uppercaser extends TransformProcess { static public function stream() { @@ -590,20 +564,17 @@ protected function read(): bool return false; } - if(false === $subprocess->tick()) { + if(!$subprocess->tick()) { return false; } - if (true === $subprocess->stdout->read()) { + if ($subprocess->stdout->read()) { $output = $subprocess->stdout->consume_bytes(); $chunk_metadata = array_merge( ['channel' => $this->last_input_channel], $subprocess->stdout->get_metadata() ?? [], ); $this->stdout->write($output, $chunk_metadata); - if ($subprocess->stdout->is_channel_eof($chunk_metadata['channel'])) { - $this->stdout->close_channel($chunk_metadata['channel']); - } return true; } @@ -663,24 +634,12 @@ protected function read(): bool case ZipStreamReader::STATE_FILE_ENTRY: $file_path = $this->reader->get_file_path(); if ($this->last_skipped_file === $file_path) { - // break; + break; } $this->stdout->write($this->reader->get_file_body_chunk(), [ 'file_id' => $file_path, - // We don't want any single chunk to contain mixed bytes from - // multiple files. - // - // Therefore, we must either: - // - // * Use a separate channel for each file to have distinct - // buckets that don't mix. - // * Use a single channel and ensure the unzipped file is fully - // written and consumed before we start writing the next file. - // - // The second option requires more implementation complexity and also - // requires checking whether the output pipe has been read completely - // which is very specific to a BufferPipe. The first option seems simpler - // so let's go with that. + // Use a separate channel for each file so the next + // process may separate the files. 
'channel' => $file_path, ]); return true; @@ -919,15 +878,6 @@ private function __construct( $requests ) { $this->client->enqueue( $requests ); parent::__construct(); - - // Pre-open all output channels to ensure the stdout stream - // stays open until all the requests conclude. Otherwise, - // we could have a window of time when some requests are done, - // others haven't started outputting yet, and the stdout stream - // is considered EOF. - foreach($requests as $request) { - $this->stdout->ensure_channel('request_' . $request->id); - } } protected function do_tick($tick_context): bool @@ -947,11 +897,6 @@ protected function do_tick($tick_context): bool $this->stderr->write('Request failed: ' . $request->error, [ 'request' => $request ]); - $this->stdout->close_channel($output_channel); - break; - - case Client::EVENT_FINISHED: - $this->stdout->close_channel($output_channel); break; } } @@ -1077,13 +1022,13 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { ]), 'zip' => ZipReaderProcess::stream(), - CallbackProcess::stream(function ($data, $context, $process) { - if ($context['zip']['file_id'] === 'export.wxr') { - $context['zip']->skip_file('export.wxr'); - return null; - } - return $data; - }), + // CallbackProcess::stream(function ($data, $context, $process) { + // if ($context['zip']['file_id'] === 'export.wxr') { + // $context['zip']->skip_file('export.wxr'); + // return null; + // } + // return $data; + // }), 'xml' => XMLProcess::stream($rewrite_links_in_wxr_node), Uppercaser::stream(), ]); From 760198506bb28d51d2d4d33f5499964fc4beeb44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 24 Jul 2024 01:46:03 +0200 Subject: [PATCH 41/72] Make ProcessChain an iterator --- pipes-unix.php | 244 +++++++++++++++++++++++++++++-------------------- 1 file changed, 146 insertions(+), 98 deletions(-) diff --git a/pipes-unix.php b/pipes-unix.php index e6fa57d..fa51c6c 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -3,42 +3,49 
@@ /** * @TODO: * - * * Explore semantic updates to metadata: + * * Find a naming scheme that doesn't suggest we're working with actual Unix processes and pipes. + * I only used it to make the development easier, I got confused with the other attempt in + * `pipes.php` and this kept me on track. However, keeping these names will likely confuse others. + * * ✅ Make ProcessChain implement the Iterator interface. Iterator semantics doesn't make + * as much sense on regular process classes because they may run out of input and they + * can't pull more bytes from the top of the stream. + * * ✅ Explore changes updates to metadata: * * Exposing metadata on a stream instance instead of a pipe. * ^ With the new "execution stack" model, this seems like a great approach. * $context['zip'] wouldn't be an abstract metadata array, but the actual ZipStreamReader instance * with all the methods and properties available. + * ^ Problem is, we want the next stream to have access to the metadata. + * ^ Well, writing metadata to stdout is the same as coupling it with the stream instance. + * Also, since we'll need to access the metadata later, even after it's been written to the next + * stream, we may need to keep the Pipe class around. * * Not writing bytes to a pipe but writing a new Chunk($bytes, $metadata) object to tightly couple the two * ^ the problem with this is that methods like `skip_file()` affect the currently processed file and we * must call them at the right time - * * Demultiplexing modes: per "sequence_id" (e.g. ZIPping a sequence of files), per "file_id" - * (e.g. XML rewriting each file separately, regardless of the chunks order) - * * Figure out interop Pipe and MultiChannelPipe – they are not interchangeable. Maybe - * we could use metadata to pass the channel name, and the regular pipe would ignore it? - * Maybe a MultiChannelPipe would just have special semantics for that metadata field? 
- * And it would keep track of eofs etc using a set of internal Pipe instances? - * ^ Now that each chunk is moved downstream as soon as it's produced, we don't need - * to keep multiple buffers around. The only remaining advantage of a MultiChannelPipe - * is tracking EOF for each channel separately. - * ^ Do we need separate "pipes" at all? - * * The process chain semantics assumes every output chunk will be fully processed - * before the next one is produced. - * * Writing to a pipe before consuming its contents is an undefiend behavior similarly - * as with processes. - * * "pipes" are buffers and "processes" are buffers. - * * I can close a single channel in a pipe without closing the entire pipe or the next - * process. - * * A separate Pipe class encapsulates the writing and consumption logic. It wouldn't be - * handy to force that on every process. - * * But still, could we have a ProcessPipe class? And a PipeProcess class? - * * Every Process needs a way to receive more data, emit its data, and emit errors. - * Currently we assume a tick() call that does $stdin->read(). We could have a public - * Process::write() method - * - * * Find a naming scheme that doesn't suggest we're working with actual Unix processes and pipes. - * I only used it to make the development easier, I got confused with the other attempt in - * `pipes.php` and this kept me on track. However, keeping these names will likely confuse others. - * * Make Process implement the Iterator interface + * * ✅ Demultiplexing modes: per "sequence_id" (e.g. ZIPping a sequence of files), per "file_id" + * (e.g. XML rewriting each file separately, regardless of the chunks order) + * ^ $key constructor argument handles that now + * * ✅ Figure out interop Pipe and MultiplexedPipe – they are not interchangeable. Maybe + * we could use metadata to pass the sequence name, and the regular pipe would ignore it? + * Maybe a MultiplexedPipe would just have special semantics for that metadata field? 
+ * And it would keep track of eofs etc using a set of internal Pipe instances? + * ^ Now that each chunk is moved downstream as soon as it's produced, we don't need + * to keep multiple buffers around. The only remaining advantage of a MultiplexedPipe + * is tracking EOF for each sequence separately. + * ^ Do we need separate "pipes" at all? + * * The process chain semantics assumes every output chunk will be fully processed + * before the next one is produced. + * * Writing to a pipe before consuming its contents is an undefiend behavior similarly + * as with processes. + * * "pipes" are buffers and "processes" are buffers. + * * I can close a single sequence in a pipe without closing the entire pipe or the next + * process. + * * A separate Pipe class encapsulates the writing and consumption logic. It wouldn't be + * handy to force that on every process. + * * But still, could we have a ProcessPipe class? And a PipeProcess class? + * * Every Process needs a way to receive more data, emit its data, and emit errors. + * Currently we assume a tick() call that does $stdin->read(). We could have a public + * Process::write() method + * ^ MultiplexedPipe isn't used anymore * * ✅ The process `do_tick` method typically checks for `stdin->is_eof()` and then * whether `stdin->read()` is valid. Can we simplify this boilerplate somehow? * ^ the BufferProcessor interface solves that problem. @@ -53,7 +60,7 @@ * * ✅ Get rid of ProcessManager * * ✅ Get rid of stderr. We don't need it to be a stream. A single $error field + bubbling should do. * Let's keep stderr after all. - * * ✅ Remove these methods: set_write_channel, ensure_output_channel, add_output_channel, close_output_channel + * * ✅ Remove these methods: set_write_sequence, ensure_output_sequence, add_output_sequence, close_output_sequence * * ✅ Declare `bool` return type everywhere where it's missing. 
We may even remove it later for PHP BC, * but let's still add it for a moment just to make sure we're not missing any typed return. * * ✅ Should Process::tick() return a boolean? Or is it fine if it doesn't return anything? @@ -77,15 +84,15 @@ */ /** - * ## Demultiplexing modes: per input channel, per $metadata['file_id']. + * ## Demultiplexing modes: per input sequence, per $metadata['file_id']. * * We want to keep track of: - * * Stream ID – the sequential byte stream identifier. Multiple streams will produce - * file chunks in an arbitrary order and, when multiplexed, the chunks will be - * interleaved. - * * File ID – the file within that stream. A single stream may contain multiple files, - * but they will always be written sequentially. When multiplexed, one file will - * always be written completely before the next one is started. + * * Sequence ID – the sequential byte stream identifier. Multiple streams will produce + * file chunks in an arbitrary order and, when multiplexed, the chunks will be + * interleaved. + * * File ID – the file within that stream. A single stream may contain multiple files, + * but they will always be written sequentially. When multiplexed, one file will + * always be written completely before the next one is started. * * When a specific stream errors out, we need to communicate this * downstream and so the consumer processes can handle the error. 
@@ -403,25 +410,25 @@ public function __construct($filename, $mode) { */ class MultiplexingPipe implements Pipe { private $used = false; - private array $channels = []; - private ?string $last_read_channel = 'default'; + private array $sequences = []; + private ?string $last_read_sequence = 'default'; public function __construct(array $pipes = []) { - $this->channels = $pipes; + $this->sequences = $pipes; } public function read(): ?bool { - if (empty($this->channels)) { + if (empty($this->sequences)) { return false; } - $channels_to_check = $this->next_channels(); - foreach($channels_to_check as $channel_name) { - if(!$this->channels[$channel_name]->read()) { + $sequences_to_check = $this->next_sequences(); + foreach($sequences_to_check as $sequence_name) { + if(!$this->sequences[$sequence_name]->read()) { continue; } - $this->last_read_channel = $channel_name; + $this->last_read_sequence = $sequence_name; return true; } @@ -430,64 +437,64 @@ public function read(): ?bool { public function consume_bytes() { - if(!$this->last_read_channel || !isset($this->channels[$this->last_read_channel])) { + if(!$this->last_read_sequence || !isset($this->sequences[$this->last_read_sequence])) { return null; } - return $this->channels[$this->last_read_channel]->consume_bytes(); + return $this->sequences[$this->last_read_sequence]->consume_bytes(); } public function get_metadata() { - if(!$this->last_read_channel || !isset($this->channels[$this->last_read_channel])) { + if(!$this->last_read_sequence || !isset($this->sequences[$this->last_read_sequence])) { return null; } - return $this->channels[$this->last_read_channel]->get_metadata(); + return $this->sequences[$this->last_read_sequence]->get_metadata(); } - private function next_channels() { - $channels_queue = []; - $channel_names = array_keys($this->channels); - $last_read_channel_index = array_search($this->last_read_channel, $channel_names); - if(false === $last_read_channel_index) { - $last_read_channel_index = 0; - } else 
if($last_read_channel_index > count($channel_names)) { - $last_read_channel_index = count($channel_names) - 1; + private function next_sequences() { + $sequences_queue = []; + $sequence_names = array_keys($this->sequences); + $last_read_sequence_index = array_search($this->last_read_sequence, $sequence_names); + if(false === $last_read_sequence_index) { + $last_read_sequence_index = 0; + } else if($last_read_sequence_index > count($sequence_names)) { + $last_read_sequence_index = count($sequence_names) - 1; } - $this->last_read_channel = null; - for ($i = 1; $i <= count($channel_names); $i++) { - $key_index = ($last_read_channel_index + $i) % count($channel_names); - $channel_name = $channel_names[$key_index]; - if($this->channels[$channel_name]->is_eof()) { - unset($this->channels[$channel_name]); + $this->last_read_sequence = null; + for ($i = 1; $i <= count($sequence_names); $i++) { + $key_index = ($last_read_sequence_index + $i) % count($sequence_names); + $sequence_name = $sequence_names[$key_index]; + if($this->sequences[$sequence_name]->is_eof()) { + unset($this->sequences[$sequence_name]); continue; } - $this->last_read_channel = $channel_name; - $channels_queue[] = $channel_name; + $this->last_read_sequence = $sequence_name; + $sequences_queue[] = $sequence_name; } - return $channels_queue; + return $sequences_queue; } public function write(string $data, $metadata = null): bool { $this->used = true; - $current_channel = 'default'; + $current_sequence = 'default'; - if(is_array($metadata) && isset($metadata['channel'])) { - $current_channel = $metadata['channel']; + if(is_array($metadata) && isset($metadata['sequence'])) { + $current_sequence = $metadata['sequence']; } - if (!isset($this->channels[$current_channel])) { - $this->channels[$current_channel] = new BufferPipe(); + if (!isset($this->sequences[$current_sequence])) { + $this->sequences[$current_sequence] = new BufferPipe(); } - $this->last_read_channel = $current_channel; - return 
$this->channels[$current_channel]->write($data, $metadata); + $this->last_read_sequence = $current_sequence; + return $this->sequences[$current_sequence]->write($data, $metadata); } public function is_eof(): bool { if(!$this->used) { return false; } - foreach ($this->channels as $pipe) { + foreach ($this->sequences as $pipe) { if (!$pipe->is_eof()) { return false; } @@ -497,7 +504,7 @@ public function is_eof(): bool { public function close() { $this->used = true; - foreach ($this->channels as $pipe) { + foreach ($this->sequences as $pipe) { $pipe->close(); } } @@ -537,22 +544,24 @@ class Demultiplexer extends BufferProcessor { private $killed_subprocesses = []; private $demux_queue = []; private $last_subprocess; - private $last_input_channel; + private $last_input_key; + private $key; - public function __construct($process_factory) { + public function __construct($process_factory, $key = 'sequence') { $this->process_factory = $process_factory; + $this->key = $key; parent::__construct(); } protected function write($next_chunk, $metadata) { - $input_channel = is_array($metadata) && !empty( $metadata['channel'] ) ? $metadata['channel'] : 'default'; - $this->last_input_channel = $input_channel; - if (!isset($this->subprocesses[$input_channel])) { + $chunk_key = is_array($metadata) && !empty( $metadata[$this->key] ) ? 
$metadata[$this->key] : 'default'; + $this->last_input_key = $chunk_key; + if (!isset($this->subprocesses[$chunk_key])) { $factory = $this->process_factory; - $this->subprocesses[$input_channel] = $factory(); + $this->subprocesses[$chunk_key] = $factory(); } - $subprocess = $this->subprocesses[$input_channel]; + $subprocess = $this->subprocesses[$chunk_key]; $subprocess->stdin->write($next_chunk, $metadata); $this->last_subprocess = $subprocess; } @@ -571,7 +580,7 @@ protected function read(): bool if ($subprocess->stdout->read()) { $output = $subprocess->stdout->consume_bytes(); $chunk_metadata = array_merge( - ['channel' => $this->last_input_channel], + [$this->key => $this->last_input_key], $subprocess->stdout->get_metadata() ?? [], ); $this->stdout->write($output, $chunk_metadata); @@ -581,7 +590,7 @@ protected function read(): bool if (!$subprocess->is_alive()) { if ($subprocess->has_crashed()) { $this->stderr->write( - "Subprocess $this->last_input_channel has crashed with code {$subprocess->exit_code}", + "Subprocess $this->last_input_key has crashed with code {$subprocess->exit_code}", [ 'type' => 'crash', 'process' => $subprocess, @@ -638,9 +647,9 @@ protected function read(): bool } $this->stdout->write($this->reader->get_file_body_chunk(), [ 'file_id' => $file_path, - // Use a separate channel for each file so the next + // Use a separate sequence for each file so the next // process may separate the files. 
- 'channel' => $file_path, + 'sequence' => $file_path, ]); return true; } @@ -690,7 +699,7 @@ public function skip_file($file_id) } -class ProcessChain extends Process { +class ProcessChain extends Process implements Iterator { private $first_subprocess; private $last_subprocess; public $subprocesses = []; @@ -804,6 +813,7 @@ protected function do_tick($tick_context): bool { $this->last_subprocess->stdout->consume_bytes(), $this->tick_context ); + ++$this->chunk_nb; return true; } @@ -859,6 +869,42 @@ private function handle_errors($process) } } } + + // Iterator methods. These don't make much sense on a regular + // process class because they cannot pull more input chunks from + // the top of the stream like ProcessChain can. + + private $iterator_output_cache; + private $chunk_nb = -1; + public function current(): mixed { + if(null === $this->iterator_output_cache) { + $this->iterator_output_cache = $this->stdout->consume_bytes(); + } + return $this->iterator_output_cache; + } + + public function key(): mixed { + return $this->chunk_nb; + } + + public function rewind(): void { + $this->next(); + } + + public function next(): void { + $this->iterator_output_cache = null; + while(!$this->tick()) { + if(!$this->is_alive()) { + break; + } + usleep(10000); + } + } + + public function valid(): bool { + return $this->is_alive(); + } + } @@ -884,11 +930,11 @@ protected function do_tick($tick_context): bool { while($this->client->await_next_event()) { $request = $this->client->get_request(); - $output_channel = 'request_' . $request->id; + $output_sequence = 'request_' . 
$request->id; switch ($this->client->get_event()) { case Client::EVENT_BODY_CHUNK_AVAILABLE: $this->stdout->write($this->client->get_response_body_chunk(), [ - 'channel' => $output_channel, + 'sequence' => $output_sequence, 'request' => $request ]); return true; @@ -1022,19 +1068,21 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { ]), 'zip' => ZipReaderProcess::stream(), - // CallbackProcess::stream(function ($data, $context, $process) { - // if ($context['zip']['file_id'] === 'export.wxr') { - // $context['zip']->skip_file('export.wxr'); - // return null; - // } - // return $data; - // }), + CallbackProcess::stream(function ($data, $context, $process) { + if ($context['zip']['file_id'] === 'export.wxr') { + $context['zip']->skip_file('export.wxr'); + return null; + } + return $data; + }), 'xml' => XMLProcess::stream($rewrite_links_in_wxr_node), Uppercaser::stream(), ]); -$process->stdout = new FilePipe('php://stdout', 'w'); +// $process->stdout = new FilePipe('php://stdout', 'w'); $process->stderr = new FilePipe('php://stderr', 'w'); -$process->run(); +foreach($process as $k => $chunk) { + var_dump([$k => $chunk]); +} function log_process_chain_errors($process) { return; From bd0e81206f1a179b72514d7a97b7d9b81be80331 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 24 Jul 2024 01:50:34 +0200 Subject: [PATCH 42/72] Add commentary about merging pipes and processes --- pipes-unix.php | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pipes-unix.php b/pipes-unix.php index fa51c6c..b6e345c 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -6,6 +6,11 @@ * * Find a naming scheme that doesn't suggest we're working with actual Unix processes and pipes. * I only used it to make the development easier, I got confused with the other attempt in * `pipes.php` and this kept me on track. However, keeping these names will likely confuse others. + * * ✅ Explore merging Pipes and Processes into a single concept after all. 
+ * Not doing that is nice, too. Writing to stdout is not equivalent to + * starting more computation downstream. Reading from stdin is not equivalent + * to trigerring more computations upstream. We get a buffer, a demilitarized + * zone between processes. Perhaps that's what was missing from the other experiment. * * ✅ Make ProcessChain implement the Iterator interface. Iterator semantics doesn't make * as much sense on regular process classes because they may run out of input and they * can't pull more bytes from the top of the stream. From c3565255ceb831701a2d5fc354dbac8dc623c646 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 30 Jul 2024 11:45:17 +0200 Subject: [PATCH 43/72] Expose the Process instance as $context, expose direct access to the underlying processor. --- pipes-unix.php | 100 +++++++++++++++++++++++++++---------------------- 1 file changed, 56 insertions(+), 44 deletions(-) diff --git a/pipes-unix.php b/pipes-unix.php index b6e345c..e23ab07 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -156,7 +156,7 @@ use WordPress\AsyncHttp\Client; use WordPress\AsyncHttp\Request; -abstract class Process { +abstract class Process implements ArrayAccess { private ?int $exit_code = null; private bool $is_reaped = false; public Pipe $stdin; @@ -226,6 +226,23 @@ public function skip_file($file_id) { return false; } + + public function offsetExists($offset): bool { + return isset($this->stdout->get_metadata()[$offset]); + } + + public function offsetGet($offset): mixed { + return $this->stdout->get_metadata()[$offset] ?? 
null; + } + + public function offsetSet($offset, $value): void { + // No op + } + + public function offsetUnset($offset): void { + // No op + } + } abstract class BufferProcessor extends Process @@ -558,6 +575,11 @@ public function __construct($process_factory, $key = 'sequence') { parent::__construct(); } + public function get_subprocess() + { + return $this->last_subprocess; + } + protected function write($next_chunk, $metadata) { $chunk_key = is_array($metadata) && !empty( $metadata[$this->key] ) ? $metadata[$this->key] : 'default'; $this->last_input_key = $chunk_key; @@ -632,6 +654,11 @@ protected function __construct() { $this->reader = new ZipStreamReader(''); } + public function get_zip_reader() + { + return $this->reader; + } + public function skip_file($file_id) { $this->last_skipped_file = $file_id; @@ -664,46 +691,6 @@ protected function read(): bool } } -class TickContext implements ArrayAccess { - private $data; - public $process; - - public function offsetExists($offset): bool { - $this->get_metadata(); - return isset($this->data[$offset]); - } - - public function offsetGet($offset): mixed { - $this->get_metadata(); - return $this->data[$offset] ?? 
null; - } - - public function offsetSet($offset, $value): void { - $this->data[$offset] = $value; - } - - public function offsetUnset($offset): void { - unset($this->data[$offset]); - } - - public function __construct($process) - { - $this->process = $process; - } - - public function get_metadata() - { - $this->data = $this->process->stdout->get_metadata(); - return $this->data; - } - - public function skip_file($file_id) - { - return $this->process->skip_file($file_id); - } - -} - class ProcessChain extends Process implements Iterator { private $first_subprocess; private $last_subprocess; @@ -842,14 +829,14 @@ private function push_process($process) { array_push($this->execution_stack, $process); $name = $this->subprocesses_names[count($this->execution_stack) - 1]; - $this->tick_context[$name] = new TickContext($process); + $this->tick_context[$name] = $process; } private function tick_subprocess($process) { $produced_output = $process->tick($this->tick_context); $this->handle_errors($process); - return $produced_output; + return $produced_output; } private function handle_errors($process) @@ -910,6 +897,25 @@ public function valid(): bool { return $this->is_alive(); } + + // ArrayAccess on ProcessChain exposes specific + // sub-processes by their names. + public function offsetExists($offset): bool { + return isset($this->tick_context[$offset]); + } + + public function offsetGet($offset): mixed { + return $this->tick_context[$offset] ?? 
null; + } + + public function offsetSet($offset, $value): void { + // No op + } + + public function offsetUnset($offset): void { + // No op + } + } @@ -975,6 +981,11 @@ private function __construct( $node_visitor_callback ) { parent::__construct(); } + public function get_xml_processor() + { + return $this->xml_processor; + } + protected function write($bytes, $metadata) { $this->xml_processor->stream_append_xml($bytes); @@ -1074,10 +1085,11 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { 'zip' => ZipReaderProcess::stream(), CallbackProcess::stream(function ($data, $context, $process) { - if ($context['zip']['file_id'] === 'export.wxr') { + if ($context['zip']['file_id'] !== 'export.wxr') { $context['zip']->skip_file('export.wxr'); return null; } + print_r($context['zip']->get_subprocess()->get_zip_reader()->get_header()); return $data; }), 'xml' => XMLProcess::stream($rewrite_links_in_wxr_node), From 5b6ac94447cba600e657accfd4186b139c5ce741 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 30 Jul 2024 11:47:00 +0200 Subject: [PATCH 44/72] Unwrap the contextual process from Demultiplexer --- pipes-unix.php | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pipes-unix.php b/pipes-unix.php index e23ab07..1b4fbdc 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -829,6 +829,9 @@ private function push_process($process) { array_push($this->execution_stack, $process); $name = $this->subprocesses_names[count($this->execution_stack) - 1]; + if($process instanceof Demultiplexer) { + $process = $process->get_subprocess(); + } $this->tick_context[$name] = $process; } @@ -1089,7 +1092,7 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { $context['zip']->skip_file('export.wxr'); return null; } - print_r($context['zip']->get_subprocess()->get_zip_reader()->get_header()); + print_r($context['zip']->get_zip_reader()->get_header()); return $data; }), 'xml' => 
XMLProcess::stream($rewrite_links_in_wxr_node), From 9ec21c7c4b94dc8f9d8b9d0d8cbb4f2efc998343 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 30 Jul 2024 13:49:23 +0200 Subject: [PATCH 45/72] Make fds protected, not public. Make TransformProcess a descendant of BufferProcessor --- pipes-unix.php | 94 +++++++++++++++++++++++++++++--------------------- 1 file changed, 54 insertions(+), 40 deletions(-) diff --git a/pipes-unix.php b/pipes-unix.php index 1b4fbdc..70d067a 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -159,9 +159,9 @@ abstract class Process implements ArrayAccess { private ?int $exit_code = null; private bool $is_reaped = false; - public Pipe $stdin; - public Pipe $stdout; - public Pipe $stderr; + protected Pipe $stdin; + protected Pipe $stdout; + protected Pipe $stderr; public function __construct($stdin=null, $stdout=null, $stderr=null) { @@ -262,31 +262,42 @@ protected function do_tick($tick_context): bool $this->write( $this->stdin->consume_bytes(), - $this->stdin->get_metadata() + $this->stdin->get_metadata(), + $tick_context ); return $this->read(); } - abstract protected function write($input_chunk, $metadata); + abstract protected function write($input_chunk, $metadata, $tick_context); abstract protected function read(): bool; } -abstract class TransformProcess extends Process { - protected function do_tick($tick_context): bool { - if(!$this->stdin->read()) { - if($this->stdin->is_eof()) { - $this->kill(0); - } +abstract class TransformProcess extends BufferProcessor { + + protected $buffer; + protected $metadata; + protected $tick_context; + + protected function write($input_chunk, $metadata, $tick_context) + { + $this->buffer .= $input_chunk; + $this->metadata = $metadata; + $this->tick_context = $tick_context; + } + + protected function read(): bool + { + if(null === $this->buffer) { return false; } - - $transformed = $this->transform($this->stdin->consume_bytes(), $tick_context); + $transformed = 
$this->transform($this->buffer, $this->tick_context); + $this->buffer = null; if (null === $transformed || false === $transformed) { return false; } - $this->stdout->write($transformed, $this->stdin->get_metadata()); + $this->stdout->write($transformed, $this->metadata); return true; } @@ -546,7 +557,7 @@ class CallbackProcess extends TransformProcess { private $callback; static public function stream($callback) { - return fn () => new CallbackProcess($callback); + return fn () => new static($callback); } private function __construct($callback) { @@ -580,7 +591,7 @@ public function get_subprocess() return $this->last_subprocess; } - protected function write($next_chunk, $metadata) { + protected function write($next_chunk, $metadata, $tick_context) { $chunk_key = is_array($metadata) && !empty( $metadata[$this->key] ) ? $metadata[$this->key] : 'default'; $this->last_input_key = $chunk_key; if (!isset($this->subprocesses[$chunk_key])) { @@ -664,7 +675,7 @@ public function skip_file($file_id) $this->last_skipped_file = $file_id; } - protected function write($bytes, $metadata) { + protected function write($bytes, $metadata, $tick_context) { $this->reader->append_bytes($bytes); } @@ -700,8 +711,8 @@ class ProcessChain extends Process implements Iterator { private $execution_stack = []; private $tick_context = []; - public function __construct($process_factories) { - parent::__construct(); + public function __construct($process_factories, $stdin=null, $stdout=null, $stderr=null) { + parent::__construct($stdin, $stdout, $stderr); $last_process = null; $this->subprocesses_names = array_keys($process_factories); @@ -989,7 +1000,7 @@ public function get_xml_processor() return $this->xml_processor; } - protected function write($bytes, $metadata) + protected function write($bytes, $metadata, $tick_context) { $this->xml_processor->stream_append_xml($bytes); } @@ -1079,27 +1090,30 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { require __DIR__ . 
'/bootstrap.php'; -$process = new ProcessChain([ - HttpClientProcess::stream([ - new Request('http://127.0.0.1:9864/export.wxr.zip'), - // Bad request, will fail: - new Request('http://127.0.0.1:9865'), - ]), +$process = new ProcessChain( + [ + HttpClientProcess::stream([ + new Request('http://127.0.0.1:9864/export.wxr.zip'), + // Bad request, will fail: + new Request('http://127.0.0.1:9865'), + ]), - 'zip' => ZipReaderProcess::stream(), - CallbackProcess::stream(function ($data, $context, $process) { - if ($context['zip']['file_id'] !== 'export.wxr') { - $context['zip']->skip_file('export.wxr'); - return null; - } - print_r($context['zip']->get_zip_reader()->get_header()); - return $data; - }), - 'xml' => XMLProcess::stream($rewrite_links_in_wxr_node), - Uppercaser::stream(), -]); -// $process->stdout = new FilePipe('php://stdout', 'w'); -$process->stderr = new FilePipe('php://stderr', 'w'); + 'zip' => ZipReaderProcess::stream(), + CallbackProcess::stream(function ($data, $context, $process) { + if ($context['zip']['file_id'] !== 'export.wxr') { + $context['zip']->skip_file('export.wxr'); + return null; + } + print_r($context['zip']->get_zip_reader()->get_header()); + return $data; + }), + 'xml' => XMLProcess::stream($rewrite_links_in_wxr_node), + Uppercaser::stream(), + ], + null, + null, + new FilePipe('php://stderr', 'w') +); foreach($process as $k => $chunk) { var_dump([$k => $chunk]); } From 937afad0f56c6479c197f4d94337d9fc8a2e3223 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 30 Jul 2024 14:06:52 +0200 Subject: [PATCH 46/72] Rename stdin/stdout/stderr to input/output/errors --- pipes-unix.php | 149 ++++++++++++++++++++++++------------------------- 1 file changed, 74 insertions(+), 75 deletions(-) diff --git a/pipes-unix.php b/pipes-unix.php index 70d067a..0167d0f 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -7,8 +7,8 @@ * I only used it to make the development easier, I got confused with the other attempt in * `pipes.php` 
and this kept me on track. However, keeping these names will likely confuse others. * * ✅ Explore merging Pipes and Processes into a single concept after all. - * Not doing that is nice, too. Writing to stdout is not equivalent to - * starting more computation downstream. Reading from stdin is not equivalent + * Not doing that is nice, too. Writing to output is not equivalent to + * starting more computation downstream. Reading from input is not equivalent * to trigerring more computations upstream. We get a buffer, a demilitarized * zone between processes. Perhaps that's what was missing from the other experiment. * * ✅ Make ProcessChain implement the Iterator interface. Iterator semantics doesn't make @@ -20,7 +20,7 @@ * $context['zip'] wouldn't be an abstract metadata array, but the actual ZipStreamReader instance * with all the methods and properties available. * ^ Problem is, we want the next stream to have access to the metadata. - * ^ Well, writing metadata to stdout is the same as coupling it with the stream instance. + * ^ Well, writing metadata to output is the same as coupling it with the stream instance. * Also, since we'll need to access the metadata later, even after it's been written to the next * stream, we may need to keep the Pipe class around. * * Not writing bytes to a pipe but writing a new Chunk($bytes, $metadata) object to tightly couple the two @@ -48,11 +48,11 @@ * handy to force that on every process. * * But still, could we have a ProcessPipe class? And a PipeProcess class? * * Every Process needs a way to receive more data, emit its data, and emit errors. - * Currently we assume a tick() call that does $stdin->read(). We could have a public + * Currently we assume a tick() call that does $input->read(). We could have a public * Process::write() method * ^ MultiplexedPipe isn't used anymore - * * ✅ The process `do_tick` method typically checks for `stdin->is_eof()` and then - * whether `stdin->read()` is valid. 
Can we simplify this boilerplate somehow? + * * ✅ The process `do_tick` method typically checks for `input->is_eof()` and then + * whether `input->read()` is valid. Can we simplify this boilerplate somehow? * ^ the BufferProcessor interface solves that problem. * * ✅ Explore a shared "Streamable" interface for all stream processors (HTML, XML, ZIP, HTTP, etc.) * ^ Would the "Process" have the same interface? A `tick()` seems isomorphic to @@ -63,8 +63,8 @@ * with `read()` and `write($bytes, $metadata)` methods seems to be a good fit for XML, ZIP, HTTP. * It resembles a Pipe interface, too. I wonder if these "Process" classes could be pipes themselves. * * ✅ Get rid of ProcessManager - * * ✅ Get rid of stderr. We don't need it to be a stream. A single $error field + bubbling should do. - * Let's keep stderr after all. + * * ✅ Get rid of errors. We don't need it to be a stream. A single $error field + bubbling should do. + * Let's keep errors after all. * * ✅ Remove these methods: set_write_sequence, ensure_output_sequence, add_output_sequence, close_output_sequence * * ✅ Declare `bool` return type everywhere where it's missing. We may even remove it later for PHP BC, * but let's still add it for a moment just to make sure we're not missing any typed return. @@ -84,8 +84,8 @@ * bit me a few times when I was in a context where I could not call read() first because, * e.g. another process was about to do that. Maybe this is a good thing, as it forces us * to split a pipe in two whenever an intermediate read is involved, e.g. Process A wouldn't - * just connect it's stdin to a subprocess A.1, but it would read from stdin, read metadata, - * do processing, ant only then write to A.1 stdin. Still, a better error reporting wouldn't hurt. + * just connect it's input to a subprocess A.1, but it would read from input, read metadata, + * do processing, ant only then write to A.1 input. Still, a better error reporting wouldn't hurt. 
*/ /** @@ -107,14 +107,14 @@ * about the open-ness or EOF-ness of its input and output pipes, * not about the actual lifecycle of the other processes. * - * However, we may want to correlate the same stream ID with stdout and - * stderr streams, in which case intertwining stream ID and process ID + * However, we may want to correlate the same stream ID with output and + * errors streams, in which case intertwining stream ID and process ID * would be useful. But then we don't have a 1:1 mapping between * what a data stream does and what a process does. * * Let's try these two approach and see where we get with it: * - * 1. Each process has a multiplexed stdin, stdout, and stderr pipes. + * 1. Each process has a multiplexed input, output, and errors pipes. * We do not use non-multiplexed pipes at all. Every process communicates * "there will be more output to come" by keeping at least one output * pipe open. Each process makes sure to react to sub-pipe state changes. @@ -131,26 +131,26 @@ /** - * ## Get rid of stderr. We don't need it to be a stream. A single $error field + bubbling should do. + * ## Get rid of errors. We don't need it to be a stream. A single $error field + bubbling should do. * - * Maybe stderr is fine after all? I'm no longer convinced about inventing a separate mechanism - * for error propagation. We'd have to implement a lot of the same features that stderr already + * Maybe errors is fine after all? I'm no longer convinced about inventing a separate mechanism + * for error propagation. We'd have to implement a lot of the same features that errors already * have. * - * Advantages of using stderr for propagating errors: + * Advantages of using errors for propagating errors: * * * We can bubble up multiple errors from a single process. * * They have metadata attached and are traceable to a specific process. 
- * * Piping to stderr doesn't imply the entire process have crashed, which we + * * Piping to errors doesn't imply the entire process have crashed, which we * wouldn't want in case of, say, Demultiplexer. - * * We clearly know when the errors are done, as stderr is a stream and we know + * * We clearly know when the errors are done, as errors is a stream and we know * when it's EOF. - * * We can put any pipe in place of stderr, e.g. a generic logger pipe + * * We can put any pipe in place of errors, e.g. a generic logger pipe * * Disadvantages: * * * Pipes have more features than error propagation uses, e.g. we rarely care - * for is_eof() on stderr, but we still have to close that errors pipe. + * for is_eof() on errors, but we still have to close that errors pipe. */ use WordPress\AsyncHttp\Client; @@ -159,15 +159,15 @@ abstract class Process implements ArrayAccess { private ?int $exit_code = null; private bool $is_reaped = false; - protected Pipe $stdin; - protected Pipe $stdout; - protected Pipe $stderr; + protected Pipe $input; + protected Pipe $output; + protected Pipe $errors; - public function __construct($stdin=null, $stdout=null, $stderr=null) + public function __construct($input=null, $output=null, $errors=null) { - $this->stdin = $stdin ?? new BufferPipe(); - $this->stdout = $stdout ?? new BufferPipe(); - $this->stderr = $stderr ?? new BufferPipe(); + $this->input = $input ?? new BufferPipe(); + $this->output = $output ?? new BufferPipe(); + $this->errors = $errors ?? 
new BufferPipe(); } public function run() @@ -189,9 +189,9 @@ abstract protected function do_tick($tick_context): bool; public function kill($code) { $this->exit_code = $code; - $this->stdin->close(); - $this->stdout->close(); - $this->stderr->close(); + $this->input->close(); + $this->output->close(); + $this->errors->close(); } public function reap(): bool @@ -228,11 +228,11 @@ public function skip_file($file_id) { public function offsetExists($offset): bool { - return isset($this->stdout->get_metadata()[$offset]); + return isset($this->output->get_metadata()[$offset]); } public function offsetGet($offset): mixed { - return $this->stdout->get_metadata()[$offset] ?? null; + return $this->output->get_metadata()[$offset] ?? null; } public function offsetSet($offset, $value): void { @@ -253,16 +253,16 @@ protected function do_tick($tick_context): bool return true; } - if (!$this->stdin->read()) { - if ($this->stdin->is_eof()) { + if (!$this->input->read()) { + if ($this->input->is_eof()) { $this->kill(0); } return false; } $this->write( - $this->stdin->consume_bytes(), - $this->stdin->get_metadata(), + $this->input->consume_bytes(), + $this->input->get_metadata(), $tick_context ); @@ -297,7 +297,7 @@ protected function read(): bool return false; } - $this->stdout->write($transformed, $this->metadata); + $this->output->write($transformed, $this->metadata); return true; } @@ -600,7 +600,7 @@ protected function write($next_chunk, $metadata, $tick_context) { } $subprocess = $this->subprocesses[$chunk_key]; - $subprocess->stdin->write($next_chunk, $metadata); + $subprocess->input->write($next_chunk, $metadata); $this->last_subprocess = $subprocess; } @@ -615,19 +615,19 @@ protected function read(): bool return false; } - if ($subprocess->stdout->read()) { - $output = $subprocess->stdout->consume_bytes(); + if ($subprocess->output->read()) { + $output = $subprocess->output->consume_bytes(); $chunk_metadata = array_merge( [$this->key => $this->last_input_key], - 
$subprocess->stdout->get_metadata() ?? [], + $subprocess->output->get_metadata() ?? [], ); - $this->stdout->write($output, $chunk_metadata); + $this->output->write($output, $chunk_metadata); return true; } if (!$subprocess->is_alive()) { if ($subprocess->has_crashed()) { - $this->stderr->write( + $this->errors->write( "Subprocess $this->last_input_key has crashed with code {$subprocess->exit_code}", [ 'type' => 'crash', @@ -688,7 +688,7 @@ protected function read(): bool if ($this->last_skipped_file === $file_path) { break; } - $this->stdout->write($this->reader->get_file_body_chunk(), [ + $this->output->write($this->reader->get_file_body_chunk(), [ 'file_id' => $file_path, // Use a separate sequence for each file so the next // process may separate the files. @@ -711,8 +711,8 @@ class ProcessChain extends Process implements Iterator { private $execution_stack = []; private $tick_context = []; - public function __construct($process_factories, $stdin=null, $stdout=null, $stderr=null) { - parent::__construct($stdin, $stdout, $stderr); + public function __construct($process_factories, $input=null, $output=null, $errors=null) { + parent::__construct($input, $output, $errors); $last_process = null; $this->subprocesses_names = array_keys($process_factories); @@ -725,7 +725,7 @@ public function __construct($process_factories, $stdin=null, $stdout=null, $stde $factory = $processes[$i]; $subprocess = $factory(); if(null !== $last_process) { - $subprocess->stdin = $last_process->stdout; + $subprocess->input = $last_process->output; } $this->subprocesses[$this->subprocesses_names[$i]] = $subprocess; $last_process = $subprocess; @@ -760,23 +760,23 @@ public function __construct($process_factories, $stdin=null, $stdout=null, $stde * metadata and exposes methods like skip_file(). 
*/ protected function do_tick($tick_context): bool { - if($this->last_subprocess->stdout->is_eof()) { + if($this->last_subprocess->output->is_eof()) { $this->kill(0); return false; } while(true) { - if(true !== $this->stdin->read()) { + if(true !== $this->input->read()) { break; } - $this->first_subprocess->stdin->write( - $this->stdin->consume_bytes(), - $this->stdin->get_metadata() + $this->first_subprocess->input->write( + $this->input->consume_bytes(), + $this->input->get_metadata() ); } - if($this->stdin->is_eof()) { - $this->first_subprocess->stdin->close(); + if($this->input->is_eof()) { + $this->first_subprocess->input->close(); } if(empty($this->execution_stack)) { @@ -787,7 +787,7 @@ protected function do_tick($tick_context): bool { // Unpeel the context stack until we find a process that // produces output. $process = $this->pop_process(); - if ($process->stdout->is_eof()) { + if ($process->output->is_eof()) { continue; } @@ -808,12 +808,12 @@ protected function do_tick($tick_context): bool { } // When the last process in the chain produces output, - // we write it to the stdout pipe and bale. - if(true !== $this->last_subprocess->stdout->read()) { + // we write it to the output pipe and bale. + if(true !== $this->last_subprocess->output->read()) { break; } - $this->stdout->write( - $this->last_subprocess->stdout->consume_bytes(), + $this->output->write( + $this->last_subprocess->output->consume_bytes(), $this->tick_context ); ++$this->chunk_nb; @@ -855,11 +855,11 @@ private function tick_subprocess($process) private function handle_errors($process) { - while ($process->stderr->read()) { - $this->stderr->write($process->stderr->consume_bytes(), [ + while ($process->errors->read()) { + $this->errors->write($process->errors->consume_bytes(), [ 'type' => 'error', 'process' => $process, - ...($process->stderr->get_metadata() ?? []), + ...($process->errors->get_metadata() ?? 
[]), ]); } @@ -867,7 +867,7 @@ private function handle_errors($process) if (!$process->is_reaped()) { $process->reap(); $name = $this->subprocesses_names[array_search($process, $this->subprocesses)]; - $this->stderr->write("Process $name has crashed with code {$process->exit_code}", [ + $this->errors->write("Process $name has crashed with code {$process->exit_code}", [ 'type' => 'crash', 'process' => $process, 'reaped' => true, @@ -884,7 +884,7 @@ private function handle_errors($process) private $chunk_nb = -1; public function current(): mixed { if(null === $this->iterator_output_cache) { - $this->iterator_output_cache = $this->stdout->consume_bytes(); + $this->iterator_output_cache = $this->output->consume_bytes(); } return $this->iterator_output_cache; } @@ -938,7 +938,6 @@ class HttpClientProcess extends Process { private $requests = []; private $child_contexts = []; private $skipped_requests = []; - private $errors = []; static public function stream($requests) { return fn () => new HttpClientProcess($requests); @@ -958,14 +957,14 @@ protected function do_tick($tick_context): bool $output_sequence = 'request_' . $request->id; switch ($this->client->get_event()) { case Client::EVENT_BODY_CHUNK_AVAILABLE: - $this->stdout->write($this->client->get_response_body_chunk(), [ + $this->output->write($this->client->get_response_body_chunk(), [ 'sequence' => $output_sequence, 'request' => $request ]); return true; case Client::EVENT_FAILED: - $this->stderr->write('Request failed: ' . $request->error, [ + $this->errors->write('Request failed: ' . 
$request->error, [ 'request' => $request ]); break; @@ -1013,7 +1012,7 @@ protected function read(): bool if ( $this->xml_processor->get_last_error() ) { $this->kill(1); - $this->stderr->write( $this->xml_processor->get_last_error() ); + $this->errors->write( $this->xml_processor->get_last_error() ); return false; } @@ -1041,7 +1040,7 @@ protected function read(): bool return false; } - $this->stdout->write($buffer); + $this->output->write($buffer); return true; } @@ -1120,18 +1119,18 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { function log_process_chain_errors($process) { return; - if(!($process->stderr instanceof BufferPipe)) { + if(!($process->errors instanceof BufferPipe)) { return; } - $error = $process->stderr->read(); + $error = $process->errors->read(); if ($error) { echo 'Error: ' . $error . "\n"; - $meta = $process->stderr->get_metadata(); + $meta = $process->errors->get_metadata(); if ($meta['type'] ?? '' === 'crash') { - $child_error = $meta['process']->stderr->read(); + $child_error = $meta['process']->errors->read(); if ($child_error) { - echo 'CRASH: ' . $meta['process']->stderr->read() . "\n"; + echo 'CRASH: ' . $meta['process']->errors->read() . "\n"; } } } From ddab3807b67e8812763c9e3076a544ce736362b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 30 Jul 2024 14:11:54 +0200 Subject: [PATCH 47/72] Remove the concept of reaping processes --- pipes-unix.php | 31 ++++--------------------------- 1 file changed, 4 insertions(+), 27 deletions(-) diff --git a/pipes-unix.php b/pipes-unix.php index 0167d0f..7dfeb80 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -158,7 +158,6 @@ abstract class Process implements ArrayAccess { private ?int $exit_code = null; - private bool $is_reaped = false; protected Pipe $input; protected Pipe $output; protected Pipe $errors; @@ -170,13 +169,6 @@ public function __construct($input=null, $output=null, $errors=null) $this->errors = $errors ?? 
new BufferPipe(); } - public function run() - { - while ($this->is_alive()) { - $this->tick(); - } - } - public function tick($tick_context=null): bool { if(!$this->is_alive()) { return false; @@ -192,21 +184,7 @@ public function kill($code) { $this->input->close(); $this->output->close(); $this->errors->close(); - } - - public function reap(): bool - { - if($this->is_alive()) { - return false; - } - $this->is_reaped = true; $this->cleanup(); - return true; - } - - public function is_reaped(): bool - { - return $this->is_reaped; } public function has_crashed(): bool { @@ -707,7 +685,7 @@ class ProcessChain extends Process implements Iterator { private $last_subprocess; public $subprocesses = []; public $subprocesses_names = []; - private $reaped_pids = []; + private $reaped_subprocesses = []; private $execution_stack = []; private $tick_context = []; @@ -864,14 +842,13 @@ private function handle_errors($process) } if($process->has_crashed()) { - if (!$process->is_reaped()) { - $process->reap(); - $name = $this->subprocesses_names[array_search($process, $this->subprocesses)]; + $name = $this->subprocesses_names[array_search($process, $this->subprocesses)]; + if(!isset($this->reaped_subprocesses[$name])) { $this->errors->write("Process $name has crashed with code {$process->exit_code}", [ 'type' => 'crash', 'process' => $process, - 'reaped' => true, ]); + $this->reaped_subprocesses[$name] = true; } } } From 312cbf1ddcffc41ae09cb737f1b4f364203a49ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 30 Jul 2024 14:30:48 +0200 Subject: [PATCH 48/72] Separate the crash() and finish() methods --- pipes-unix.php | 72 +++++++++++++++++++++++++++----------------------- 1 file changed, 39 insertions(+), 33 deletions(-) diff --git a/pipes-unix.php b/pipes-unix.php index 7dfeb80..c17de64 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -157,7 +157,13 @@ use WordPress\AsyncHttp\Request; abstract class Process implements ArrayAccess { - private ?int 
$exit_code = null; + + const STATE_STREAMING = '#streaming'; + const STATE_FINISHED = '#finished'; + const STATE_CRASHED = '#crashed'; + + private string $state = self::STATE_STREAMING; + protected Pipe $input; protected Pipe $output; protected Pipe $errors; @@ -179,32 +185,35 @@ public function tick($tick_context=null): bool { abstract protected function do_tick($tick_context): bool; - public function kill($code) { - $this->exit_code = $code; - $this->input->close(); - $this->output->close(); - $this->errors->close(); + protected function crash( $error_message = null ) + { + if($error_message) { + $this->errors->write( $error_message ); + } + $this->state = self::STATE_CRASHED; $this->cleanup(); } - public function has_crashed(): bool { - return $this->exit_code !== null && $this->exit_code !== 0; + protected function finish() { + $this->state = self::STATE_FINISHED; + $this->cleanup(); } - public function is_alive(): bool { - return $this->exit_code === null; + protected function cleanup() + { + $this->input->close(); + $this->output->close(); + $this->errors->close(); } - protected function cleanup() { - // clean up resources + public function has_crashed(): bool { + return $this->state === self::STATE_CRASHED; } - public function skip_file($file_id) { - // Needs to be implemented by subclasses - return false; + public function is_alive(): bool { + return $this->state === self::STATE_STREAMING; } - public function offsetExists($offset): bool { return isset($this->output->get_metadata()[$offset]); } @@ -233,7 +242,7 @@ protected function do_tick($tick_context): bool if (!$this->input->read()) { if ($this->input->is_eof()) { - $this->kill(0); + $this->finish(); } return false; } @@ -606,7 +615,7 @@ protected function read(): bool if (!$subprocess->is_alive()) { if ($subprocess->has_crashed()) { $this->errors->write( - "Subprocess $this->last_input_key has crashed with code {$subprocess->exit_code}", + "Subprocess $this->last_input_key has crashed", [ 'type' => 
'crash', 'process' => $subprocess, @@ -739,7 +748,7 @@ public function __construct($process_factories, $input=null, $output=null, $erro */ protected function do_tick($tick_context): bool { if($this->last_subprocess->output->is_eof()) { - $this->kill(0); + $this->finish(); return false; } @@ -801,7 +810,7 @@ protected function do_tick($tick_context): bool { // We produced no output and the upstream pipe is EOF. // We're done. if(!$this->first_subprocess->is_alive()) { - $this->kill(0); + $this->finish(); } return false; @@ -833,23 +842,21 @@ private function tick_subprocess($process) private function handle_errors($process) { - while ($process->errors->read()) { - $this->errors->write($process->errors->consume_bytes(), [ - 'type' => 'error', - 'process' => $process, - ...($process->errors->get_metadata() ?? []), - ]); - } - if($process->has_crashed()) { $name = $this->subprocesses_names[array_search($process, $this->subprocesses)]; if(!isset($this->reaped_subprocesses[$name])) { - $this->errors->write("Process $name has crashed with code {$process->exit_code}", [ + $this->errors->write("Process $name has crashed", [ 'type' => 'crash', 'process' => $process, ]); $this->reaped_subprocesses[$name] = true; } + } else if ($process->errors->read()) { + $this->errors->write($process->errors->consume_bytes(), [ + 'type' => 'error', + 'process' => $process, + ...($process->errors->get_metadata() ?? []), + ]); } } @@ -948,7 +955,7 @@ protected function do_tick($tick_context): bool } } - $this->kill(0); + $this->finish(); return false; } @@ -988,8 +995,7 @@ protected function read(): bool } if ( $this->xml_processor->get_last_error() ) { - $this->kill(1); - $this->errors->write( $this->xml_processor->get_last_error() ); + $this->crash( $this->xml_processor->get_last_error() ); return false; } @@ -1010,7 +1016,7 @@ protected function read(): bool ) { // We've reached the end of the document, let's finish up. 
$buffer .= $this->xml_processor->get_unprocessed_xml(); - $this->kill(0); + $this->finish(); } if(!strlen($buffer)) { From 342f927747f09f845e26ee7963a56c575050d180 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 30 Jul 2024 14:32:55 +0200 Subject: [PATCH 49/72] Rename Process to Stream --- pipes-unix.php | 67 ++++++++++++++++---------------------------------- 1 file changed, 21 insertions(+), 46 deletions(-) diff --git a/pipes-unix.php b/pipes-unix.php index c17de64..8c0f376 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -156,7 +156,7 @@ use WordPress\AsyncHttp\Client; use WordPress\AsyncHttp\Request; -abstract class Process implements ArrayAccess { +abstract class Stream implements ArrayAccess { const STATE_STREAMING = '#streaming'; const STATE_FINISHED = '#finished'; @@ -232,7 +232,7 @@ public function offsetUnset($offset): void { } -abstract class BufferProcessor extends Process +abstract class BufferStream extends Stream { protected function do_tick($tick_context): bool { @@ -260,7 +260,7 @@ abstract protected function write($input_chunk, $metadata, $tick_context); abstract protected function read(): bool; } -abstract class TransformProcess extends BufferProcessor { +abstract class TransformerStream extends BufferStream { protected $buffer; protected $metadata; @@ -530,17 +530,7 @@ public function close() { } } - -class Uppercaser extends TransformProcess { - static public function stream() { - return fn() => new static(); - } - protected function transform($data, $tick_context) { - return strtoupper($data); - } -} - -class CallbackProcess extends TransformProcess { +class CallbackStream extends TransformerStream { private $callback; static public function stream($callback) { @@ -558,7 +548,7 @@ protected function transform($data, $tick_context) { } } -class Demultiplexer extends BufferProcessor { +class Demultiplexer extends BufferStream { private $process_factory = []; public $subprocesses = []; private $killed_subprocesses = 
[]; @@ -638,13 +628,13 @@ public function skip_file($file_id) require __DIR__ . '/zip-stream-reader.php'; -class ZipReaderProcess extends BufferProcessor { +class ZipReader extends BufferStream { private $reader; private $last_skipped_file = null; static public function stream() { - return fn () => new Demultiplexer(fn() => new ZipReaderProcess()); + return fn () => new Demultiplexer(fn() => new ZipReader()); } protected function __construct() { @@ -689,7 +679,7 @@ protected function read(): bool } } -class ProcessChain extends Process implements Iterator { +class StreamChain extends Stream implements Iterator { private $first_subprocess; private $last_subprocess; public $subprocesses = []; @@ -917,14 +907,14 @@ public function offsetUnset($offset): void { } -class HttpClientProcess extends Process { +class HttpStream extends Stream { private $client; private $requests = []; private $child_contexts = []; private $skipped_requests = []; static public function stream($requests) { - return fn () => new HttpClientProcess($requests); + return fn () => new HttpStream($requests); } private function __construct( $requests ) { @@ -962,13 +952,13 @@ protected function do_tick($tick_context): bool } -class XMLProcess extends BufferProcessor { +class XMLStream extends BufferStream { private $xml_processor; private $node_visitor_callback; static public function stream($node_visitor_callback) { return fn () => new Demultiplexer(fn () => - new XMLProcess($node_visitor_callback) + new XMLStream($node_visitor_callback) ); } @@ -1072,16 +1062,16 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { require __DIR__ . 
'/bootstrap.php'; -$process = new ProcessChain( +$process = new StreamChain( [ - HttpClientProcess::stream([ + HttpStream::stream([ new Request('http://127.0.0.1:9864/export.wxr.zip'), // Bad request, will fail: new Request('http://127.0.0.1:9865'), ]), - 'zip' => ZipReaderProcess::stream(), - CallbackProcess::stream(function ($data, $context, $process) { + 'zip' => ZipReader::stream(), + CallbackStream::stream(function ($data, $context, $process) { if ($context['zip']['file_id'] !== 'export.wxr') { $context['zip']->skip_file('export.wxr'); return null; @@ -1089,32 +1079,17 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { print_r($context['zip']->get_zip_reader()->get_header()); return $data; }), - 'xml' => XMLProcess::stream($rewrite_links_in_wxr_node), - Uppercaser::stream(), + 'xml' => XMLStream::stream($rewrite_links_in_wxr_node), + CallbackStream::stream(function ($data, $context, $process) { + return strtoupper($data); + }) ], null, null, new FilePipe('php://stderr', 'w') ); + foreach($process as $k => $chunk) { var_dump([$k => $chunk]); } -function log_process_chain_errors($process) { - return; - if(!($process->errors instanceof BufferPipe)) { - return; - } - - $error = $process->errors->read(); - if ($error) { - echo 'Error: ' . $error . "\n"; - $meta = $process->errors->get_metadata(); - if ($meta['type'] ?? '' === 'crash') { - $child_error = $meta['process']->errors->read(); - if ($child_error) { - echo 'CRASH: ' . $meta['process']->errors->read() . 
"\n"; - } - } - } -} From f925c4364631ffc39305e1ae84c13f5c9a36a1d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 30 Jul 2024 14:41:36 +0200 Subject: [PATCH 50/72] Inline the XML rewriting callback --- pipes-unix.php | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/pipes-unix.php b/pipes-unix.php index 8c0f376..3f3f4f1 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -1049,16 +1049,6 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { return false; }; -$rewrite_links_in_wxr_node = function (WP_XML_Processor $processor) { - if (is_wxr_content_node($processor)) { - $text = $processor->get_modifiable_text(); - $updated_text = 'Hey there, what\'s up?'; - if ($updated_text !== $text) { - $processor->set_modifiable_text($updated_text); - } - } -}; - require __DIR__ . '/bootstrap.php'; @@ -1069,7 +1059,6 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { // Bad request, will fail: new Request('http://127.0.0.1:9865'), ]), - 'zip' => ZipReader::stream(), CallbackStream::stream(function ($data, $context, $process) { if ($context['zip']['file_id'] !== 'export.wxr') { @@ -1079,7 +1068,15 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { print_r($context['zip']->get_zip_reader()->get_header()); return $data; }), - 'xml' => XMLStream::stream($rewrite_links_in_wxr_node), + 'xml' => XMLStream::stream(function (WP_XML_Processor $processor) { + if (is_wxr_content_node($processor)) { + $text = $processor->get_modifiable_text(); + $updated_text = 'Hey there, what\'s up?'; + if ($updated_text !== $text) { + $processor->set_modifiable_text($updated_text); + } + } + }), CallbackStream::stream(function ($data, $context, $process) { return strtoupper($data); }) From fb95720c46fefdaa622d0cc3ad8d84d351a9f075 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 30 Jul 2024 14:45:33 +0200 Subject: [PATCH 51/72] Make all public fields protected or private --- 
pipes-unix.php | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/pipes-unix.php b/pipes-unix.php index 3f3f4f1..4e96f2a 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -302,8 +302,8 @@ public function get_metadata(); } class BufferPipe implements Pipe { - public ?string $buffer = null; - public $metadata = null; + private ?string $buffer = null; + private $metadata = null; private bool $closed = false; public function __construct($buffer = null) @@ -354,7 +354,7 @@ public function close() { } class ResourcePipe implements Pipe { - public $resource; + private $resource; private bool $closed = false; private $bytes; @@ -550,7 +550,7 @@ protected function transform($data, $tick_context) { class Demultiplexer extends BufferStream { private $process_factory = []; - public $subprocesses = []; + private $subprocesses = []; private $killed_subprocesses = []; private $demux_queue = []; private $last_subprocess; @@ -682,8 +682,8 @@ protected function read(): bool class StreamChain extends Stream implements Iterator { private $first_subprocess; private $last_subprocess; - public $subprocesses = []; - public $subprocesses_names = []; + private $subprocesses = []; + private $subprocesses_names = []; private $reaped_subprocesses = []; private $execution_stack = []; private $tick_context = []; @@ -1087,6 +1087,9 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { ); foreach($process as $k => $chunk) { - var_dump([$k => $chunk]); + var_dump([ + $k => $chunk, + 'zip file_id' => $process['zip']['file_id'] + ]); } From 73369ba02657153d20d43dbc6fb9d66093822ede Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 30 Jul 2024 14:50:46 +0200 Subject: [PATCH 52/72] Finish renaming processes to streams --- pipes-unix.php | 118 ++++++++++++++++++++++++------------------------- 1 file changed, 59 insertions(+), 59 deletions(-) diff --git a/pipes-unix.php b/pipes-unix.php index 4e96f2a..30040e4 100644 --- 
a/pipes-unix.php +++ b/pipes-unix.php @@ -533,7 +533,7 @@ public function close() { class CallbackStream extends TransformerStream { private $callback; - static public function stream($callback) { + static public function factory($callback) { return fn () => new static($callback); } @@ -549,7 +549,7 @@ protected function transform($data, $tick_context) { } class Demultiplexer extends BufferStream { - private $process_factory = []; + private $stream_factory = []; private $subprocesses = []; private $killed_subprocesses = []; private $demux_queue = []; @@ -557,8 +557,8 @@ class Demultiplexer extends BufferStream { private $last_input_key; private $key; - public function __construct($process_factory, $key = 'sequence') { - $this->process_factory = $process_factory; + public function __construct($stream_factory, $key = 'sequence') { + $this->stream_factory = $stream_factory; $this->key = $key; parent::__construct(); } @@ -572,7 +572,7 @@ protected function write($next_chunk, $metadata, $tick_context) { $chunk_key = is_array($metadata) && !empty( $metadata[$this->key] ) ? 
$metadata[$this->key] : 'default'; $this->last_input_key = $chunk_key; if (!isset($this->subprocesses[$chunk_key])) { - $factory = $this->process_factory; + $factory = $this->stream_factory; $this->subprocesses[$chunk_key] = $factory(); } @@ -633,7 +633,7 @@ class ZipReader extends BufferStream { private $reader; private $last_skipped_file = null; - static public function stream() { + static public function factory() { return fn () => new Demultiplexer(fn() => new ZipReader()); } @@ -682,34 +682,34 @@ protected function read(): bool class StreamChain extends Stream implements Iterator { private $first_subprocess; private $last_subprocess; - private $subprocesses = []; - private $subprocesses_names = []; - private $reaped_subprocesses = []; + private $streams = []; + private $streams_names = []; + private $finished_streams = []; private $execution_stack = []; private $tick_context = []; - public function __construct($process_factories, $input=null, $output=null, $errors=null) { + public function __construct($streams_factories, $input=null, $output=null, $errors=null) { parent::__construct($input, $output, $errors); $last_process = null; - $this->subprocesses_names = array_keys($process_factories); - foreach($this->subprocesses_names as $k => $name) { - $this->subprocesses_names[$k] = $name . ''; + $this->streams_names = array_keys($streams_factories); + foreach($this->streams_names as $k => $name) { + $this->streams_names[$k] = $name . 
''; } - $processes = array_values($process_factories); - for($i = 0; $i < count($process_factories); $i++) { - $factory = $processes[$i]; + $streams = array_values($streams_factories); + for($i = 0; $i < count($streams_factories); $i++) { + $factory = $streams[$i]; $subprocess = $factory(); if(null !== $last_process) { $subprocess->input = $last_process->output; } - $this->subprocesses[$this->subprocesses_names[$i]] = $subprocess; + $this->streams[$this->streams_names[$i]] = $subprocess; $last_process = $subprocess; } - $this->first_subprocess = $this->subprocesses[$this->subprocesses_names[0]]; - $this->last_subprocess = $this->subprocesses[$this->subprocesses_names[count($process_factories) - 1]]; + $this->first_subprocess = $this->streams[$this->streams_names[0]]; + $this->last_subprocess = $this->streams[$this->streams_names[count($streams_factories) - 1]]; } /** @@ -761,27 +761,27 @@ protected function do_tick($tick_context): bool { } while (count($this->execution_stack)) { - // Unpeel the context stack until we find a process that + // Unpeel the context stack until we find a stream that // produces output. - $process = $this->pop_process(); - if ($process->output->is_eof()) { + $stream = $this->pop_stream(); + if ($stream->output->is_eof()) { continue; } - if(true !== $this->tick_subprocess($process)) { + if(true !== $this->tick_stream($stream)) { continue; } // We've got output from the process, yay! Let's // propagate it downstream. 
- $this->push_process($process); + $this->push_stream($stream); - for ($i = count($this->execution_stack); $i < count($this->subprocesses_names); $i++) { - $next_process = $this->subprocesses[$this->subprocesses_names[$i]]; - if (true !== $this->tick_subprocess($next_process)) { + for ($i = count($this->execution_stack); $i < count($this->streams_names); $i++) { + $next_process = $this->streams[$this->streams_names[$i]]; + if (true !== $this->tick_stream($next_process)) { break; } - $this->push_process($next_process); + $this->push_stream($next_process); } // When the last process in the chain produces output, @@ -806,46 +806,46 @@ protected function do_tick($tick_context): bool { return false; } - private function pop_process() + private function pop_stream() { - $name = $this->subprocesses_names[count($this->execution_stack) - 1]; + $name = $this->streams_names[count($this->execution_stack) - 1]; unset($this->tick_context[$name]); return array_pop($this->execution_stack); } - private function push_process($process) + private function push_stream($stream) { - array_push($this->execution_stack, $process); - $name = $this->subprocesses_names[count($this->execution_stack) - 1]; - if($process instanceof Demultiplexer) { - $process = $process->get_subprocess(); + array_push($this->execution_stack, $stream); + $name = $this->streams_names[count($this->execution_stack) - 1]; + if($stream instanceof Demultiplexer) { + $stream = $stream->get_subprocess(); } - $this->tick_context[$name] = $process; + $this->tick_context[$name] = $stream; } - private function tick_subprocess($process) + private function tick_stream($stream) { - $produced_output = $process->tick($this->tick_context); - $this->handle_errors($process); + $produced_output = $stream->tick($this->tick_context); + $this->handle_errors($stream); return $produced_output; } - private function handle_errors($process) + private function handle_errors($stream) { - if($process->has_crashed()) { - $name = 
$this->subprocesses_names[array_search($process, $this->subprocesses)]; - if(!isset($this->reaped_subprocesses[$name])) { + if($stream->has_crashed()) { + $name = $this->streams_names[array_search($stream, $this->streams)]; + if(!isset($this->finished_streams[$name])) { $this->errors->write("Process $name has crashed", [ 'type' => 'crash', - 'process' => $process, + 'process' => $stream, ]); - $this->reaped_subprocesses[$name] = true; + $this->finished_streams[$name] = true; } - } else if ($process->errors->read()) { - $this->errors->write($process->errors->consume_bytes(), [ + } else if ($stream->errors->read()) { + $this->errors->write($stream->errors->consume_bytes(), [ 'type' => 'error', - 'process' => $process, - ...($process->errors->get_metadata() ?? []), + 'process' => $stream, + ...($stream->errors->get_metadata() ?? []), ]); } } @@ -913,7 +913,7 @@ class HttpStream extends Stream { private $child_contexts = []; private $skipped_requests = []; - static public function stream($requests) { + static public function factory($requests) { return fn () => new HttpStream($requests); } @@ -956,7 +956,7 @@ class XMLStream extends BufferStream { private $xml_processor; private $node_visitor_callback; - static public function stream($node_visitor_callback) { + static public function factory($node_visitor_callback) { return fn () => new Demultiplexer(fn () => new XMLStream($node_visitor_callback) ); @@ -1052,15 +1052,15 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { require __DIR__ . 
'/bootstrap.php'; -$process = new StreamChain( +$stream = new StreamChain( [ - HttpStream::stream([ + HttpStream::factory([ new Request('http://127.0.0.1:9864/export.wxr.zip'), // Bad request, will fail: new Request('http://127.0.0.1:9865'), ]), - 'zip' => ZipReader::stream(), - CallbackStream::stream(function ($data, $context, $process) { + 'zip' => ZipReader::factory(), + CallbackStream::factory(function ($data, $context) { if ($context['zip']['file_id'] !== 'export.wxr') { $context['zip']->skip_file('export.wxr'); return null; @@ -1068,7 +1068,7 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { print_r($context['zip']->get_zip_reader()->get_header()); return $data; }), - 'xml' => XMLStream::stream(function (WP_XML_Processor $processor) { + 'xml' => XMLStream::factory(function (WP_XML_Processor $processor) { if (is_wxr_content_node($processor)) { $text = $processor->get_modifiable_text(); $updated_text = 'Hey there, what\'s up?'; @@ -1077,7 +1077,7 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { } } }), - CallbackStream::stream(function ($data, $context, $process) { + CallbackStream::factory(function ($data, $context) { return strtoupper($data); }) ], @@ -1086,10 +1086,10 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { new FilePipe('php://stderr', 'w') ); -foreach($process as $k => $chunk) { +foreach($stream as $k => $chunk) { var_dump([ $k => $chunk, - 'zip file_id' => $process['zip']['file_id'] + 'zip file_id' => $stream['zip']['file_id'] ]); } From 04b2e4c3852adf1fdcb79e14bb247ff65f5b389a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 30 Jul 2024 14:54:17 +0200 Subject: [PATCH 53/72] Replace stream factory method with a static create() method --- pipes-unix.php | 61 +++++++++++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/pipes-unix.php b/pipes-unix.php index 30040e4..be94e8b 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ 
-533,8 +533,8 @@ public function close() { class CallbackStream extends TransformerStream { private $callback; - static public function factory($callback) { - return fn () => new static($callback); + static public function create($callback) { + return new static($callback); } private function __construct($callback) { @@ -549,7 +549,7 @@ protected function transform($data, $tick_context) { } class Demultiplexer extends BufferStream { - private $stream_factory = []; + private $stream_create = []; private $subprocesses = []; private $killed_subprocesses = []; private $demux_queue = []; @@ -557,8 +557,8 @@ class Demultiplexer extends BufferStream { private $last_input_key; private $key; - public function __construct($stream_factory, $key = 'sequence') { - $this->stream_factory = $stream_factory; + public function __construct($stream_create, $key = 'sequence') { + $this->stream_create = $stream_create; $this->key = $key; parent::__construct(); } @@ -572,8 +572,8 @@ protected function write($next_chunk, $metadata, $tick_context) { $chunk_key = is_array($metadata) && !empty( $metadata[$this->key] ) ? $metadata[$this->key] : 'default'; $this->last_input_key = $chunk_key; if (!isset($this->subprocesses[$chunk_key])) { - $factory = $this->stream_factory; - $this->subprocesses[$chunk_key] = $factory(); + $create = $this->stream_create; + $this->subprocesses[$chunk_key] = $create(); } $subprocess = $this->subprocesses[$chunk_key]; @@ -628,13 +628,13 @@ public function skip_file($file_id) require __DIR__ . 
'/zip-stream-reader.php'; -class ZipReader extends BufferStream { +class ZipReaderStream extends BufferStream { private $reader; private $last_skipped_file = null; - static public function factory() { - return fn () => new Demultiplexer(fn() => new ZipReader()); + static public function create() { + return new Demultiplexer(fn() => new ZipReaderStream()); } protected function __construct() { @@ -688,28 +688,27 @@ class StreamChain extends Stream implements Iterator { private $execution_stack = []; private $tick_context = []; - public function __construct($streams_factories, $input=null, $output=null, $errors=null) { + public function __construct($streams, $input=null, $output=null, $errors=null) { parent::__construct($input, $output, $errors); $last_process = null; - $this->streams_names = array_keys($streams_factories); + $this->streams_names = array_keys($streams); foreach($this->streams_names as $k => $name) { $this->streams_names[$k] = $name . ''; } - $streams = array_values($streams_factories); - for($i = 0; $i < count($streams_factories); $i++) { - $factory = $streams[$i]; - $subprocess = $factory(); + $streams = array_values($streams); + for($i = 0; $i < count($streams); $i++) { + $stream = $streams[$i]; if(null !== $last_process) { - $subprocess->input = $last_process->output; + $stream->input = $last_process->output; } - $this->streams[$this->streams_names[$i]] = $subprocess; - $last_process = $subprocess; + $this->streams[$this->streams_names[$i]] = $stream; + $last_process = $stream; } $this->first_subprocess = $this->streams[$this->streams_names[0]]; - $this->last_subprocess = $this->streams[$this->streams_names[count($streams_factories) - 1]]; + $this->last_subprocess = $this->streams[$this->streams_names[count($streams) - 1]]; } /** @@ -913,8 +912,8 @@ class HttpStream extends Stream { private $child_contexts = []; private $skipped_requests = []; - static public function factory($requests) { - return fn () => new HttpStream($requests); + static public 
function create($requests) { + return new HttpStream($requests); } private function __construct( $requests ) { @@ -952,13 +951,13 @@ protected function do_tick($tick_context): bool } -class XMLStream extends BufferStream { +class XMLTransformStream extends BufferStream { private $xml_processor; private $node_visitor_callback; - static public function factory($node_visitor_callback) { - return fn () => new Demultiplexer(fn () => - new XMLStream($node_visitor_callback) + static public function create($node_visitor_callback) { + return new Demultiplexer(fn () => + new XMLTransformStream($node_visitor_callback) ); } @@ -1054,13 +1053,13 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { $stream = new StreamChain( [ - HttpStream::factory([ + HttpStream::create([ new Request('http://127.0.0.1:9864/export.wxr.zip'), // Bad request, will fail: new Request('http://127.0.0.1:9865'), ]), - 'zip' => ZipReader::factory(), - CallbackStream::factory(function ($data, $context) { + 'zip' => ZipReaderStream::create(), + CallbackStream::create(function ($data, $context) { if ($context['zip']['file_id'] !== 'export.wxr') { $context['zip']->skip_file('export.wxr'); return null; @@ -1068,7 +1067,7 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { print_r($context['zip']->get_zip_reader()->get_header()); return $data; }), - 'xml' => XMLStream::factory(function (WP_XML_Processor $processor) { + 'xml' => XMLTransformStream::create(function (WP_XML_Processor $processor) { if (is_wxr_content_node($processor)) { $text = $processor->get_modifiable_text(); $updated_text = 'Hey there, what\'s up?'; @@ -1077,7 +1076,7 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { } } }), - CallbackStream::factory(function ($data, $context) { + CallbackStream::create(function ($data, $context) { return strtoupper($data); }) ], From 8bc278ca58f25d27955721ddc6e64442f39e5832 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 30 Jul 2024 14:57:45 
+0200 Subject: [PATCH 54/72] Rename some more process variables and methods to stream taxonomy --- pipes-unix.php | 88 +++++++++++++++++++++++++------------------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/pipes-unix.php b/pipes-unix.php index be94e8b..163cc83 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -550,10 +550,10 @@ protected function transform($data, $tick_context) { class Demultiplexer extends BufferStream { private $stream_create = []; - private $subprocesses = []; - private $killed_subprocesses = []; + private $streams = []; + private $killed_streams = []; private $demux_queue = []; - private $last_subprocess; + private $last_stream; private $last_input_key; private $key; @@ -563,52 +563,52 @@ public function __construct($stream_create, $key = 'sequence') { parent::__construct(); } - public function get_subprocess() + public function get_substream() { - return $this->last_subprocess; + return $this->last_stream; } protected function write($next_chunk, $metadata, $tick_context) { $chunk_key = is_array($metadata) && !empty( $metadata[$this->key] ) ? 
$metadata[$this->key] : 'default'; $this->last_input_key = $chunk_key; - if (!isset($this->subprocesses[$chunk_key])) { + if (!isset($this->streams[$chunk_key])) { $create = $this->stream_create; - $this->subprocesses[$chunk_key] = $create(); + $this->streams[$chunk_key] = $create(); } - $subprocess = $this->subprocesses[$chunk_key]; - $subprocess->input->write($next_chunk, $metadata); - $this->last_subprocess = $subprocess; + $stream = $this->streams[$chunk_key]; + $stream->input->write($next_chunk, $metadata); + $this->last_stream = $stream; } protected function read(): bool { - $subprocess = $this->last_subprocess; - if(!$subprocess) { + $stream = $this->last_stream; + if(!$stream) { return false; } - if(!$subprocess->tick()) { + if(!$stream->tick()) { return false; } - if ($subprocess->output->read()) { - $output = $subprocess->output->consume_bytes(); + if ($stream->output->read()) { + $output = $stream->output->consume_bytes(); $chunk_metadata = array_merge( [$this->key => $this->last_input_key], - $subprocess->output->get_metadata() ?? [], + $stream->output->get_metadata() ?? 
[], ); $this->output->write($output, $chunk_metadata); return true; } - if (!$subprocess->is_alive()) { - if ($subprocess->has_crashed()) { + if (!$stream->is_alive()) { + if ($stream->has_crashed()) { $this->errors->write( "Subprocess $this->last_input_key has crashed", [ 'type' => 'crash', - 'process' => $subprocess, + 'stream' => $stream, ] ); } @@ -619,10 +619,10 @@ protected function read(): bool public function skip_file($file_id) { - if(!$this->last_subprocess) { + if(!$this->last_stream) { return false; } - return $this->last_subprocess->skip_file($file_id); + return $this->last_stream->skip_file($file_id); } } @@ -680,8 +680,8 @@ protected function read(): bool } class StreamChain extends Stream implements Iterator { - private $first_subprocess; - private $last_subprocess; + private $first_stream; + private $last_stream; private $streams = []; private $streams_names = []; private $finished_streams = []; @@ -691,7 +691,7 @@ class StreamChain extends Stream implements Iterator { public function __construct($streams, $input=null, $output=null, $errors=null) { parent::__construct($input, $output, $errors); - $last_process = null; + $last_stream = null; $this->streams_names = array_keys($streams); foreach($this->streams_names as $k => $name) { $this->streams_names[$k] = $name . 
''; @@ -700,21 +700,21 @@ public function __construct($streams, $input=null, $output=null, $errors=null) { $streams = array_values($streams); for($i = 0; $i < count($streams); $i++) { $stream = $streams[$i]; - if(null !== $last_process) { - $stream->input = $last_process->output; + if(null !== $last_stream) { + $stream->input = $last_stream->output; } $this->streams[$this->streams_names[$i]] = $stream; - $last_process = $stream; + $last_stream = $stream; } - $this->first_subprocess = $this->streams[$this->streams_names[0]]; - $this->last_subprocess = $this->streams[$this->streams_names[count($streams) - 1]]; + $this->first_stream = $this->streams[$this->streams_names[0]]; + $this->last_stream = $this->streams[$this->streams_names[count($streams) - 1]]; } /** * ## Process chain tick * - * Pushes data through a chain of subprocesses. Every downstream data chunk + * Pushes data through a chain of streams. Every downstream data chunk * is fully processed before asking for more chunks upstream. * * For example, suppose we: @@ -736,7 +736,7 @@ public function __construct($streams, $input=null, $output=null, $errors=null) { * metadata and exposes methods like skip_file(). 
*/ protected function do_tick($tick_context): bool { - if($this->last_subprocess->output->is_eof()) { + if($this->last_stream->output->is_eof()) { $this->finish(); return false; } @@ -745,18 +745,18 @@ protected function do_tick($tick_context): bool { if(true !== $this->input->read()) { break; } - $this->first_subprocess->input->write( + $this->first_stream->input->write( $this->input->consume_bytes(), $this->input->get_metadata() ); } if($this->input->is_eof()) { - $this->first_subprocess->input->close(); + $this->first_stream->input->close(); } if(empty($this->execution_stack)) { - array_push($this->execution_stack, $this->first_subprocess); + array_push($this->execution_stack, $this->first_stream); } while (count($this->execution_stack)) { @@ -771,25 +771,25 @@ protected function do_tick($tick_context): bool { continue; } - // We've got output from the process, yay! Let's + // We've got output from the stream, yay! Let's // propagate it downstream. $this->push_stream($stream); for ($i = count($this->execution_stack); $i < count($this->streams_names); $i++) { - $next_process = $this->streams[$this->streams_names[$i]]; - if (true !== $this->tick_stream($next_process)) { + $next_stream = $this->streams[$this->streams_names[$i]]; + if (true !== $this->tick_stream($next_stream)) { break; } - $this->push_stream($next_process); + $this->push_stream($next_stream); } // When the last process in the chain produces output, // we write it to the output pipe and bale. - if(true !== $this->last_subprocess->output->read()) { + if(true !== $this->last_stream->output->read()) { break; } $this->output->write( - $this->last_subprocess->output->consume_bytes(), + $this->last_stream->output->consume_bytes(), $this->tick_context ); ++$this->chunk_nb; @@ -798,7 +798,7 @@ protected function do_tick($tick_context): bool { // We produced no output and the upstream pipe is EOF. // We're done. 
- if(!$this->first_subprocess->is_alive()) { + if(!$this->first_stream->is_alive()) { $this->finish(); } @@ -817,7 +817,7 @@ private function push_stream($stream) array_push($this->execution_stack, $stream); $name = $this->streams_names[count($this->execution_stack) - 1]; if($stream instanceof Demultiplexer) { - $stream = $stream->get_subprocess(); + $stream = $stream->get_substream(); } $this->tick_context[$name] = $stream; } @@ -836,14 +836,14 @@ private function handle_errors($stream) if(!isset($this->finished_streams[$name])) { $this->errors->write("Process $name has crashed", [ 'type' => 'crash', - 'process' => $stream, + 'stream' => $stream, ]); $this->finished_streams[$name] = true; } } else if ($stream->errors->read()) { $this->errors->write($stream->errors->consume_bytes(), [ 'type' => 'error', - 'process' => $stream, + 'stream' => $stream, ...($stream->errors->get_metadata() ?? []), ]); } From 96491030a97db85d5641f395a69979526d6fd8d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 30 Jul 2024 17:56:18 +0200 Subject: [PATCH 55/72] Try a vastly simplified approach in pipes-unified.php --- IStreamProcessor.php | 8 + bootstrap.php | 1 + class-wp-xml-processor.php | 6 +- class-wp-xml-tag-processor.php | 17 +- pipes-unified.php | 671 +++++++++++++++++++++++++++++++++ pipes-unix.php | 275 ++++++-------- pipes.php | 2 +- zip-stream-reader.php | 20 +- 8 files changed, 829 insertions(+), 171 deletions(-) create mode 100644 IStreamProcessor.php create mode 100644 pipes-unified.php diff --git a/IStreamProcessor.php b/IStreamProcessor.php new file mode 100644 index 0000000..895edfe --- /dev/null +++ b/IStreamProcessor.php @@ -0,0 +1,8 @@ +next_token(); $processor->get_updated_xml(); - if ( $processor->paused_at_incomplete_token() ) { + if ( $processor->is_paused_at_incomplete_input() ) { fwrite( $output_stream, $processor->get_processed_xml() ); $next_chunk = fread( $input_stream, $buffer_size ); @@ -94,7 +94,7 @@ public static function 
stream_tokens( $input_stream, $output_stream, $buffer_siz * * @param string $next_chunk XML to append. */ - public function stream_append_xml( $next_chunk ) + public function append_bytes( string $next_chunk ) { $this->get_updated_xml(); diff --git a/class-wp-xml-tag-processor.php b/class-wp-xml-tag-processor.php index 45ea8f4..5cae887 100644 --- a/class-wp-xml-tag-processor.php +++ b/class-wp-xml-tag-processor.php @@ -870,16 +870,27 @@ protected function base_class_next_token() { * * $processor = new WP_XML_Tag_Processor( 'should_iterate_errors && $this->get_last_error()) { + return $this->get_last_error(); + } + return $this->get_bytes(); + } + + private $chunk_nb = -1; + public function key(): mixed { + return $this->chunk_nb; + } + + public function rewind(): void { + $this->next(); + } + + private $should_iterate_errors = false; + public function iterate_errors($should_iterate_errors) + { + $this->should_iterate_errors = $should_iterate_errors; + } + + public function next(): void { + ++$this->chunk_nb; + while(!$this->next_chunk()) { + if($this->should_iterate_errors && $this->get_last_error()) { + break; + } + if($this->is_output_eof()) { + break; + } + usleep(10000); + } + } + + public function valid(): bool { + return !$this->is_output_eof() || ($this->should_iterate_errors && $this->get_last_error()); + } + + + // ArrayAccess on ProcessChain exposes specific + // sub-processes by their names. + public function offsetExists($offset): bool { + return isset($this->tick_context[$offset]); + } + + public function offsetGet($offset): mixed { + return $this->tick_context[$offset] ?? 
null; + } + + public function offsetSet($offset, $value): void { + // No op + } + + public function offsetUnset($offset): void { + // No op + } + +} + + +class HttpStream extends ByteStream { + private $client; + private $requests = []; + private $child_contexts = []; + private $skipped_requests = []; + private $file_id; + private $request; + + static public function create($requests) { + return new HttpStream($requests); + } + + private function __construct( $requests ) { + $this->client = new Client(); + $this->client->enqueue( $requests ); + } + + public function get_file_id(): string|null + { + return $this->request ? 'request_' . $this->request->id : null; + } + + protected function tick(): bool + { + $this->request = null; + while($this->client->await_next_event()) { + $this->request = $this->client->get_request(); + switch ($this->client->get_event()) { + case Client::EVENT_BODY_CHUNK_AVAILABLE: + $this->set_output_bytes($this->client->get_response_body_chunk()); + return true; + + case Client::EVENT_FAILED: + $this->set_last_error('Request failed: ' . $this->request->error); + break; + } + } + + $this->finish(); + return false; + } + +} + + +function is_wxr_content_node( WP_XML_Processor $processor ) { + if ( ! in_array( 'item', $processor->get_breadcrumbs() ) ) { + return false; + } + if ( + ! in_array( 'excerpt:encoded', $processor->get_breadcrumbs() ) + && ! in_array( 'content:encoded', $processor->get_breadcrumbs() ) + && ! in_array( 'wp:attachment_url', $processor->get_breadcrumbs() ) + && ! in_array( 'guid', $processor->get_breadcrumbs() ) + && ! in_array( 'link', $processor->get_breadcrumbs() ) + && ! in_array( 'wp:comment_content', $processor->get_breadcrumbs() ) + // Meta values are not suppoerted yet. We'll need to support + // WordPress core options that may be saved as JSON, PHP Deserialization, and XML, + // and then provide extension points for plugins authors support + // their own options. 
+ // !in_array('wp:postmeta', $processor->get_breadcrumbs()) + ) { + return false; + } + + switch ( $processor->get_token_type() ) { + case '#text': + case '#cdata-section': + return true; + } + + return false; +}; + + +$chain = new StreamChain( + [ + 'http' => HttpStream::create([ + new Request('http://127.0.0.1:9864/export.wxr.zip'), + // Bad request, will fail: + new Request('http://127.0.0.1:9865'), + ]), + 'zip' => ZipReaderStream::create(), + CallbackStream::create(function ($data, $context) { + if ($context['zip']->get_file_id() !== 'export.wxr') { + $context['zip']->skip_file(); + return null; + } + // Print detailed information from the ZIP processor + // print_r($context['zip']->get_processor()->get_header()); + return $data; + }), + XMLTransformStream::create(function (WP_XML_Processor $processor) { + if (is_wxr_content_node($processor)) { + $text = $processor->get_modifiable_text(); + $updated_text = 'Hey there, what\'s up?'; + if ($updated_text !== $text) { + // @TODO: Fix stream updating XML + // $processor->set_modifiable_text($updated_text); + } + } + }), + CallbackStream::create(function ($data, $context) { + return strtoupper($data); + }) + ] +); + +// Consume the data like this: +// var_dump([$chain->next_chunk(), strlen($chain->get_bytes()), $chain->get_last_error()]); + +// Or like this: +// $chain->iterate_errors(true); +foreach($chain as $k => $chunk_or_error) { + $chunk_or_error && var_dump([ + $k => $chunk_or_error, + 'is_error' => !!$chain->get_last_error(), + 'zip file_id' => isset($chain['zip']) ? $chain['zip']->get_file_id() : null + ]); +} + diff --git a/pipes-unix.php b/pipes-unix.php index 163cc83..832341b 100644 --- a/pipes-unix.php +++ b/pipes-unix.php @@ -3,7 +3,42 @@ /** * @TODO: * - * * Find a naming scheme that doesn't suggest we're working with actual Unix processes and pipes. + * * ✅ Consider an interface for all streamable processor classes to make them chainable. + * ^ Here's some reasons not to do that: + * + * 1. 
The same processor may need multiple stream implementations. For example, + * an XML processor may be used to transform the document, so go from XML bytes to XML bytes, + * but it can also be used to extract HTML from CDATA, so go from XML bytes to HTML bytes. + * 2. Not all processor must output bytes, and for some processors, the metadata may be much more + * important than the actual output bytes. + * 3. Exposing a method like Processor::chain($processor_2) means we can call it multiple times, + * which means we either need to handle forking the stream or track the chaining state and + * handle errors in all ambiguous cases. + * 4. A method like Processor::next() would be ambiguous: + * * For HTML and XML processors it could mean "next token", "next tag", or "next bytes chunk" + * * For a ZIP processor it could mean "next file", "next ZIP entry", or "next file chunk" + * * ...etc... + * A set of processor-specific methods, such as next_token(), next_tag(), next_file() etc. seems + * like a more intuitive choice. + * + * However, it would still be useful to have *some* common interface there. Perhaps this could work: + * + * interface StreamProcessor { + * public function append_bytes($bytes): bool; + * public function is_finished(): bool; + * public function is_paused_at_incomplete_input(): bool; + * } + * + * I am on the fence about adding the following method: + * + * interface StreamProcessor { + * public function get_last_error(): ?string; + * } + * + * Keeping Processors separate from Stream implementations seems useful. This way we don't have to + * worry about stdin/stdout/stderr etc. and can focus on actual processing. The stream will figure + * out how to use the processor semantics to transform byte chunks. + * * ✅ Find a naming scheme that doesn't suggest we're working with actual Unix processes and pipes. * I only used it to make the development easier, I got confused with the other attempt in * `pipes.php` and this kept me on track. 
However, keeping these names will likely confuse others. * * ✅ Explore merging Pipes and Processes into a single concept after all. @@ -153,6 +188,8 @@ * for is_eof() on errors, but we still have to close that errors pipe. */ + require __DIR__ . '/bootstrap.php'; + use WordPress\AsyncHttp\Client; use WordPress\AsyncHttp\Request; @@ -260,6 +297,56 @@ abstract protected function write($input_chunk, $metadata, $tick_context); abstract protected function read(): bool; } + +abstract class ProcessorStream extends Stream +{ + + protected IStreamProcessor $processor; + + public function __construct($input = null, $output = null, $errors = null) + { + parent::__construct($input, $output, $errors); + $this->processor = $this->create_processor(); + } + + public function get_processor() + { + return $this->processor; + } + + abstract protected function create_processor(): IStreamProcessor; + + protected function do_tick($tick_context): bool + { + if(true === $this->next()) { + return true; + } + + if (!$this->input->read()) { + if ($this->input->is_eof()) { + $this->finish(); + } + return false; + } + + $this->processor->append_bytes($this->input->consume_bytes()); + + if($this->processor->is_paused_at_incomplete_input()) { + return false; + } + + if ($this->processor->get_last_error()) { + $this->crash($this->processor->get_last_error()); + return false; + } + + return $this->next(); + } + + abstract protected function next(): bool; + +} + abstract class TransformerStream extends BufferStream { protected $buffer; @@ -424,112 +511,6 @@ public function __construct($filename, $mode) { } } -/** - * This isn't used anymore. Yay! It could be just removed, - * but it looks useful so let's keep it around for a while. 
- */ -class MultiplexingPipe implements Pipe { - private $used = false; - private array $sequences = []; - private ?string $last_read_sequence = 'default'; - - public function __construct(array $pipes = []) - { - $this->sequences = $pipes; - } - - public function read(): ?bool { - if (empty($this->sequences)) { - return false; - } - - $sequences_to_check = $this->next_sequences(); - foreach($sequences_to_check as $sequence_name) { - if(!$this->sequences[$sequence_name]->read()) { - continue; - } - $this->last_read_sequence = $sequence_name; - return true; - } - - return null; - } - - public function consume_bytes() - { - if(!$this->last_read_sequence || !isset($this->sequences[$this->last_read_sequence])) { - return null; - } - return $this->sequences[$this->last_read_sequence]->consume_bytes(); - } - - public function get_metadata() { - if(!$this->last_read_sequence || !isset($this->sequences[$this->last_read_sequence])) { - return null; - } - return $this->sequences[$this->last_read_sequence]->get_metadata(); - } - - private function next_sequences() { - $sequences_queue = []; - $sequence_names = array_keys($this->sequences); - $last_read_sequence_index = array_search($this->last_read_sequence, $sequence_names); - if(false === $last_read_sequence_index) { - $last_read_sequence_index = 0; - } else if($last_read_sequence_index > count($sequence_names)) { - $last_read_sequence_index = count($sequence_names) - 1; - } - - $this->last_read_sequence = null; - for ($i = 1; $i <= count($sequence_names); $i++) { - $key_index = ($last_read_sequence_index + $i) % count($sequence_names); - $sequence_name = $sequence_names[$key_index]; - if($this->sequences[$sequence_name]->is_eof()) { - unset($this->sequences[$sequence_name]); - continue; - } - $this->last_read_sequence = $sequence_name; - $sequences_queue[] = $sequence_name; - } - return $sequences_queue; - } - - public function write(string $data, $metadata = null): bool { - $this->used = true; - $current_sequence = 
'default'; - - if(is_array($metadata) && isset($metadata['sequence'])) { - $current_sequence = $metadata['sequence']; - } - - if (!isset($this->sequences[$current_sequence])) { - $this->sequences[$current_sequence] = new BufferPipe(); - } - - $this->last_read_sequence = $current_sequence; - return $this->sequences[$current_sequence]->write($data, $metadata); - } - - public function is_eof(): bool { - if(!$this->used) { - return false; - } - foreach ($this->sequences as $pipe) { - if (!$pipe->is_eof()) { - return false; - } - } - return true; - } - - public function close() { - $this->used = true; - foreach ($this->sequences as $pipe) { - $pipe->close(); - } - } -} - class CallbackStream extends TransformerStream { private $callback; @@ -628,23 +609,21 @@ public function skip_file($file_id) require __DIR__ . '/zip-stream-reader.php'; -class ZipReaderStream extends BufferStream { +class ZipReaderStream extends ProcessorStream { - private $reader; + /** + * @var ZipStreamReader + */ + protected IStreamProcessor $processor; private $last_skipped_file = null; static public function create() { return new Demultiplexer(fn() => new ZipReaderStream()); } - protected function __construct() { - parent::__construct(); - $this->reader = new ZipStreamReader(''); - } - - public function get_zip_reader() + protected function create_processor(): IStreamProcessor { - return $this->reader; + return new ZipStreamReader(''); } public function skip_file($file_id) @@ -652,20 +631,16 @@ public function skip_file($file_id) $this->last_skipped_file = $file_id; } - protected function write($bytes, $metadata, $tick_context) { - $this->reader->append_bytes($bytes); - } - - protected function read(): bool + protected function next(): bool { - while ($this->reader->next()) { - switch ($this->reader->get_state()) { + while ($this->processor->next()) { + switch ($this->processor->get_state()) { case ZipStreamReader::STATE_FILE_ENTRY: - $file_path = $this->reader->get_file_path(); + $file_path = 
$this->processor->get_file_path(); if ($this->last_skipped_file === $file_path) { break; } - $this->output->write($this->reader->get_file_body_chunk(), [ + $this->output->write($this->processor->get_file_body_chunk(), [ 'file_id' => $file_path, // Use a separate sequence for each file so the next // process may separate the files. @@ -767,7 +742,7 @@ protected function do_tick($tick_context): bool { continue; } - if(true !== $this->tick_stream($stream)) { + if(true !== $this->stream_next($stream)) { continue; } @@ -777,7 +752,7 @@ protected function do_tick($tick_context): bool { for ($i = count($this->execution_stack); $i < count($this->streams_names); $i++) { $next_stream = $this->streams[$this->streams_names[$i]]; - if (true !== $this->tick_stream($next_stream)) { + if (true !== $this->stream_next($next_stream)) { break; } $this->push_stream($next_stream); @@ -822,7 +797,7 @@ private function push_stream($stream) $this->tick_context[$name] = $stream; } - private function tick_stream($stream) + private function stream_next($stream) { $produced_output = $stream->tick($this->tick_context); $this->handle_errors($stream); @@ -951,9 +926,12 @@ protected function do_tick($tick_context): bool } -class XMLTransformStream extends BufferStream { - private $xml_processor; +class XMLTransformStream extends ProcessorStream { private $node_visitor_callback; + /** + * @var WP_XML_Processor + */ + protected IStreamProcessor $processor; static public function create($node_visitor_callback) { return new Demultiplexer(fn () => @@ -962,49 +940,34 @@ static public function create($node_visitor_callback) { } private function __construct( $node_visitor_callback ) { - $this->xml_processor = new WP_XML_Processor( '', [], WP_XML_Processor::IN_PROLOG_CONTEXT ); $this->node_visitor_callback = $node_visitor_callback; parent::__construct(); } - public function get_xml_processor() - { - return $this->xml_processor; - } - - protected function write($bytes, $metadata, $tick_context) + protected 
function create_processor(): IStreamProcessor { - $this->xml_processor->stream_append_xml($bytes); + return new WP_XML_Processor( '', [], WP_XML_Processor::IN_PROLOG_CONTEXT ); } - protected function read(): bool + protected function next(): bool { - if($this->xml_processor->paused_at_incomplete_token()) { - return false; - } - - if ( $this->xml_processor->get_last_error() ) { - $this->crash( $this->xml_processor->get_last_error() ); - return false; - } - $tokens_found = 0; - while ( $this->xml_processor->next_token() ) { + while ( $this->processor->next_token() ) { ++ $tokens_found; $node_visitor_callback = $this->node_visitor_callback; - $node_visitor_callback( $this->xml_processor ); + $node_visitor_callback( $this->processor ); } $buffer = ''; if ( $tokens_found > 0 ) { - $buffer .= $this->xml_processor->get_updated_xml(); + $buffer .= $this->processor->get_updated_xml(); } else if ( $tokens_found === 0 && - ! $this->xml_processor->paused_at_incomplete_token() && - $this->xml_processor->get_current_depth() === 0 + ! $this->processor->is_paused_at_incomplete_input() && + $this->processor->get_current_depth() === 0 ) { // We've reached the end of the document, let's finish up. - $buffer .= $this->xml_processor->get_unprocessed_xml(); + $buffer .= $this->processor->get_unprocessed_xml(); $this->finish(); } @@ -1048,8 +1011,6 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { return false; }; -require __DIR__ . 
'/bootstrap.php'; - $stream = new StreamChain( [ @@ -1064,7 +1025,7 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { $context['zip']->skip_file('export.wxr'); return null; } - print_r($context['zip']->get_zip_reader()->get_header()); + print_r($context['zip']->get_processor()->get_header()); return $data; }), 'xml' => XMLTransformStream::create(function (WP_XML_Processor $processor) { diff --git a/pipes.php b/pipes.php index 423fd4a..f4c25af 100644 --- a/pipes.php +++ b/pipes.php @@ -229,7 +229,7 @@ public function __construct( $node_visitor_callback ) { } protected function doWrite( string $data, ?StreamedFileContext $context=null ): bool { - $this->xml_processor->stream_append_xml( $data ); + $this->xml_processor->append_bytes( $data ); return true; } diff --git a/zip-stream-reader.php b/zip-stream-reader.php index 893d088..60e6826 100644 --- a/zip-stream-reader.php +++ b/zip-stream-reader.php @@ -2,7 +2,7 @@ define('RUN_ZIP_SMOKE_TEST', false); -class ZipStreamReader { +class ZipStreamReader implements IStreamProcessor { const SIGNATURE_FILE = 0x04034b50; const SIGNATURE_CENTRAL_DIRECTORY = 0x02014b50; @@ -32,16 +32,22 @@ public function __construct($bytes='') { $this->zip = $bytes; } - public function append_bytes($bytes) + public function append_bytes(string $bytes) { $this->zip = substr($this->zip, $this->bytes_parsed_so_far) . 
$bytes; - $this->bytes_parsed_so_far = 0; + $this->bytes_parsed_so_far = 0; + $this->paused_incomplete_input = false; } - public function paused_at_incomplete_token() { + public function is_paused_at_incomplete_input(): bool { return $this->paused_incomplete_input; } + public function is_finished(): bool + { + return self::STATE_COMPLETE === $this->state || self::STATE_ERROR === $this->state; + } + public function get_state() { return $this->state; @@ -66,7 +72,7 @@ public function get_file_body_chunk() return $this->file_body_chunk; } - public function get_error_message() + public function get_last_error(): ?string { return $this->error_message; } @@ -337,14 +343,14 @@ private function consume_bytes($n) { } echo "\n"; } - if ($reader->paused_at_incomplete_token()) { + if ($reader->is_paused_at_incomplete_input()) { if (feof($fp)) { break; } $reader->append_bytes(fread($fp, 1024)); } if (ZipStreamReader::STATE_ERROR === $reader->get_state()) { - echo 'Error: ' . $reader->get_error_message() . "\n"; + echo 'Error: ' . $reader->get_last_error() . 
"\n"; break; } } From 2fed566e5b3a97c9716c808deb6ea1ac0651270a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 30 Jul 2024 17:58:05 +0200 Subject: [PATCH 56/72] Rename next_chunk() to next_bytes() --- pipes-unified.php | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pipes-unified.php b/pipes-unified.php index 4aa6619..f1e3ec0 100644 --- a/pipes-unified.php +++ b/pipes-unified.php @@ -18,7 +18,7 @@ interface IByteStream { const STATE_STREAMING = '#streaming'; const STATE_FINISHED = '#finished'; - public function next_chunk(): bool; + public function next_bytes(): bool; public function input_eof(); public function append_bytes(string $bytes, $context = null); public function is_output_eof(): bool; @@ -61,7 +61,7 @@ protected function consume_input_bytes() { return $bytes; } - public function next_chunk(): bool + public function next_bytes(): bool { $this->output_bytes = null; $this->last_error = null; @@ -273,7 +273,7 @@ protected function tick(): bool return false; } - if($stream->next_chunk()) { + if($stream->next_bytes()) { $this->set_output_bytes($stream->get_bytes()); return true; } @@ -470,7 +470,7 @@ private function push_stream(IByteStream $stream) private function stream_next(IByteStream $stream) { - $produced_output = $stream->next_chunk(); + $produced_output = $stream->next_bytes(); $this->handle_errors($stream); return $produced_output; } @@ -511,7 +511,7 @@ public function iterate_errors($should_iterate_errors) public function next(): void { ++$this->chunk_nb; - while(!$this->next_chunk()) { + while(!$this->next_bytes()) { if($this->should_iterate_errors && $this->get_last_error()) { break; } From c83c104ad2d6d25c929e180447406ecc0f78a6f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 30 Jul 2024 17:59:15 +0200 Subject: [PATCH 57/72] Add a get_bytes() interface method --- pipes-unified.php | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/pipes-unified.php b/pipes-unified.php index f1e3ec0..875455f 100644 --- a/pipes-unified.php +++ b/pipes-unified.php @@ -22,6 +22,7 @@ public function next_bytes(): bool; public function input_eof(); public function append_bytes(string $bytes, $context = null); public function is_output_eof(): bool; + public function get_bytes(): ?string; public function get_last_error(): ?string; public function get_file_id(): ?string; } @@ -94,7 +95,7 @@ protected function finish() $this->state = IByteStream::STATE_FINISHED; } - public function get_bytes() + public function get_bytes(): ?string { return $this->output_bytes; } From b4eced3636863f7733f2652267a7aae62f948846 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 30 Jul 2024 18:08:06 +0200 Subject: [PATCH 58/72] Separate IFilesStream --- pipes-unified.php | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/pipes-unified.php b/pipes-unified.php index 875455f..d76655b 100644 --- a/pipes-unified.php +++ b/pipes-unified.php @@ -24,7 +24,11 @@ public function append_bytes(string $bytes, $context = null); public function is_output_eof(): bool; public function get_bytes(): ?string; public function get_last_error(): ?string; +} + +interface IFilesStream extends IByteStream { public function get_file_id(): ?string; + public function skip_file(): void; } abstract class ByteStream implements IByteStream { @@ -33,6 +37,7 @@ abstract class ByteStream implements IByteStream { private ?string $output_bytes = null; protected string $state = IByteStream::STATE_STREAMING; private ?string $last_error = null; + private $last_skipped_file = null; protected $input_context = null; public function append_bytes(string $bytes, $context = null) { @@ -56,6 +61,11 @@ protected function set_last_error($error) { $this->last_error = $error; } + public function skip_file() + { + $this->last_skipped_file = $this->get_file_id(); + } + protected function consume_input_bytes() { $bytes 
= $this->input_bytes; $this->input_bytes = null; @@ -74,18 +84,19 @@ public function next_bytes(): bool return false; } - if(true === $this->tick()) { - return true; + // Process any remaining buffered input: + if($this->tick()) { + return $this->get_file_id() !== $this->last_skipped_file; } if (!$this->input_bytes) { - if($this->input_eof) { + if ($this->input_eof) { $this->finish(); } return false; } - return $this->tick(); + return $this->tick() && $this->get_file_id() !== $this->last_skipped_file; } abstract protected function tick(): bool; @@ -118,7 +129,6 @@ class ZipReaderStream extends ByteStream { * @var ZipStreamReader */ protected IStreamProcessor $processor; - private $last_skipped_file = null; private $file_id; static public function create() { @@ -130,11 +140,6 @@ public function __construct() $this->processor = new ZipStreamReader(''); } - public function skip_file() - { - $this->last_skipped_file = $this->file_id; - } - public function get_file_id(): string|null { return $this->file_id; @@ -157,9 +162,6 @@ protected function tick(): bool switch ($this->processor->get_state()) { case ZipStreamReader::STATE_FILE_ENTRY: $file_path = $this->processor->get_file_path(); - if ($this->last_skipped_file === $file_path) { - break; - } $this->file_id = $file_path; $this->set_output_bytes($this->processor->get_file_body_chunk()); return true; @@ -663,7 +665,7 @@ function is_wxr_content_node( WP_XML_Processor $processor ) { // Or like this: // $chain->iterate_errors(true); foreach($chain as $k => $chunk_or_error) { - $chunk_or_error && var_dump([ + var_dump([ $k => $chunk_or_error, 'is_error' => !!$chain->get_last_error(), 'zip file_id' => isset($chain['zip']) ? 
$chain['zip']->get_file_id() : null From c07da9b134330bc2de660da3d40e6809704f0689 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 30 Jul 2024 18:12:25 +0200 Subject: [PATCH 59/72] Group IByteSteram methods --- pipes-unified.php | 6 ++++-- zip-stream-reader.php | 4 ++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pipes-unified.php b/pipes-unified.php index d76655b..c41ee20 100644 --- a/pipes-unified.php +++ b/pipes-unified.php @@ -18,11 +18,13 @@ interface IByteStream { const STATE_STREAMING = '#streaming'; const STATE_FINISHED = '#finished'; - public function next_bytes(): bool; public function input_eof(); - public function append_bytes(string $bytes, $context = null); public function is_output_eof(): bool; + + public function next_bytes(): bool; + public function append_bytes(string $bytes, $context = null); public function get_bytes(): ?string; + public function get_last_error(): ?string; } diff --git a/zip-stream-reader.php b/zip-stream-reader.php index 60e6826..229ea5d 100644 --- a/zip-stream-reader.php +++ b/zip-stream-reader.php @@ -28,6 +28,10 @@ class ZipStreamReader implements IStreamProcessor { const STATE_COMPLETE = 'complete'; const STATE_ERROR = 'error'; + static public function stream() { + return new Demultiplexer(fn() => new ZipReaderStream()); + } + public function __construct($bytes='') { $this->zip = $bytes; } From 20f78eddb7e39115e199b935c57564be1f81a694 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 31 Jul 2024 01:24:58 +0200 Subject: [PATCH 60/72] Explore an approach based on creating a Stream class and passing a handler callback to it --- pipes-controller.php | 627 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 627 insertions(+) create mode 100644 pipes-controller.php diff --git a/pipes-controller.php b/pipes-controller.php new file mode 100644 index 0000000..d00ff30 --- /dev/null +++ b/pipes-controller.php @@ -0,0 +1,627 @@ +consume_input_bytes(); + 
if(!$bytes) { + return false; + } + $output = $callback($bytes, $controller->input_context); + if(null === $output) { + return false; + } + $controller->output_bytes = $output; + return true; + }); + } + + public function __construct($next_bytes_callback) { + $this->next_bytes_callback = $next_bytes_callback; + $this->controller = new ByteStreamController(); + } + + public function get_file_id() + { + return $this->controller->get_file_id(); + } + + public function skip_file() + { + $this->controller->skip_file(); + } + + public function append_bytes(string $bytes, $context = null) { + $this->controller->append_bytes($bytes, $context); + } + + public function get_bytes() + { + return $this->controller->output_bytes; + } + + public function get_last_error(): string|null + { + return $this->controller->get_last_error(); + } + + public function next_bytes() + { + $this->controller->output_bytes = null; + $this->controller->last_error = null; + if($this->controller->is_output_eof()) { + return false; + } + + // Process any remaining buffered input: + $calback = $this->next_bytes_callback; + if($calback($this->controller)) { + return ! $this->controller->is_skipped_file(); + } + + if (!$this->controller->input_bytes) { + if ($this->controller->input_eof) { + $this->finish(); + } + return false; + } + + return $calback($this->controller) && ! $this->controller->is_skipped_file(); + } + +} + +class ProcessorByteStream extends Byte_Stream +{ + public $processor; + + public function __construct($processor, $callback) + { + $this->processor = $processor; + parent::__construct($callback); + } +} + +/** + * This interface describes standalone streams, but it can also be + * used to describe a stream Processor like WP_XML_Processor. + * + * In this prototype there are no pipes, streams, and processors. There + * are only Byte Streams that can be chained together with the StreamChain + * class. 
+ */ +class ByteStreamController { + const STATE_STREAMING = '#streaming'; + const STATE_FINISHED = '#finished'; + + public bool $input_eof = false; + public ?string $input_bytes = null; + public ?string $output_bytes = null; + public string $state = self::STATE_STREAMING; + public ?string $last_error = null; + public $input_context = null; + + public $file_id; + public $last_skipped_file; + + public function append_bytes(string $bytes, $context = null) { + $this->input_bytes .= $bytes; + $this->input_context = $context; + } + + public function input_eof() { + $this->input_eof = true; + } + + public function is_output_eof(): bool { + return !$this->output_bytes && $this->state === self::STATE_FINISHED; + } + + public function get_last_error(): ?string { + return $this->last_error; + } + + public function set_last_error($error) { + $this->last_error = $error; + } + + public function consume_input_bytes() { + $bytes = $this->input_bytes; + $this->input_bytes = null; + return $bytes; + } + + public function get_file_id() + { + return $this->file_id ?? 'default'; + } + + public function is_skipped_file() + { + return $this->get_file_id() === $this->last_skipped_file; + } + + public function finish() + { + $this->state = self::STATE_FINISHED; + } + + public function skip_file(): void { + $this->last_skipped_file = $this->file_id; + } +} + +function is_wxr_content_node( WP_XML_Processor $processor ) { + if ( ! in_array( 'item', $processor->get_breadcrumbs() ) ) { + return false; + } + if ( + ! in_array( 'excerpt:encoded', $processor->get_breadcrumbs() ) + && ! in_array( 'content:encoded', $processor->get_breadcrumbs() ) + && ! in_array( 'wp:attachment_url', $processor->get_breadcrumbs() ) + && ! in_array( 'guid', $processor->get_breadcrumbs() ) + && ! in_array( 'link', $processor->get_breadcrumbs() ) + && ! in_array( 'wp:comment_content', $processor->get_breadcrumbs() ) + // Meta values are not suppoerted yet. 
We'll need to support + // WordPress core options that may be saved as JSON, PHP Deserialization, and XML, + // and then provide extension points for plugins authors support + // their own options. + // !in_array('wp:postmeta', $processor->get_breadcrumbs()) + ) { + return false; + } + + switch ( $processor->get_token_type() ) { + case '#text': + case '#cdata-section': + return true; + } + + return false; +}; + +class Demultiplexer extends Byte_Stream { + private $stream_factory = []; + private $streams = []; + private $last_stream; + private $last_input_key; + private $key; + + public function __construct($stream_factory) { + $this->stream_factory = $stream_factory; + parent::__construct([$this, 'tick']); + } + + public function get_substream() + { + return $this->last_stream; + } + + protected function tick(): bool + { + $stream = $this->last_stream; + if (!$stream) { + return false; + } + + if($stream->next_bytes()) { + $this->controller->file_id = $stream->controller->get_file_id(); + $this->controller->output_bytes = $stream->get_bytes(); + return true; + } + + if($stream->get_last_error()) { + $this->controller->set_last_error($stream->get_last_error()); + } + return false; + } + + public function append_bytes(string $data, $context = null): bool { + $chunk_key = 'default'; + if($context) { + $chunk_key = []; + foreach($context as $k=>$stream) { + $chunk_key[] = $stream->controller->get_file_id(); + } + $chunk_key = implode(':', $chunk_key); + } + + $this->last_input_key = $chunk_key; + if (!isset($this->streams[$chunk_key])) { + $create = $this->stream_factory; + $this->streams[$chunk_key] = $create(); + } + $stream = $this->streams[$chunk_key]; + $stream->controller->append_bytes($data, $context); + $this->last_stream = $stream; + return true; + } + + protected function finish() + { + $this->controller->finish(); + foreach($this->streams as $stream) { + $stream->controller->finish(); + } + } +} + +class StreamChain extends Byte_Stream implements 
ArrayAccess, Iterator { + private $first_stream; + private $last_stream; + /** + * @var Byte_Stream[] + */ + private $streams = []; + private $streams_names = []; + private $execution_stack = []; + private $tick_context = []; + + public function __construct($streams) { + $named_streams = []; + foreach($streams as $name => $stream) { + $string_name = is_numeric($name) ? 'stream_' . $name : $name; + $named_streams[$string_name] = $streams[$name]; + } + + $this->streams = $named_streams; + $this->streams_names = array_keys($this->streams); + $this->first_stream = $this->streams[$this->streams_names[0]]; + $this->last_stream = $this->streams[$this->streams_names[count($streams) - 1]]; + parent::__construct([$this, 'tick']); + } + + /** + * ## Process chain tick + * + * Pushes data through a chain of streams. Every downstream data chunk + * is fully processed before asking for more chunks upstream. + * + * For example, suppose we: + * + * * Send 3 HTTP requests, and each of them produces a ZIP file + * * Each ZIP file has 3 XML files inside + * * Each XML file is rewritten using the XML_Processor + * + * Once the HTTP client has produced the first ZIP file, we start processing it. + * The ZIP decoder may already have enough data to unzip three files, but we only + * produce the first chunk of the first file and pass it to the XML processor. + * Then we handle the second chunk of the first file, and so on, until the first + * file is fully processed. Only then we move to the second file. + * + * Then, once the ZIP decoder exhausted the data for the first ZIP file, we move + * to the second ZIP file, and so on. + * + * This way we can maintain a predictable $context variable that carries upstream + * metadata and exposes methods like skip_file(). 
+ */ + protected function tick(): bool { + if($this->last_stream->controller->is_output_eof()) { + $this->controller->finish(); + return false; + } + + while(true) { + $bytes = $this->controller->consume_input_bytes(); + if(null === $bytes || false === $bytes) { + break; + } + $this->first_stream->append_bytes( + $bytes + ); + } + + if($this->controller->is_output_eof()) { + $this->first_stream->controller->input_eof(); + } + + if(empty($this->execution_stack)) { + array_push($this->execution_stack, $this->first_stream); + } + + while (count($this->execution_stack)) { + // Unpeel the context stack until we find a stream that + // produces output. + $stream = $this->pop_stream(); + if ($stream->controller->is_output_eof()) { + continue; + } + + if(true !== $this->stream_next($stream)) { + continue; + } + + // We've got output from the stream, yay! Let's + // propagate it downstream. + $this->push_stream($stream); + + $prev_stream = $stream; + for ($i = count($this->execution_stack); $i < count($this->streams_names); $i++) { + $next_stream = $this->streams[$this->streams_names[$i]]; + if($prev_stream->controller->is_output_eof()) { + $next_stream->controller->input_eof(); + } + + $next_stream->append_bytes( + $prev_stream->controller->output_bytes, + $this->tick_context + ); + if (true !== $this->stream_next($next_stream)) { + return false; + } + $this->push_stream($next_stream); + $prev_stream = $next_stream; + } + + // When the last process in the chain produces output, + // we write it to the output pipe and bale. + if($this->last_stream->controller->is_output_eof()) { + $this->controller->finish(); + break; + } + $this->controller->file_id = $this->last_stream->controller->get_file_id(); + $this->controller->output_bytes = $this->last_stream->get_bytes(); + + ++$this->chunk_nb; + return true; + } + + // We produced no output and the upstream pipe is EOF. + // We're done. 
+ if($this->first_stream->controller->is_output_eof()) { + $this->finish(); + } + + return false; + } + + protected function finish() + { + $this->controller->finish(); + foreach($this->streams as $stream) { + $stream->controller->finish(); + } + } + + private function pop_stream(): Byte_Stream + { + $name = $this->streams_names[count($this->execution_stack) - 1]; + unset($this->tick_context[$name]); + return array_pop($this->execution_stack); + } + + private function push_stream(Byte_Stream $stream) + { + array_push($this->execution_stack, $stream); + $name = $this->streams_names[count($this->execution_stack) - 1]; + if($stream instanceof Demultiplexer) { + $stream = $stream->get_substream(); + } + $this->tick_context[$name] = $stream; + } + + private function stream_next(Byte_Stream $stream) + { + $produced_output = $stream->next_bytes(); + $this->handle_errors($stream); + return $produced_output; + } + + private function handle_errors(Byte_Stream $stream) + { + if($stream->controller->get_last_error()) { + $name = array_search($stream, $this->streams); + $this->controller->set_last_error("Process $name has crashed"); + } + } + + // Iterator methods. These don't make much sense on a regular + // process class because they cannot pull more input chunks from + // the top of the stream like ProcessChain can. 
+ + public function current(): mixed { + if($this->should_iterate_errors && $this->get_last_error()) { + return $this->get_last_error(); + } + return $this->get_bytes(); + } + + private $chunk_nb = -1; + public function key(): mixed { + return $this->chunk_nb; + } + + public function rewind(): void { + $this->next(); + } + + private $should_iterate_errors = false; + public function iterate_errors($should_iterate_errors) + { + $this->should_iterate_errors = $should_iterate_errors; + } + + public function next(): void { + ++$this->chunk_nb; + while(!$this->next_bytes()) { + if($this->should_iterate_errors && $this->controller->get_last_error()) { + break; + } + if($this->controller->is_output_eof()) { + break; + } + usleep(10000); + } + } + + public function valid(): bool { + return !$this->controller->is_output_eof() || ($this->should_iterate_errors && $this->controller->get_last_error()); + } + + + // ArrayAccess on ProcessChain exposes specific + // sub-processes by their names. + public function offsetExists($offset): bool { + return isset($this->tick_context[$offset]); + } + + public function offsetGet($offset): mixed { + return $this->tick_context[$offset] ?? 
null; + } + + public function offsetSet($offset, $value): void { + // No op + } + + public function offsetUnset($offset): void { + // No op + } + +} + + +// Imagine this method is implemented in the WP_XML_Processor +class XML_Processor +{ + static public function stream() + { + return new Demultiplexer(function () { + $xml_processor = new WP_XML_Processor('', [], WP_XML_Processor::IN_PROLOG_CONTEXT); + $node_visitor_callback = function () {}; + return new ProcessorByteStream($xml_processor, function (ByteStreamController $controller) use ($xml_processor, $node_visitor_callback) { + $new_bytes = $controller->consume_input_bytes(); + if (null !== $new_bytes) { + $xml_processor->append_bytes($new_bytes); + } + + $tokens_found = 0; + while ($xml_processor->next_token()) { + ++$tokens_found; + $node_visitor_callback($xml_processor); + } + + $buffer = ''; + if ($tokens_found > 0) { + $buffer .= $xml_processor->get_updated_xml(); + } else if ( + $tokens_found === 0 && + !$xml_processor->is_paused_at_incomplete_input() && + $xml_processor->get_current_depth() === 0 + ) { + // We've reached the end of the document, let's finish up. 
+ // @TODO: Fix this so it doesn't return the entire XML + $buffer .= $xml_processor->get_unprocessed_xml(); + $controller->finish(); + } + + if (!strlen($buffer)) { + return false; + } + + $controller->output_bytes = $buffer; + return true; + }); + }); + } +} + +// Imagine this method is implemented in the Client class +class HTTP_Client +{ + static public function stream($requests) + { + $client = new Client(); + $client->enqueue($requests); + return new Byte_Stream(function (ByteStreamController $controller) use ($client) { + $request = null; + while ($client->await_next_event()) { + $request = $client->get_request(); + switch ($client->get_event()) { + case Client::EVENT_BODY_CHUNK_AVAILABLE: + $controller->file_id = $request->id; + $controller->output_bytes = $client->get_response_body_chunk(); + return true; + case Client::EVENT_FAILED: + $controller->set_last_error('Request failed: ' . $request->error); + break; + } + } + + $controller->finish(); + return false; + }); + } +} + +// Imagine this method is implemented in the ZipStreamReader class +class ZIP_Processor +{ + static public function stream() + { + return new Demultiplexer(function () { + $zip_reader = new ZipStreamReader(''); + return new ProcessorByteStream($zip_reader, function (ByteStreamController $controller) use ($zip_reader) { + $new_bytes = $controller->consume_input_bytes(); + if (null !== $new_bytes) { + $zip_reader->append_bytes($new_bytes); + } + + while ($zip_reader->next()) { + switch ($zip_reader->get_state()) { + case ZipStreamReader::STATE_FILE_ENTRY: + $controller->file_id = $zip_reader->get_file_path(); + $controller->output_bytes = $zip_reader->get_file_body_chunk(); + return true; + } + } + + return false; + }); + }); + } +} + +$chain = new StreamChain( + [ + 'http' => HTTP_Client::stream([ + new Request('http://127.0.0.1:9864/export.wxr.zip'), + // new Request('http://127.0.0.1:9864/export.wxr.zip'), + // Bad request, will fail: + new Request('http://127.0.0.1:9865') + ]), + 
'zip' => ZIP_Processor::stream(), + Byte_Stream::map(function($bytes, $context) { + if($context['zip']->get_file_id() === 'export.wxr') { + $context['zip']->skip_file(); + return null; + } + return $bytes; + }), + 'xml' => XML_Processor::stream(), + Byte_Stream::map(function($bytes) { return strtoupper($bytes); }), + ] +); + +// Consume the data like this: +// var_dump([$chain->next_chunk(), strlen($chain->get_bytes()), $chain->get_last_error()]); + +// Or like this: +// $chain->iterate_errors(true); +foreach($chain as $k => $chunk_or_error) { + var_dump([ + $k => $chunk_or_error, + 'is_error' => !!$chain->get_last_error(), + 'zip file_id' => isset($chain['zip']) ? $chain['zip']->get_file_id() : null + ]); +} + From 0650f8a68ad92c88a0363e635b39a1bda45147a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 31 Jul 2024 01:42:18 +0200 Subject: [PATCH 61/72] Move more logic methods to Byte_Stream --- pipes-controller.php | 185 ++++++++++++++++++++----------------------- 1 file changed, 85 insertions(+), 100 deletions(-) diff --git a/pipes-controller.php b/pipes-controller.php index d00ff30..c7acc7a 100644 --- a/pipes-controller.php +++ b/pipes-controller.php @@ -9,74 +9,86 @@ class Byte_Stream { protected $next_bytes_callback; - protected $controller; + protected $state; static public function map($callback) { - return new Byte_Stream(function($controller) use ($callback) { - $bytes = $controller->consume_input_bytes(); + return new Byte_Stream(function($state) use ($callback) { + $bytes = $state->consume_input_bytes(); if(!$bytes) { return false; } - $output = $callback($bytes, $controller->input_context); + $output = $callback($bytes, $state->input_context); if(null === $output) { return false; } - $controller->output_bytes = $output; + $state->output_bytes = $output; return true; }); } public function __construct($next_bytes_callback) { $this->next_bytes_callback = $next_bytes_callback; - $this->controller = new ByteStreamController(); + 
$this->state = new ByteStreamState(); + } + + public function is_eof(): bool { + return !$this->state->output_bytes && $this->state->state === ByteStreamState::STATE_FINISHED; } public function get_file_id() { - return $this->controller->get_file_id(); + return $this->state->file_id; } - public function skip_file() + public function skip_file(): void { + $this->state->last_skipped_file = $this->state->file_id; + } + + public function is_skipped_file() { - $this->controller->skip_file(); + return $this->state->file_id === $this->state->last_skipped_file; } - public function append_bytes(string $bytes, $context = null) { - $this->controller->append_bytes($bytes, $context); + public function append_eof() { + $this->state->input_eof = true; } - public function get_bytes() - { - return $this->controller->output_bytes; + public function append_bytes(string $bytes, $context = null) { + $this->state->input_bytes .= $bytes; + $this->state->input_context = $context; } - public function get_last_error(): string|null + public function get_bytes() { - return $this->controller->get_last_error(); + return $this->state->output_bytes; } public function next_bytes() { - $this->controller->output_bytes = null; - $this->controller->last_error = null; - if($this->controller->is_output_eof()) { + $this->state->reset_output(); + if($this->is_eof()) { return false; } // Process any remaining buffered input: $calback = $this->next_bytes_callback; - if($calback($this->controller)) { - return ! $this->controller->is_skipped_file(); + if($calback($this->state)) { + return ! $this->is_skipped_file(); } - if (!$this->controller->input_bytes) { - if ($this->controller->input_eof) { + if (!$this->state->input_bytes) { + if ($this->state->input_eof) { $this->finish(); } return false; } - return $calback($this->controller) && ! $this->controller->is_skipped_file(); + return $calback($this->state) && ! 
$this->is_skipped_file(); + } + + public function get_last_error(): string|null + { + return $this->state->last_error; } } @@ -100,7 +112,7 @@ public function __construct($processor, $callback) * are only Byte Streams that can be chained together with the StreamChain * class. */ -class ByteStreamController { +class ByteStreamState { const STATE_STREAMING = '#streaming'; const STATE_FINISHED = '#finished'; @@ -114,51 +126,24 @@ class ByteStreamController { public $file_id; public $last_skipped_file; - public function append_bytes(string $bytes, $context = null) { - $this->input_bytes .= $bytes; - $this->input_context = $context; - } - - public function input_eof() { - $this->input_eof = true; - } - - public function is_output_eof(): bool { - return !$this->output_bytes && $this->state === self::STATE_FINISHED; - } - - public function get_last_error(): ?string { - return $this->last_error; + public function reset_output() + { + $this->output_bytes = null; + $this->file_id = 'default'; + $this->last_error = null; } - public function set_last_error($error) { - $this->last_error = $error; - } - public function consume_input_bytes() { $bytes = $this->input_bytes; $this->input_bytes = null; return $bytes; } - public function get_file_id() - { - return $this->file_id ?? 
'default'; - } - - public function is_skipped_file() - { - return $this->get_file_id() === $this->last_skipped_file; - } - public function finish() { $this->state = self::STATE_FINISHED; } - public function skip_file(): void { - $this->last_skipped_file = $this->file_id; - } } function is_wxr_content_node( WP_XML_Processor $processor ) { @@ -215,13 +200,13 @@ protected function tick(): bool } if($stream->next_bytes()) { - $this->controller->file_id = $stream->controller->get_file_id(); - $this->controller->output_bytes = $stream->get_bytes(); + $this->state->file_id = $stream->state->file_id; + $this->state->output_bytes = $stream->state->output_bytes; return true; } - if($stream->get_last_error()) { - $this->controller->set_last_error($stream->get_last_error()); + if($stream->state->last_error) { + $this->state->last_error = $stream->state->last_error; } return false; } @@ -231,7 +216,7 @@ public function append_bytes(string $data, $context = null): bool { if($context) { $chunk_key = []; foreach($context as $k=>$stream) { - $chunk_key[] = $stream->controller->get_file_id(); + $chunk_key[] = $stream->state->file_id; } $chunk_key = implode(':', $chunk_key); } @@ -242,16 +227,16 @@ public function append_bytes(string $data, $context = null): bool { $this->streams[$chunk_key] = $create(); } $stream = $this->streams[$chunk_key]; - $stream->controller->append_bytes($data, $context); + $stream->append_bytes($data, $context); $this->last_stream = $stream; return true; } protected function finish() { - $this->controller->finish(); + $this->state->finish(); foreach($this->streams as $stream) { - $stream->controller->finish(); + $stream->state->finish(); } } } @@ -306,13 +291,13 @@ public function __construct($streams) { * metadata and exposes methods like skip_file(). 
*/ protected function tick(): bool { - if($this->last_stream->controller->is_output_eof()) { - $this->controller->finish(); + if($this->last_stream->is_eof()) { + $this->state->finish(); return false; } while(true) { - $bytes = $this->controller->consume_input_bytes(); + $bytes = $this->state->consume_input_bytes(); if(null === $bytes || false === $bytes) { break; } @@ -321,8 +306,8 @@ protected function tick(): bool { ); } - if($this->controller->is_output_eof()) { - $this->first_stream->controller->input_eof(); + if($this->is_eof()) { + $this->first_stream->state->append_eof(); } if(empty($this->execution_stack)) { @@ -333,7 +318,7 @@ protected function tick(): bool { // Unpeel the context stack until we find a stream that // produces output. $stream = $this->pop_stream(); - if ($stream->controller->is_output_eof()) { + if ($stream->is_eof()) { continue; } @@ -348,12 +333,12 @@ protected function tick(): bool { $prev_stream = $stream; for ($i = count($this->execution_stack); $i < count($this->streams_names); $i++) { $next_stream = $this->streams[$this->streams_names[$i]]; - if($prev_stream->controller->is_output_eof()) { - $next_stream->controller->input_eof(); + if($prev_stream->is_eof()) { + $next_stream->state->append_eof(); } $next_stream->append_bytes( - $prev_stream->controller->output_bytes, + $prev_stream->state->output_bytes, $this->tick_context ); if (true !== $this->stream_next($next_stream)) { @@ -365,12 +350,12 @@ protected function tick(): bool { // When the last process in the chain produces output, // we write it to the output pipe and bale. 
- if($this->last_stream->controller->is_output_eof()) { - $this->controller->finish(); + if($this->last_stream->is_eof()) { + $this->state->finish(); break; } - $this->controller->file_id = $this->last_stream->controller->get_file_id(); - $this->controller->output_bytes = $this->last_stream->get_bytes(); + $this->state->file_id = $this->last_stream->state->file_id; + $this->state->output_bytes = $this->last_stream->state->output_bytes; ++$this->chunk_nb; return true; @@ -378,7 +363,7 @@ protected function tick(): bool { // We produced no output and the upstream pipe is EOF. // We're done. - if($this->first_stream->controller->is_output_eof()) { + if($this->first_stream->is_eof()) { $this->finish(); } @@ -387,9 +372,9 @@ protected function tick(): bool { protected function finish() { - $this->controller->finish(); + $this->state->finish(); foreach($this->streams as $stream) { - $stream->controller->finish(); + $stream->state->finish(); } } @@ -397,7 +382,7 @@ private function pop_stream(): Byte_Stream { $name = $this->streams_names[count($this->execution_stack) - 1]; unset($this->tick_context[$name]); - return array_pop($this->execution_stack); + return array_pop($this->execution_stack); } private function push_stream(Byte_Stream $stream) @@ -419,9 +404,9 @@ private function stream_next(Byte_Stream $stream) private function handle_errors(Byte_Stream $stream) { - if($stream->controller->get_last_error()) { + if($stream->state->last_error) { $name = array_search($stream, $this->streams); - $this->controller->set_last_error("Process $name has crashed"); + $this->state->last_error = "Process $name has crashed (" . $stream->state->last_error . 
")"; } } @@ -454,10 +439,10 @@ public function iterate_errors($should_iterate_errors) public function next(): void { ++$this->chunk_nb; while(!$this->next_bytes()) { - if($this->should_iterate_errors && $this->controller->get_last_error()) { + if($this->should_iterate_errors && $this->state->last_error) { break; } - if($this->controller->is_output_eof()) { + if($this->is_eof()) { break; } usleep(10000); @@ -465,7 +450,7 @@ public function next(): void { } public function valid(): bool { - return !$this->controller->is_output_eof() || ($this->should_iterate_errors && $this->controller->get_last_error()); + return !$this->is_eof() || ($this->should_iterate_errors && $this->state->last_error); } @@ -498,8 +483,8 @@ static public function stream() return new Demultiplexer(function () { $xml_processor = new WP_XML_Processor('', [], WP_XML_Processor::IN_PROLOG_CONTEXT); $node_visitor_callback = function () {}; - return new ProcessorByteStream($xml_processor, function (ByteStreamController $controller) use ($xml_processor, $node_visitor_callback) { - $new_bytes = $controller->consume_input_bytes(); + return new ProcessorByteStream($xml_processor, function (ByteStreamState $state) use ($xml_processor, $node_visitor_callback) { + $new_bytes = $state->consume_input_bytes(); if (null !== $new_bytes) { $xml_processor->append_bytes($new_bytes); } @@ -521,14 +506,14 @@ static public function stream() // We've reached the end of the document, let's finish up. 
// @TODO: Fix this so it doesn't return the entire XML $buffer .= $xml_processor->get_unprocessed_xml(); - $controller->finish(); + $state->finish(); } if (!strlen($buffer)) { return false; } - $controller->output_bytes = $buffer; + $state->output_bytes = $buffer; return true; }); }); @@ -542,22 +527,22 @@ static public function stream($requests) { $client = new Client(); $client->enqueue($requests); - return new Byte_Stream(function (ByteStreamController $controller) use ($client) { + return new Byte_Stream(function (ByteStreamState $state) use ($client) { $request = null; while ($client->await_next_event()) { $request = $client->get_request(); switch ($client->get_event()) { case Client::EVENT_BODY_CHUNK_AVAILABLE: - $controller->file_id = $request->id; - $controller->output_bytes = $client->get_response_body_chunk(); + $state->file_id = $request->id; + $state->output_bytes = $client->get_response_body_chunk(); return true; case Client::EVENT_FAILED: - $controller->set_last_error('Request failed: ' . $request->error); + $state->last_error = 'Request failed: ' . 
$request->error; break; } } - $controller->finish(); + $state->finish(); return false; }); } @@ -570,8 +555,8 @@ static public function stream() { return new Demultiplexer(function () { $zip_reader = new ZipStreamReader(''); - return new ProcessorByteStream($zip_reader, function (ByteStreamController $controller) use ($zip_reader) { - $new_bytes = $controller->consume_input_bytes(); + return new ProcessorByteStream($zip_reader, function (ByteStreamState $state) use ($zip_reader) { + $new_bytes = $state->consume_input_bytes(); if (null !== $new_bytes) { $zip_reader->append_bytes($new_bytes); } @@ -579,8 +564,8 @@ static public function stream() while ($zip_reader->next()) { switch ($zip_reader->get_state()) { case ZipStreamReader::STATE_FILE_ENTRY: - $controller->file_id = $zip_reader->get_file_path(); - $controller->output_bytes = $zip_reader->get_file_body_chunk(); + $state->file_id = $zip_reader->get_file_path(); + $state->output_bytes = $zip_reader->get_file_body_chunk(); return true; } } From 9d4f77ce5cbc2869bc8d0e2b72d68a0f2c0618ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 31 Jul 2024 01:46:03 +0200 Subject: [PATCH 62/72] Call append_eof() on stream, not stream state --- pipes-controller.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipes-controller.php b/pipes-controller.php index c7acc7a..e1d9b44 100644 --- a/pipes-controller.php +++ b/pipes-controller.php @@ -334,7 +334,7 @@ protected function tick(): bool { for ($i = count($this->execution_stack); $i < count($this->streams_names); $i++) { $next_stream = $this->streams[$this->streams_names[$i]]; if($prev_stream->is_eof()) { - $next_stream->state->append_eof(); + $next_stream->append_eof(); } $next_stream->append_bytes( From 0a8ccc3098d1efe6fb6c1ed11ee0379c7d480c9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 31 Jul 2024 01:49:08 +0200 Subject: [PATCH 63/72] Simplify ProcessorByteStream-based streams --- 
pipes-controller.php | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/pipes-controller.php b/pipes-controller.php index e1d9b44..75b043c 100644 --- a/pipes-controller.php +++ b/pipes-controller.php @@ -100,7 +100,13 @@ class ProcessorByteStream extends Byte_Stream public function __construct($processor, $callback) { $this->processor = $processor; - parent::__construct($callback); + parent::__construct(function($state) use($processor, $callback) { + $new_bytes = $state->consume_input_bytes(); + if (null !== $new_bytes) { + $processor->append_bytes($new_bytes); + } + return $callback($state); + }); } } @@ -478,17 +484,11 @@ public function offsetUnset($offset): void { // Imagine this method is implemented in the WP_XML_Processor class XML_Processor { - static public function stream() + static public function stream($node_visitor_callback) { - return new Demultiplexer(function () { + return new Demultiplexer(function () use ($node_visitor_callback) { $xml_processor = new WP_XML_Processor('', [], WP_XML_Processor::IN_PROLOG_CONTEXT); - $node_visitor_callback = function () {}; return new ProcessorByteStream($xml_processor, function (ByteStreamState $state) use ($xml_processor, $node_visitor_callback) { - $new_bytes = $state->consume_input_bytes(); - if (null !== $new_bytes) { - $xml_processor->append_bytes($new_bytes); - } - $tokens_found = 0; while ($xml_processor->next_token()) { ++$tokens_found; @@ -556,11 +556,6 @@ static public function stream() return new Demultiplexer(function () { $zip_reader = new ZipStreamReader(''); return new ProcessorByteStream($zip_reader, function (ByteStreamState $state) use ($zip_reader) { - $new_bytes = $state->consume_input_bytes(); - if (null !== $new_bytes) { - $zip_reader->append_bytes($new_bytes); - } - while ($zip_reader->next()) { switch ($zip_reader->get_state()) { case ZipStreamReader::STATE_FILE_ENTRY: @@ -569,7 +564,6 @@ static public function stream() return true; } } - return 
false; }); }); @@ -592,7 +586,7 @@ static public function stream() } return $bytes; }), - 'xml' => XML_Processor::stream(), + 'xml' => XML_Processor::stream(function () { }), Byte_Stream::map(function($bytes) { return strtoupper($bytes); }), ] ); From 57e6594261c0d1b517aabd207b58de4173c162e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 31 Jul 2024 11:57:15 +0200 Subject: [PATCH 64/72] Rename tick() to generate_next_chunk() --- pipes-controller.php | 76 ++++++++++++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 28 deletions(-) diff --git a/pipes-controller.php b/pipes-controller.php index 75b043c..05ddd28 100644 --- a/pipes-controller.php +++ b/pipes-controller.php @@ -6,13 +6,12 @@ use WordPress\AsyncHttp\Client; use WordPress\AsyncHttp\Request; -class Byte_Stream { +abstract class Byte_Stream { - protected $next_bytes_callback; protected $state; static public function map($callback) { - return new Byte_Stream(function($state) use ($callback) { + return new Callback_Byte_Stream(function($state) use ($callback) { $bytes = $state->consume_input_bytes(); if(!$bytes) { return false; @@ -26,8 +25,7 @@ static public function map($callback) { }); } - public function __construct($next_bytes_callback) { - $this->next_bytes_callback = $next_bytes_callback; + public function __construct() { $this->state = new ByteStreamState(); } @@ -71,8 +69,7 @@ public function next_bytes() } // Process any remaining buffered input: - $calback = $this->next_bytes_callback; - if($calback($this->state)) { + if($this->generate_next_chunk()) { return ! $this->is_skipped_file(); } @@ -83,9 +80,13 @@ public function next_bytes() return false; } - return $calback($this->state) && ! $this->is_skipped_file(); + $produced_bytes = $this->generate_next_chunk(); + + return $produced_bytes && ! 
$this->is_skipped_file(); } + abstract protected function generate_next_chunk(): bool; + public function get_last_error(): string|null { return $this->state->last_error; @@ -93,20 +94,39 @@ public function get_last_error(): string|null } +class Callback_Byte_Stream extends Byte_Stream { + + protected $generate_next_chunk_callback; + + public function __construct($generate_next_chunk_callback) { + $this->generate_next_chunk_callback = $generate_next_chunk_callback; + parent::__construct(); + } + + protected function generate_next_chunk(): bool { + return ($this->generate_next_chunk_callback)($this->state); + } + +} + class ProcessorByteStream extends Byte_Stream { public $processor; + protected $generate_next_chunk_callback; - public function __construct($processor, $callback) + public function __construct($processor, $generate_next_chunk_callback) { $this->processor = $processor; - parent::__construct(function($state) use($processor, $callback) { - $new_bytes = $state->consume_input_bytes(); - if (null !== $new_bytes) { - $processor->append_bytes($new_bytes); - } - return $callback($state); - }); + $this->generate_next_chunk_callback = $generate_next_chunk_callback; + parent::__construct(); + } + + protected function generate_next_chunk(): bool { + $new_bytes = $this->state->consume_input_bytes(); + if (null !== $new_bytes) { + $this->processor->append_bytes($new_bytes); + } + return ($this->generate_next_chunk_callback)($this->state); } } @@ -190,7 +210,7 @@ class Demultiplexer extends Byte_Stream { public function __construct($stream_factory) { $this->stream_factory = $stream_factory; - parent::__construct([$this, 'tick']); + parent::__construct(); } public function get_substream() @@ -198,7 +218,7 @@ public function get_substream() return $this->last_stream; } - protected function tick(): bool + protected function generate_next_chunk(): bool { $stream = $this->last_stream; if (!$stream) { @@ -256,7 +276,7 @@ class StreamChain extends Byte_Stream implements 
ArrayAccess, Iterator { private $streams = []; private $streams_names = []; private $execution_stack = []; - private $tick_context = []; + private $chunk_context = []; public function __construct($streams) { $named_streams = []; @@ -269,11 +289,11 @@ public function __construct($streams) { $this->streams_names = array_keys($this->streams); $this->first_stream = $this->streams[$this->streams_names[0]]; $this->last_stream = $this->streams[$this->streams_names[count($streams) - 1]]; - parent::__construct([$this, 'tick']); + parent::__construct(); } /** - * ## Process chain tick + * ## Next chunk generation * * Pushes data through a chain of streams. Every downstream data chunk * is fully processed before asking for more chunks upstream. @@ -296,7 +316,7 @@ public function __construct($streams) { * This way we can maintain a predictable $context variable that carries upstream * metadata and exposes methods like skip_file(). */ - protected function tick(): bool { + protected function generate_next_chunk(): bool { if($this->last_stream->is_eof()) { $this->state->finish(); return false; @@ -345,7 +365,7 @@ protected function tick(): bool { $next_stream->append_bytes( $prev_stream->state->output_bytes, - $this->tick_context + $this->chunk_context ); if (true !== $this->stream_next($next_stream)) { return false; @@ -387,7 +407,7 @@ protected function finish() private function pop_stream(): Byte_Stream { $name = $this->streams_names[count($this->execution_stack) - 1]; - unset($this->tick_context[$name]); + unset($this->chunk_context[$name]); return array_pop($this->execution_stack); } @@ -398,7 +418,7 @@ private function push_stream(Byte_Stream $stream) if($stream instanceof Demultiplexer) { $stream = $stream->get_substream(); } - $this->tick_context[$name] = $stream; + $this->chunk_context[$name] = $stream; } private function stream_next(Byte_Stream $stream) @@ -463,11 +483,11 @@ public function valid(): bool { // ArrayAccess on ProcessChain exposes specific // 
sub-processes by their names. public function offsetExists($offset): bool { - return isset($this->tick_context[$offset]); + return isset($this->chunk_context[$offset]); } public function offsetGet($offset): mixed { - return $this->tick_context[$offset] ?? null; + return $this->chunk_context[$offset] ?? null; } public function offsetSet($offset, $value): void { @@ -527,7 +547,7 @@ static public function stream($requests) { $client = new Client(); $client->enqueue($requests); - return new Byte_Stream(function (ByteStreamState $state) use ($client) { + return new Callback_Byte_Stream(function (ByteStreamState $state) use ($client) { $request = null; while ($client->await_next_event()) { $request = $client->get_request(); From c1d8ed309f94e5843773e243da0655d7ce8fc91b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 31 Jul 2024 12:18:15 +0200 Subject: [PATCH 65/72] Small refactor --- pipes-controller.php | 71 ++++++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 32 deletions(-) diff --git a/pipes-controller.php b/pipes-controller.php index 05ddd28..64ff614 100644 --- a/pipes-controller.php +++ b/pipes-controller.php @@ -9,22 +9,7 @@ abstract class Byte_Stream { protected $state; - - static public function map($callback) { - return new Callback_Byte_Stream(function($state) use ($callback) { - $bytes = $state->consume_input_bytes(); - if(!$bytes) { - return false; - } - $output = $callback($bytes, $state->input_context); - if(null === $output) { - return false; - } - $state->output_bytes = $output; - return true; - }); - } - + public function __construct() { $this->state = new ByteStreamState(); } @@ -92,6 +77,22 @@ public function get_last_error(): string|null return $this->state->last_error; } + // Utility methods + static public function map($mapper) { + return new Callback_Byte_Stream(function($state) use ($mapper) { + $bytes = $state->consume_input_bytes(); + if(!$bytes) { + return false; + } + $output = 
$mapper($bytes, $state->input_context); + if(null === $output) { + return false; + } + $state->output_bytes = $output; + return true; + }); + } + } class Callback_Byte_Stream extends Byte_Stream { @@ -109,11 +110,22 @@ protected function generate_next_chunk(): bool { } + class ProcessorByteStream extends Byte_Stream { public $processor; protected $generate_next_chunk_callback; + static public function demuxed($processor_factory, $callback) + { + return new Demultiplexer(function () use ($processor_factory, $callback) { + $processor = $processor_factory(); + return new ProcessorByteStream($processor, function($state) use($processor, $callback) { + return $callback($processor, $state); + }); + }); + } + public function __construct($processor, $generate_next_chunk_callback) { $this->processor = $processor; @@ -424,16 +436,11 @@ private function push_stream(Byte_Stream $stream) private function stream_next(Byte_Stream $stream) { $produced_output = $stream->next_bytes(); - $this->handle_errors($stream); - return $produced_output; - } - - private function handle_errors(Byte_Stream $stream) - { if($stream->state->last_error) { $name = array_search($stream, $this->streams); $this->state->last_error = "Process $name has crashed (" . $stream->state->last_error . ")"; } + return $produced_output; } // Iterator methods. 
These don't make much sense on a regular @@ -506,9 +513,9 @@ class XML_Processor { static public function stream($node_visitor_callback) { - return new Demultiplexer(function () use ($node_visitor_callback) { - $xml_processor = new WP_XML_Processor('', [], WP_XML_Processor::IN_PROLOG_CONTEXT); - return new ProcessorByteStream($xml_processor, function (ByteStreamState $state) use ($xml_processor, $node_visitor_callback) { + return ProcessorByteStream::demuxed( + function () { return new WP_XML_Processor('', [], WP_XML_Processor::IN_PROLOG_CONTEXT); }, + function (WP_XML_Processor $xml_processor, ByteStreamState $state) use ($node_visitor_callback) { $tokens_found = 0; while ($xml_processor->next_token()) { ++$tokens_found; @@ -535,8 +542,8 @@ static public function stream($node_visitor_callback) $state->output_bytes = $buffer; return true; - }); - }); + } + ); } } @@ -573,9 +580,9 @@ class ZIP_Processor { static public function stream() { - return new Demultiplexer(function () { - $zip_reader = new ZipStreamReader(''); - return new ProcessorByteStream($zip_reader, function (ByteStreamState $state) use ($zip_reader) { + return ProcessorByteStream::demuxed( + function () { return new ZipStreamReader(); }, + function (ZipStreamReader $zip_reader, ByteStreamState $state) { while ($zip_reader->next()) { switch ($zip_reader->get_state()) { case ZipStreamReader::STATE_FILE_ENTRY: @@ -585,8 +592,8 @@ static public function stream() } } return false; - }); - }); + } + ); } } From 98c2e36354526fec6dcae29e87009694b4b106ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 31 Jul 2024 12:26:08 +0200 Subject: [PATCH 66/72] Inheritance: Byte_Stream > Callback_Byte_Stream > Processor_Byte_Stream --- pipes-controller.php | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/pipes-controller.php b/pipes-controller.php index 64ff614..9612a2e 100644 --- a/pipes-controller.php +++ b/pipes-controller.php @@ -111,35 
+111,29 @@ protected function generate_next_chunk(): bool { } -class ProcessorByteStream extends Byte_Stream +class ProcessorByteStream extends Callback_Byte_Stream { public $processor; - protected $generate_next_chunk_callback; + + public function __construct($processor, $generate_next_chunk_callback) + { + $this->processor = $processor; + parent::__construct($generate_next_chunk_callback); + } static public function demuxed($processor_factory, $callback) { return new Demultiplexer(function () use ($processor_factory, $callback) { $processor = $processor_factory(); return new ProcessorByteStream($processor, function($state) use($processor, $callback) { + $new_bytes = $state->consume_input_bytes(); + if (null !== $new_bytes) { + $processor->append_bytes($new_bytes); + } return $callback($processor, $state); }); }); } - - public function __construct($processor, $generate_next_chunk_callback) - { - $this->processor = $processor; - $this->generate_next_chunk_callback = $generate_next_chunk_callback; - parent::__construct(); - } - - protected function generate_next_chunk(): bool { - $new_bytes = $this->state->consume_input_bytes(); - if (null !== $new_bytes) { - $this->processor->append_bytes($new_bytes); - } - return ($this->generate_next_chunk_callback)($this->state); - } } /** From 97bdc317f28dfab78721f109225915a85557db58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 31 Jul 2024 12:34:12 +0200 Subject: [PATCH 67/72] Rename ZIP_Processor to ZIP_Reader --- pipes-controller.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipes-controller.php b/pipes-controller.php index 9612a2e..7ae70b1 100644 --- a/pipes-controller.php +++ b/pipes-controller.php @@ -570,7 +570,7 @@ static public function stream($requests) } // Imagine this method is implemented in the ZipStreamReader class -class ZIP_Processor +class ZIP_Reader { static public function stream() { @@ -599,7 +599,7 @@ function (ZipStreamReader $zip_reader, 
ByteStreamState $state) { // Bad request, will fail: new Request('http://127.0.0.1:9865') ]), - 'zip' => ZIP_Processor::stream(), + 'zip' => ZIP_Reader::stream(), Byte_Stream::map(function($bytes, $context) { if($context['zip']->get_file_id() === 'export.wxr') { $context['zip']->skip_file(); From 1234a10d0122f1a7657f6bb520330ac5a77d3595 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 31 Jul 2024 12:50:11 +0200 Subject: [PATCH 68/72] Return StreamChain from the current() method --- pipes-controller.php | 56 +++++++++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 21 deletions(-) diff --git a/pipes-controller.php b/pipes-controller.php index 7ae70b1..d70c772 100644 --- a/pipes-controller.php +++ b/pipes-controller.php @@ -32,6 +32,19 @@ public function is_skipped_file() return $this->state->file_id === $this->state->last_skipped_file; } + public function get_chunk_type() + { + if($this->get_last_error()) { + return '#error'; + } + + if ($this->is_eof()) { + return '#eof'; + } + + return '#bytes'; + } + public function append_eof() { $this->state->input_eof = true; } @@ -285,6 +298,8 @@ class StreamChain extends Byte_Stream implements ArrayAccess, Iterator { private $chunk_context = []; public function __construct($streams) { + $this->chunk_context['chain'] = $this; + $named_streams = []; foreach($streams as $name => $stream) { $string_name = is_numeric($name) ? 'stream_' . $name : $name; @@ -388,8 +403,6 @@ protected function generate_next_chunk(): bool { } $this->state->file_id = $this->last_stream->state->file_id; $this->state->output_bytes = $this->last_stream->state->output_bytes; - - ++$this->chunk_nb; return true; } @@ -442,31 +455,26 @@ private function stream_next(Byte_Stream $stream) // the top of the stream like ProcessChain can. 
public function current(): mixed { - if($this->should_iterate_errors && $this->get_last_error()) { - return $this->get_last_error(); - } - return $this->get_bytes(); + return $this; } - private $chunk_nb = -1; public function key(): mixed { - return $this->chunk_nb; + return $this->get_chunk_type(); } public function rewind(): void { $this->next(); } - private $should_iterate_errors = false; - public function iterate_errors($should_iterate_errors) + private $should_stop_on_errors = false; + public function stop_on_errors($should_stop_on_errors) { - $this->should_iterate_errors = $should_iterate_errors; + $this->should_stop_on_errors = $should_stop_on_errors; } public function next(): void { - ++$this->chunk_nb; while(!$this->next_bytes()) { - if($this->should_iterate_errors && $this->state->last_error) { + if($this->should_stop_on_errors && $this->state->last_error) { break; } if($this->is_eof()) { @@ -477,7 +485,7 @@ public function next(): void { } public function valid(): bool { - return !$this->is_eof() || ($this->should_iterate_errors && $this->state->last_error); + return !$this->is_eof() || ($this->should_stop_on_errors && $this->state->last_error); } @@ -616,12 +624,18 @@ function (ZipStreamReader $zip_reader, ByteStreamState $state) { // var_dump([$chain->next_chunk(), strlen($chain->get_bytes()), $chain->get_last_error()]); // Or like this: -// $chain->iterate_errors(true); -foreach($chain as $k => $chunk_or_error) { - var_dump([ - $k => $chunk_or_error, - 'is_error' => !!$chain->get_last_error(), - 'zip file_id' => isset($chain['zip']) ? $chain['zip']->get_file_id() : null - ]); +$chain->stop_on_errors(true); +foreach($chain as $chunk) { + switch($chunk->get_chunk_type()) { + case '#error': + echo "Error: " . $chunk->get_last_error() . "\n"; + break; + case '#bytes': + var_dump([ + $chunk->get_bytes(), + 'zip file_id' => isset($chain['zip']) ? 
$chain['zip']->get_file_id() : null + ]); + break; + } } From bd19ad786703e4ebba76697cc05fe33c945ea6d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 27 Aug 2024 10:06:54 +0200 Subject: [PATCH 69/72] Add a "main loop" that processes each stage of the pipeline explicitly --- pipes-controller.php | 103 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 91 insertions(+), 12 deletions(-) diff --git a/pipes-controller.php b/pipes-controller.php index d70c772..5fc8510 100644 --- a/pipes-controller.php +++ b/pipes-controller.php @@ -599,6 +599,11 @@ function (ZipStreamReader $zip_reader, ByteStreamState $state) { } } +// ---------------------------------------------------------------------------- +// Here's a stream-based pipeline that fetches a ZIP file from a remote server, +// unzips it, skips the first file, processes the XML files, and uppercases the +// output. +// ---------------------------------------------------------------------------- $chain = new StreamChain( [ 'http' => HTTP_Client::stream([ @@ -624,18 +629,92 @@ function (ZipStreamReader $zip_reader, ByteStreamState $state) { // var_dump([$chain->next_chunk(), strlen($chain->get_bytes()), $chain->get_last_error()]); // Or like this: -$chain->stop_on_errors(true); -foreach($chain as $chunk) { - switch($chunk->get_chunk_type()) { - case '#error': - echo "Error: " . $chunk->get_last_error() . "\n"; - break; - case '#bytes': - var_dump([ - $chunk->get_bytes(), - 'zip file_id' => isset($chain['zip']) ? $chain['zip']->get_file_id() : null - ]); +// $chain->stop_on_errors(true); +// foreach($chain as $chunk) { +// switch($chunk->get_chunk_type()) { +// case '#error': +// echo "Error: " . $chunk->get_last_error() . "\n"; +// break; +// case '#bytes': +// var_dump([ +// $chunk->get_bytes(), +// 'zip file_id' => isset($chain['zip']) ? 
$chain['zip']->get_file_id() : null +// ]); +// break; +// } +// } + + +// ---------------------------------------------------------- +// And here's a loop-based pipeline that does the same thing: +// ---------------------------------------------------------- + +$client = new Client(); +$client->enqueue([ + new Request('http://127.0.0.1:9864/export.wxr.zip'), + new Request('http://127.0.0.1:9865') +]); + +$zip_readers = []; +$xml_processors = []; +$xml_tokens_found = []; +while ($client->await_next_event()) { + // Fetch HTTP data + $request = $client->get_request(); + switch ($client->get_event()) { + case Client::EVENT_BODY_CHUNK_AVAILABLE: + // Continue to the next stage break; + case Client::EVENT_FAILED: + error_log('Request failed: ' . $request->error); + default: + continue 2; + } + + // Unzip the file + $zip_reader = $zip_readers[$request->id] ?? new ZipStreamReader(); + $zip_reader->append_bytes($client->get_response_body_chunk()); + while ($zip_reader->next()) { + switch ($zip_reader->get_state()) { + case ZipStreamReader::STATE_FILE_ENTRY: + // Continue to the next stage + break; + default: + continue 2; + } + + if($zip_reader->get_file_path() === 'export.wxr') { + continue; + } + + // Process the XML + $xml_processor = $xml_processors[$request->id] ?? new WP_XML_Processor('', [], WP_XML_Processor::IN_PROLOG_CONTEXT); + $xml_processor->append_bytes($zip_reader->get_file_body_chunk()); + + $xml_tokens_found[$request->id] ??= 0; + while ($xml_processor->next_token()) { + ++$xml_tokens_found[$request->id]; + // Process the XML + } + + $buffer = ''; + if ($xml_tokens_found[$request->id] > 0) { + $buffer .= $xml_processor->get_updated_xml(); + } else if ( + $xml_tokens_found[$request->id] === 0 && + !$xml_processor->is_paused_at_incomplete_input() && + $xml_processor->get_current_depth() === 0 + ) { + // We've reached the end of the document, let's finish up. 
+ // @TODO: Fix this so it doesn't return the entire XML + $buffer .= $xml_processor->get_unprocessed_xml(); + } + + if (!strlen($buffer)) { + continue; + } + + // Uppercase the output + echo strtoupper($buffer); } } - From daaba8abad2d3966c83d8e8aead3b9a0a0e27b8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 27 Aug 2024 10:31:07 +0200 Subject: [PATCH 70/72] A loop-based API without nested loops --- pipes-controller.php | 140 +++++++++++++++++++++++-------------------- 1 file changed, 75 insertions(+), 65 deletions(-) diff --git a/pipes-controller.php b/pipes-controller.php index 5fc8510..50fe5dc 100644 --- a/pipes-controller.php +++ b/pipes-controller.php @@ -629,20 +629,20 @@ function (ZipStreamReader $zip_reader, ByteStreamState $state) { // var_dump([$chain->next_chunk(), strlen($chain->get_bytes()), $chain->get_last_error()]); // Or like this: -// $chain->stop_on_errors(true); -// foreach($chain as $chunk) { -// switch($chunk->get_chunk_type()) { -// case '#error': -// echo "Error: " . $chunk->get_last_error() . "\n"; -// break; -// case '#bytes': -// var_dump([ -// $chunk->get_bytes(), -// 'zip file_id' => isset($chain['zip']) ? $chain['zip']->get_file_id() : null -// ]); -// break; -// } -// } +$chain->stop_on_errors(true); +foreach($chain as $chunk) { + switch($chunk->get_chunk_type()) { + case '#error': + echo "Error: " . $chunk->get_last_error() . "\n"; + break; + case '#bytes': + var_dump([ + $chunk->get_bytes(), + 'zip file_id' => isset($chain['zip']) ? 
$chain['zip']->get_file_id() : null + ]); + break; + } +} // ---------------------------------------------------------- @@ -658,63 +658,73 @@ function (ZipStreamReader $zip_reader, ByteStreamState $state) { $zip_readers = []; $xml_processors = []; $xml_tokens_found = []; -while ($client->await_next_event()) { - // Fetch HTTP data - $request = $client->get_request(); - switch ($client->get_event()) { - case Client::EVENT_BODY_CHUNK_AVAILABLE: - // Continue to the next stage - break; - case Client::EVENT_FAILED: - error_log('Request failed: ' . $request->error); - default: - continue 2; - } - - // Unzip the file - $zip_reader = $zip_readers[$request->id] ?? new ZipStreamReader(); - $zip_reader->append_bytes($client->get_response_body_chunk()); - while ($zip_reader->next()) { - switch ($zip_reader->get_state()) { - case ZipStreamReader::STATE_FILE_ENTRY: - // Continue to the next stage - break; - default: - continue 2; - } - if($zip_reader->get_file_path() === 'export.wxr') { - continue; +$chunks = []; +while(true) { + if(empty($chunks)) { + $event = $client->await_next_event(); + if(false === $event) { + break; } + $chunks[] = ['http', null]; + } - // Process the XML - $xml_processor = $xml_processors[$request->id] ?? new WP_XML_Processor('', [], WP_XML_Processor::IN_PROLOG_CONTEXT); - $xml_processor->append_bytes($zip_reader->get_file_body_chunk()); + list($stage, $chunk) = array_pop($chunks); - $xml_tokens_found[$request->id] ??= 0; - while ($xml_processor->next_token()) { - ++$xml_tokens_found[$request->id]; - // Process the XML - } + switch ($stage) { + case 'http': + $request = $client->get_request(); + switch ($client->get_event()) { + case Client::EVENT_BODY_CHUNK_AVAILABLE: + $chunks[] = ['zip', $client->get_response_body_chunk()]; + break; + case Client::EVENT_FAILED: + error_log('Request failed: ' . $request->error); + break; + } + break; + case 'zip': + $zip_reader = $zip_readers[$request->id] ?? 
new ZipStreamReader(); + $zip_reader->append_bytes($chunk); + while ($zip_reader->next()) { + switch ($zip_reader->get_state()) { + case ZipStreamReader::STATE_FILE_ENTRY: + if ($zip_reader->get_file_path() === 'export.wxr') { + continue 2; + } + $chunks[] = ['xml', $zip_reader->get_file_body_chunk()]; + break; + } + } + break; + case 'xml': + $xml_processor = $xml_processors[$request->id] ?? new WP_XML_Processor('', [], WP_XML_Processor::IN_PROLOG_CONTEXT); + $xml_processor->append_bytes($chunk); + + $xml_tokens_found[$request->id] ??= 0; + while ($xml_processor->next_token()) { + ++$xml_tokens_found[$request->id]; + // Process the XML + } - $buffer = ''; - if ($xml_tokens_found[$request->id] > 0) { - $buffer .= $xml_processor->get_updated_xml(); - } else if ( - $xml_tokens_found[$request->id] === 0 && - !$xml_processor->is_paused_at_incomplete_input() && - $xml_processor->get_current_depth() === 0 - ) { - // We've reached the end of the document, let's finish up. - // @TODO: Fix this so it doesn't return the entire XML - $buffer .= $xml_processor->get_unprocessed_xml(); - } + $buffer = ''; + if ($xml_tokens_found[$request->id] > 0) { + $buffer .= $xml_processor->get_updated_xml(); + } else if ( + $xml_tokens_found[$request->id] === 0 && + !$xml_processor->is_paused_at_incomplete_input() && + $xml_processor->get_current_depth() === 0 + ) { + // We've reached the end of the document, let's finish up. 
+ $buffer .= $xml_processor->get_unprocessed_xml(); + } - if (!strlen($buffer)) { - continue; - } + if (!strlen($buffer)) { + continue 2; + } - // Uppercase the output - echo strtoupper($buffer); + // Uppercase the output + echo strtoupper($buffer); + break; } } From 3c07f99510f832ac099fc22bdfc88aaa476c1bda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Mon, 30 Sep 2024 10:29:53 +0200 Subject: [PATCH 71/72] Prototype pause() and resume() methods to make the stream processing re-entrant --- class-wp-xml-processor.php | 20 + class-wp-xml-tag-processor.php | 2 +- export.wxr | 8310 ++++++++++++++++++++++++++++++++ pipes-controller-classes.php | 729 +++ pipes-controller-reentrant.php | 89 + pipes-controller.php | 596 +-- zip-stream-reader.php | 217 +- 7 files changed, 9285 insertions(+), 678 deletions(-) create mode 100644 export.wxr create mode 100644 pipes-controller-classes.php create mode 100644 pipes-controller-reentrant.php diff --git a/class-wp-xml-processor.php b/class-wp-xml-processor.php index 6d2d1ac..5b834be 100644 --- a/class-wp-xml-processor.php +++ b/class-wp-xml-processor.php @@ -88,6 +88,26 @@ public static function stream_tokens( $input_stream, $output_stream, $buffer_siz } } + public function pause() { + return new Paused_Stream(self::class, array( + 'xml' => $this->xml, + // @TODO: Include all the information below in the bookmark: + 'bytes_already_parsed' => $this->token_starts_at, + 'breadcrumbs' => $this->get_breadcrumbs(), + 'parser_context' => $this->get_parser_context(), + 'stack_of_open_elements' => $this->stack_of_open_elements, + )); + } + + public function resume($paused_state) { + $state = $paused_state['data']; + $this->xml = $state['xml']; + $this->stack_of_open_elements = $state['stack_of_open_elements']; + $this->parser_context = $state['parser_context']; + $this->bytes_already_parsed = $state['bytes_already_parsed']; + $this->base_class_next_token(); + } + /** * Wipes out the processed XML and appends the next 
chunk of XML to * any remaining unprocessed XML. diff --git a/class-wp-xml-tag-processor.php b/class-wp-xml-tag-processor.php index 5cae887..1a29ff7 100644 --- a/class-wp-xml-tag-processor.php +++ b/class-wp-xml-tag-processor.php @@ -443,7 +443,7 @@ class WP_XML_Tag_Processor { * * @var int|null */ - private $token_starts_at; + protected $token_starts_at; /** * Byte length of current token. diff --git a/export.wxr b/export.wxr new file mode 100644 index 0000000..a48ba58 --- /dev/null +++ b/export.wxr @@ -0,0 +1,8310 @@ + + + + + +My WordPress Website +http://127.0.0.1:9400 + +Wed, 12 Jun 2024 11:34:29 +0000 +en-US +1.2 +http://127.0.0.1:9400 +http://127.0.0.1:9400 + + 1 + admin + admin@localhost.com + + + + +https://wordpress.org/?v=6.6-beta2 + + <![CDATA[15-resources.blockhtml]]> + http://127.0.0.1:9400/?page_id=1 + + admin + + + +

Links and Resources

+ + + +

Frequently sought links

+ + + + + + + +

Apps built with WordPress Playground

+ + + + + + + +

Reading materials

+ + + + + + + +

Videos

+ + + + +]]>
+ + 1 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + open + open + + publish + 0 + 0 + page + + 0 +
+ + <![CDATA[17-changelog.blockhtml]]> + http://127.0.0.1:9400/?page_id=2 + + admin + + + +

Changelog

+ + + +

All notable changes to this project are documented in this file by a CI job
that runs on every NPM release. The file follows the Keep a Changelog
format.

+ + + +

[v0.7.20] (2024-05-21)

+ + + +

Breaking Changes

+ + + +
    +
  • [Breaking] Refactor PHP.ini management, remove php.setPhpIniPath() and php.setPhpIniEntry(). (#1423)
  • +
+ + + +

Enhancements

+ + + +
    +
  • CLI: Distinguish between mount and mountBeforeInstall options. (#1410)
  • + + + +
  • CLI: Support fetching WordPress zips from custom URLs. (#1415)
  • + + + +
  • Introduce a new @wp-playground/common package to avoid circular depencies. (#1387)
  • + + + +
  • Website: Ship the SQLite database integration plugin. (#1418)
  • +
+ + + +

Boot Flow

+ + + +
    +
  • Playground CLI: Don't create /wordpress/wp-config.php on boot. (#1407)
  • +
+ + + +

Blueprints

+ + + +
    +
  • Define constants in auto_prepend_file, silence warnings related to redefining those constants. (#1400)
  • + + + +
  • Detect silent failures when activating plugins and theme. (#1436)
  • + + + +
  • Re-activate single-file plugins when enabling a multisite. (#1435)
  • + + + +
  • Throw an error when activating a theme or plugin that doesn't exist. (#1391)
  • + + + +
  • Write sunrise.php to /internal in enableMultisite step. (#1401)
  • +
+ + + +

Tools

+ + + +
    +
  • Add VSCode branch protection. (#1408)
  • + + + +
  • Show error log if Playground fails to start. (#1336)
  • +
+ + + +

Blueprints

+ + + +
    +
  • Unzip: Only delete a temporary zip file after unzipping, do not delete the original zip. (#1412)
  • +
+ + + +

GitHub integration

+ + + +
    +
  • GitHub export: Create new commits in your fork when writing to the upstream repo isn't allowed. (#1392)
  • +
+ + + +

Import/Export

+ + + +
    +
  • Support wp_crop_image in import wxr. (#1357)
  • +
+ + + +

Devrel

+ + + +
    +
  • Add puzzle API. (#1372)
  • +
+ + + +

Documentation

+ + + +
    +
  • Docs: Use step function names instead of TypeScript type names. (#1373)
  • + + + +
  • Updated the GitHub issue link to open in a new tab. (#1353)
  • + + + +
  • Use step id name. (#1377)
  • +
+ + + +

Experiments

+ + + +
    +
  • Explore: Setup SQLite database integration without creating wp-content/db.php. (#1382)
  • +
+ + + +

PHP WebAssembly

+ + + +
    +
  • Add shareable extension-to-MIME-type mapping. (#1355)
  • + + + +
  • Document php ini functions. (#1430)
  • + + + +
  • JSPI: Enable the origin trial on Chrome. (#1346)
  • + + + +
  • PHP: Add libjpeg and libwebp support. (#1393)
  • + + + +
  • PHP: Always set the auto_prepend_file php.ini entry, even when the auto_prepend_file.php file exists. (#1388)
  • + + + +
  • PHP: Move internal shared directories to /internal/shared. (#1386)
  • + + + +
  • PHP: Remove mentions of a custom PHP extension. (#1422)
  • + + + +
  • PHP: Remove the MODE_EVAL_CODE execution mode. (#1433)
  • + + + +
  • PHP: Support php.mv() between devices via recursive copy. (#1411)
  • + + + +
  • PHP: Use /internal/shared/php.ini by default. (#1419)
  • + + + +
  • PHP: Use auto_prepend_file to preload mu-plugins (instead of creating them in wp-content/mu-plugins). (#1366)
  • +
+ + + +

Website

+ + + +
    +
  • Improve log modal styles, a11y, error message wording. (#1369)
  • + + + +
  • Move puzzle app to a Playground package. (#1385)
  • + + + +
  • Add secrets on-demand for more endpoints. (#1362)
  • + + + +
  • Boot: Move WordPress zip extraction logic to a common unzipWordPress() utility. (#1427)
  • + + + +
  • Derive MIME types for PHP served files from shared JSON. (#1360)
  • + + + +
  • Fix constant names for GH export oauth. (#1378)
  • + + + +
  • Playground Boot: Align the boot process between remote.html and CLI. (#1389)
  • + + + +
  • Remote.html: Install WordPress if it isn't installed yet. (#1425)
  • + + + +
  • Remote.html: Preload the SQLite database plugin, but only execute it if there's no custom db.php inside wp-content. (#1424)
  • + + + +
  • Simplify website deployment workflows. (#1404)
  • + + + +
  • Update rsync command to clean up more completely. (#1361)
  • +
+ + + +

Blueprints

+ + + +
    +
  • Provide non-gzipped wp-cli.phar file with website build. (#1406)
  • + + + +
  • Simplify runPhpWithZipFunctions() setup. (#1434)
  • +
+ + + +

Internal

+ + + +
    +
  • Fix changelog automation. (#1413)
  • +
+ + + +

Bug Fixes

+ + + +
    +
  • Add name to Puzzle package. (#1443)
  • + + + +
  • Fixed images not loading on the page. (#1352)
  • + + + +
  • Restore nightly wordpress build. (#1437)
  • +
+ + + +

Reliability

+ + + +
    +
  • Disable console logging when running tests. (#1368)
  • +
+ + + +

+ + + +
    +
  • Lint: Disable console warnings for paths where they're not useful. (#1421)
  • +
+ + + +

Various

+ + + +
    +
  • Add links to kitchen sink (PHP extensions), networking. (#1363)
  • + + + +
  • Reorganize and update documentation. (#1354)
  • +
+ + + +

Contributors

+ + + +

The following contributors merged PRs in this release:

+ + + +

@adamziel @bgrgicak @brandonpayton @flexseth @ironnysh @josevarghese

+ + + +

[v0.7.15] (2024-04-30)

+ + + +

Website

+ + + +
    +
  • Avoid edge-caching conditionally redirected resources. (#1351)
  • + + + +
  • Fix deploy-time check for file with PHP-handled redirect. (#1350)
  • +
+ + + +

Contributors

+ + + +

The following contributors merged PRs in this release:

+ + + +

@brandonpayton

+ + + +

[v0.7.10] (2024-04-30)

+ + + +

PHP WebAssembly

+ + + +
    +
  • PHP.wasm Node: Revert a part of #1289, do not import a .wasm file. (#1348)
  • +
+ + + +

Contributors

+ + + +

The following contributors merged PRs in this release:

+ + + +

@adamziel

+ + + +

[v0.7.5] (2024-04-30)

+ + + +

Internal

+ + + +
    +
  • Meta: Move the minified WordPress to the new @wp-playground/wordpress-builds package. (#1343)
  • +
+ + + +

Contributors

+ + + +

The following contributors merged PRs in this release:

+ + + +

@adamziel

+ + + +

[v0.7.3] (2024-04-29)

+ + + +

PHP WebAssembly

+ + + +
    +
  • Playground CLI. (#1289)
  • +
+ + + +

Contributors

+ + + +

The following contributors merged PRs in this release:

+ + + +

@adamziel

+ + + +

[v0.7.2] (2024-04-29)

+ + + +

Breaking Changes

+ + + +
    +
  • PHP: Remove setSapiName, setPhpIniEntry, setPhpIniPath methods from the remote PHP API client. (#1321)
  • + + + +
  • Remove the wp-playground/node package. (#1323)
  • +
+ + + +

PHP WebAssembly

+ + + +
    +
  • Breaking: Loopback Request Support. (#1287)
  • +
+ + + +

Tools

+ + + +
    +
  • Centralize log storage. (#1315)
  • +
+ + + +

Documentation

+ + + +
    +
  • Link to Installing Nx Globally in the README. (#1325)
  • +
+ + + +

PHP WebAssembly

+ + + +
    +
  • Add PHPResponse.forHttpCode() shorthand. (#1322)
  • + + + +
  • Asyncify: List ZEND_FETCH_OBJ_R_SPEC_CV_CV_HANDLER. (#1342)
  • + + + +
  • Curl extension for the Node.js build of PHP.wasm. (#1273)
  • + + + +
  • Explore curl support. (#1133)
  • + + + +
  • PHP Process Manager. (#1301)
  • + + + +
  • PHPProcessManager: Clear nextInstance when the concurrency limit is exhausted. (#1324)
  • + + + +
  • Spawn handler: Wrap the program call with try/catch, exit gracefully on error. (#1320)
  • +
+ + + +

Website

+ + + +
    +
  • Add initial workflow for deploying the website to WP Cloud. (#1293)
  • + + + +
  • Eliminate 404s due to nested files-to-serve-via-php dir. (#1333)
  • + + + +
  • Stop WP rewrite rules from matching files like wp-admin.css. (#1317)
  • + + + +
  • Stop using PHP to serve most static files on WP Cloud. (#1331)
  • + + + +
  • WP Cloud: Relay secrets for error logger. (#1337)
  • +
+ + + +

Documentation

+ + + +
    +
  • Document WP Cloud website setup. (#1338)
  • +
+ + + +

Reliability

+ + + +
    +
  • Add log methods, log handlers, and separate log collection. (#1264)
  • +
+ + + +

Contributors

+ + + +

The following contributors merged PRs in this release:

+ + + +

@adamziel @bgrgicak @brandonpayton @juanmaguitar @mho22

+ + + +

[v0.7.1] (2024-04-24)

+ + + +

[v0.7.0] (2024-04-24)

+ + + +

Breaking Changes

+ + + +

PHP WebAssembly

+ + + +
    +
  • Breaking: Remove PHPBrowser. (#1302)
  • +
+ + + +

Enhancements

+ + + +
    +
  • Bump TypeScript to 5.4.5. (#1299)
  • + + + +
  • Semaphore: Add timeout option. (#1300)
  • +
+ + + +

Blueprints

+ + + +
    +
  • Builder: Fix stuck loader bar. (#1284)
  • + + + +
  • Remove setPhpIniEntry step. (#1288)
  • +
+ + + +

Tools

+ + + +

GitHub integration

+ + + +
    +
  • GitHub: Don't delete all the files when exporting a theme. (#1308)
  • + + + +
  • Urlencode branch name. (#1275)
  • +
+ + + +

Blueprints

+ + + +
    +
  • Blueprints builder: Support ?blueprint-url. (#1309)
  • +
+ + + +

Documentation

+ + + +
    +
  • Use new learning resources in Playground documentation. (#1276)
  • +
+ + + +

PHP WebAssembly

+ + + +
    +
  • Browser: Remove setSpawnHandler function from the public API. (#1303)
  • + + + +
  • PHP: Add a cwd argument to hotSwapPHPRuntime(). (#1304)
  • + + + +
  • PHP: Remove addServerGlobalEntry() method, accept $_SERVER as php.run() property. (#1286)
  • + + + +
  • PHPRequestHandler: Add a generic PHP argument. (#1310)
  • + + + +
  • nit: Clean up after node PHP popen() test. (#1280)
  • +
+ + + +

Website

+ + + +
    +
  • Add more info to crash reports. (#1253)
  • + + + +
  • Memoize fetch() responses when requesting php.wasm. (#1306)
  • + + + +
  • Progress monitoring: Use a custom instantiateWasm handler to avoid monkey-patching WebAssembly.instantiateStreaming. (#1305)
  • + + + +
  • Remove sandbox attribute from iframe. (#1313)
  • + + + +
  • Service Worker: Fetch credentialless to play more nicely with server caches (#1311). (#1311)
  • +
+ + + +

Internal

+ + + +
    +
  • Automate Changelog generation after each npm release. (#1312)
  • + + + +
  • CI: Fix intermittent documentation build failures. (#1307)
  • +
+ + + +

Bug Fixes

+ + + +
    +
  • Add styles to ensure iframes are responsive. (#1267)
  • + + + +
  • Docs: Fix the Blueprint example of the Gutenberg PR preview. (#1268)
  • + + + +
  • Docs: Move Steps Shorthands to a separate page to fix Steps TOC. (#1265)
  • +
+ + + +

Reliability

+ + + +
    +
  • Add network error message. (#1281)
  • + + + +
  • Explore logging to a file. (#1292)
  • +
+ + + +

Various

+ + + +
    +
  • Add PDF to infer mime type list. (#1298)
  • +
+ + + +

Contributors

+ + + +

The following contributors merged PRs in this release:

+ + + +

@adamziel @bgrgicak @brandonpayton @ironnysh @peeranat-dan

+ + + +

[v0.6.16] (2024-04-17)

+ + + +

Blueprints

+ + + +
    +
  • Replace set_current_user call with wp_set_current_user to fix a PHP notice. (#1262)
  • +
+ + + +

Tools

+ + + +
    +
  • Install themes and plugins using the ReadableStream API. (#919)
  • +
+ + + +

Documentation

+ + + +
    +
  • Docs: Update WordPress versions used in the documentation, document using older releases. (#1235)
  • +
+ + + +

PHP WebAssembly

+ + + +
    +
  • Filter Requests library to use the Fetch handler. (#1048)
  • + + + +
  • PHP: Handle request errors in PHPRequestHandler, return response code 500. (#1249)
  • + + + +
  • PHP: Reset exit code before dispatching a request. (#1251)
  • +
+ + + +

Various

+ + + +
    +
  • Add documentation for shorthand alternatives of Blueprint steps. (#1261)
  • +
+ + + +

Contributors

+ + + +

The following contributors merged PRs in this release:

+ + + +

@adamziel @dd32 @ironnysh @kozer

+ + + +

[v0.6.15] (2024-04-16)

+ + + +

Blueprints

+ + + +
    +
  • Add ifAlreadyInstalled to installPlugin and installTheme steps. (#1244)
  • + + + +
  • Support a landingPage value without the initial slash. (#1227)
  • +
+ + + +

PHP WebAssembly

+ + + +
    +
  • Investigate OOB: Run unit tests with instrumented PHP 8.0 code. (#1220)
  • + + + +
  • Unit tests: Restore site-data.spec.ts. (#1194)
  • + + + +
  • Web PHP: Increase memory limit to 256 M. (#1232)
  • +
+ + + +

Website

+ + + +
    +
  • Browser: Display PHP output when Fatal Error is trigerred. (#1234)
  • + + + +
  • Fix accessibility issues found by Axe. (#1246)
  • + + + +
  • Request Handler: Urldecode the requested path. (#1228)
  • +
+ + + +

Bug Fixes

+ + + +
    +
  • fix: Set required engine version to 18.18.0. (#1214)
  • +
+ + + +

Various

+ + + +
    +
  • Blueprints/json example. (#1188)
  • + + + +
  • Doc: Update 01-index.md. (#1216)
  • + + + +
  • Move DefineSiteUrlStep doc warning so it displays in documentation. (#1245)
  • + + + +
  • Updated link to native WordPress importer. (#1243)
  • + + + +
  • documentation update proposal: Provide more info on features, extensions?. (#1208)
  • + + + +
  • php-wasm/node: Update express to newest version, and move it to devDependencies. (#1218)
  • +
+ + + +

Contributors

+ + + +

The following contributors merged PRs in this release:

+ + + +

@adamziel @artpi @bph @brandonpayton @eliot-akira @flexseth @ironnysh @kirjavascript

+ + + +

[v0.6.14] (2024-04-11)

+ + + +

Bug Fixes

+ + + +
    +
  • Revert changes to the documentation build. (#1226)
  • +
+ + + +

Reliability

+ + + +
    +
  • Update error modal description label. (#1224)
  • +
+ + + +

Various

+ + + +
    +
  • Try memory leak workaround with zeroed mem. (#1229)
  • +
+ + + +

Contributors

+ + + +

The following contributors merged PRs in this release:

+ + + +

@adamziel @bgrgicak @brandonpayton

+ + + +

[v0.6.13] (2024-04-10)

+ + + +

PHP WebAssembly

+ + + +
    +
  • Try to repro memory out of bounds errors in CI. (#1199)
  • +
+ + + +

Bug Fixes

+ + + +
    +
  • Fix docs-site build. (#1222)
  • +
+ + + +

Contributors

+ + + +

The following contributors merged PRs in this release:

+ + + +

@bgrgicak @brandonpayton

+ + + +

[v0.6.11] (2024-04-09)

+ + + +

Tools

+ + + +
    +
  • Avoid Service Worker update issues on localhost. (#1209)
  • +
+ + + +

Import/Export

+ + + +
    +
  • importWxr: Preserve backslashes in the imported content. (#1213)
  • +
+ + + +

PHP WebAssembly

+ + + +
    +
  • Catch DNS errors to avoid unhandled exceptions. (#1215)
  • + + + +
  • Revert "Avoid partial munmap memory leak". (#1195)
  • + + + +
  • Try to repro memory out of bounds errors in CI. (#1198)
  • +
+ + + +

Various

+ + + +
    +
  • Adjust link to LICENSE file. (#1210)
  • + + + +
  • Try to reproduce the memory access error with files from 096a017. (#1212)
  • +
+ + + +

Contributors

+ + + +

The following contributors merged PRs in this release:

+ + + +

@adamziel @brandonpayton @emmanuel-ferdman @fluiddot

+ + + +

[v0.6.10] (2024-04-04)

+ + + +

Blueprints

+ + + +
    +
  • Rename importFile to importWxr, switch to humanmade/WordPress importer. (#1192)
  • +
+ + + +

Tools

+ + + +

Blueprints

+ + + +
    +
  • Explorations: Stream API. (#851)
  • +
+ + + +

PHP WebAssembly

+ + + +
    +
  • Avoid partial munmap memory leak. (#1189)
  • +
+ + + +

Website

+ + + +
    +
  • Make kitchen sink extension bundle the default. (#1191)
  • +
+ + + +

Bug Fixes

+ + + +
    +
  • Fix cross-device mv by switching to copy. (#846)
  • +
+ + + +

Contributors

+ + + +

The following contributors merged PRs in this release:

+ + + +

@adamziel @brandonpayton @seanmorris

+ + + +

[v0.6.9] (2024-04-03)

+ + + +

Tools

+ + + +
    +
  • Devex: Expose window.playground for quick testing and debugging. (#1125)
  • +
+ + + +

GitHub integration

+ + + +
    +
  • Website: Query API options to preconfigure the GitHub export form. (#1174)
  • +
+ + + +

Documentation

+ + + +
    +
  • Update the wp-cli step code example. (#1140)
  • +
+ + + +

PHP WebAssembly

+ + + +
    +
  • Add PHP iterator and yield support. (#1181)
  • + + + +
  • Fix fileinfo support. (#1179)
  • + + + +
  • Fix mbregex support. (#1155)
  • + + + +
  • PHP.run(): Throw JS exception on runtime error, remove throwOnError flag. (#1137)
  • +
+ + + +

Website

+ + + +
    +
  • Add error report modal. (#1102)
  • + + + +
  • Ensure PromiseRejectionEvent has reason before logging it. (#1150)
  • + + + +
  • Request handler: Remove everything after # from the URL. (#1126)
  • + + + +
  • Web: Make the "Apply changes" button work in Playground settings form. (#1122)
  • +
+ + + +

Plugin proxy

+ + + +
    +
  • Allow requests to WordPress.org. (#1154)
  • +
+ + + +

Internal

+ + + +
    +
  • Refresh WordPress with the latest SQLite integration plugin. (#1151)
  • +
+ + + +

Bug Fixes

+ + + +
    +
  • Fix typo in blueprints/public/schema-readme.md. (#1134)
  • + + + +
  • Priority: Fix broken link to VS Code extension. (#1141)
  • +
+ + + +

Various

+ + + +
    +
  • Docs/update - Add implied step. (#1144)
  • + + + +
  • Give brandonpayton permission to run Playground GH workflows. (#1139)
  • + + + +
  • Logger API: Add rate limiting. (#1142)
  • + + + +
  • Remove --disable-all configuration option in PHP compile process. (#1132)
  • +
+ + + +

Contributors

+ + + +

The following contributors merged PRs in this release:

+ + + +

@adamziel @bgrgicak @brandonpayton @flexseth @jblz @mho22

+ + + +

[v0.6.8] (2024-03-21)

+ + + +

Blueprints

+ + + +
    +
  • Allow optional metadata. (#1103)
  • +
+ + + +

Tools

+ + + +
    +
  • Add VSCode Chrome debugging support. (#1088)
  • + + + +
  • Website: Support Base64-encoding Blueprints passed in the URL. (#1091)
  • +
+ + + +

Documentation

+ + + +
    +
  • Docs: Expand Details section. (#1109)
  • + + + +
  • Update activate-theme.ts to use themeFolderName. (#1119)
  • +
+ + + +

PHP WebAssembly

+ + + +
    +
  • Blueprints: Explore switching to the PHP implementation. (#1051)
  • + + + +
  • Explore weird register_shutdown_function behavior. (#1099)
  • + + + +
  • Fix post_message_to_js memory out of bounds. (#1114)
  • + + + +
  • Fix shutdown errors. (#1104)
  • + + + +
  • Fixing build regression [BISON COMPILE]. (#871)
  • + + + +
  • PHP : Set appropriate SCRIPT variables in $_SERVER superglobal. (#1092)
  • +
+ + + +

Website

+ + + +
    +
  • Add logger API. (#1113)
  • + + + +
  • Add multisite rewrite rules. (#1083)
  • + + + +
  • Service worker: Improve error reporting in non-secure contexts. (#1098)
  • +
+ + + +

Bug Fixes

+ + + +
    +
  • Fix experimental notice in FF ESR. (#1117)
  • + + + +
  • Fix php bison dep for building on non-arm64 architectures. (#1115)
  • +
+ + + +

Reliability

+ + + +
    +
  • Add fatal error listener. (#1095)
  • +
+ + + +

Various

+ + + +
    +
  • Update examples and demos in the documentation. (#1107)
  • +
+ + + +

Contributors

+ + + +

The following contributors merged PRs in this release:

+ + + +

@0aveRyan @adamziel @bgrgicak @brandonpayton @ironnysh @mho22 @seanmorris @StevenDufresne

+ + + +

[v0.6.7] (2024-03-06)

+ + + +

Website

+ + + +
    +
  • Node polyfills: Only apply them in Node.js, not in web browsers. (#1089)
  • +
+ + + +

Contributors

+ + + +

The following contributors merged PRs in this release:

+ + + +

@adamziel

+ + + +

[v0.6.6] (2024-03-06)

+ + + +

Website

+ + + +
    +
  • Comlink API: Pass the context argument to windowEndpoint, not wrap. (#1087)
  • + + + +
  • Fix: Playground not starting due to a race condition. (#1084)
  • + + + +
  • Hide the "This is experimental WordPress" notice on click. (#1082)
  • + + + +
  • Set the API context when using Comlink.wrap(). (#1085)
  • +
+ + + +

Contributors

+ + + +

The following contributors merged PRs in this release:

+ + + +

@adamziel

+ + + +

[v0.6.5] (2024-03-05)

+ + + +

Tools

+ + + +

Plugin proxy

+ + + +
    +
  • Add Sensei to the allowed repositories for plugin proxy. (#1079)
  • +
+ + + +

Blueprints

+ + + +
    +
  • Snapshot Import Protocol v1. (#1007)
  • +
+ + + +

Internal

+ + + +
    +
  • Build the php-wasm/util package as both ESM and CJS. (#1081)
  • +
+ + + +

Reliability

+ + + +

Blueprints

+ + + +
    +
  • Add unit tests to the mkdir step. (#1029)
  • +
+ + + +

Various

+ + + +
    +
  • Website query API: Continue plugin installs on error. (#605)
  • +
+ + + +

Contributors

+ + + +

The following contributors merged PRs in this release:

+ + + +

@adamziel @eliot-akira @reimic @renatho

+ + + +

[v0.6.4] (2024-03-04)

+ + + +

Enhancements

+ + + +
    +
  • Add logging support to Playground. (#1035)
  • +
+ + + +

Blueprints

+ + + +
    +
  • PHP Blueprints: Display progress. (#1077)
  • + + + +
  • Set progress caption and communicate failures in the import file step. (#1034)
  • +
+ + + +

Tools

+ + + +

Blueprints

+ + + +
    +
  • PHP Blueprints demo page. (#1070)
  • + + + +
  • PHP: Do not prepend a whitespace when encoding body as multipart form data. (#1033)
  • +
+ + + +

PHP WebAssembly

+ + + +
    +
  • Fix response header escaping. (#1050)
  • + + + +
  • Fix: Networking broken when extra PHP extensions are enabled. (#1045)
  • + + + +
  • PHP.wasm: Yield 0 bytes read on fd_read failure to improve PHP's fread() and feof() behavior. (#1053)
  • + + + +
  • PHP: Support $env and $cwd proc_open arguments. (#1064)
  • + + + +
  • Parse shell commands in createSpawnHandler. (#1065)
  • + + + +
  • Prototype: Spawning PHP sub-processes in Web Workers. (#1031)
  • + + + +
  • Spawning PHP sub-processes in Web Workers. (#1069)
  • +
+ + + +

Website

+ + + +
    +
  • Add Google Analytics events to Playground. (#1040)
  • + + + +
  • Fix error on reload site click. (#1041)
  • +
+ + + +

Internal

+ + + +
    +
  • Rebuild WordPress every 20 minutes, short-circuit if no new version is found. (#1061)
  • + + + +
  • Rebuild WordPress within an hour of a beta release. (#1059)
  • +
+ + + +

Bug Fixes

+ + + +
    +
  • Fix the login message so it doesn't override another. (#1044)
  • +
+ + + +

Various

+ + + +
    +
  • Add arguments to default node spawn method. (#1037)
  • + + + +
  • Add bgrgicak to deployment allowlists. (#1057)
  • + + + +
  • Allow for CORS requests to api.wordpress.org to pass. (#1009)
  • + + + +
  • Default URL rewrites to /index.php. (#1072)
  • + + + +
  • Remove repository specific Code of Conduct. (#1038)
  • + + + +
  • Ship WordPress 6.5 beta 1. (#1036)
  • +
+ + + +

Contributors

+ + + +

The following contributors merged PRs in this release:

+ + + +

@adamziel @bgrgicak @dd32 @desrosj @johnbillion @mho22

+ + + +

[v0.6.3] (2024-02-12)

+ + + +

Blueprints

+ + + + + + + +

PHP WebAssembly

+ + + +
    +
  • Calls proc_open two times in a row. (#1012)
  • + + + +
  • Experiment: Build PHP with OPFS support. (#1030)
  • + + + +
  • PHP: Pass request body as UInt8Array. (#1018)
  • +
+ + + +

Contributors

+ + + +

The following contributors merged PRs in this release:

+ + + +

@adamziel @mho22

+ + + +

[v0.6.2] (2024-02-08)

+ + + +

PHP WebAssembly

+ + + +
    +
  • Networking: Swap Requests transports using the http_api_transports instead of patching the Requests library. (#1004)
  • + + + +
  • Remove crypto.randomUUID dependency in favor of a custom function. (#1016)
  • + + + +
  • Remove x-request-issuer header on cross-origin requests. (#1010)
  • + + + +
  • Update wp_http_fetch.php. (#1002)
  • +
+ + + +

Website

+ + + +
    +
  • Remote.html: Always install the playground mu-plugin. (#1005)
  • +
+ + + +

Various

+ + + +
    +
  • 32bit integer workaround. (#1014)
  • + + + +
  • Test/hello world blueprint. (#908)
  • +
+ + + +

Contributors

+ + + +

The following contributors merged PRs in this release:

+ + + +

@adamziel @bgrgicak @jdevalk @sejas @stoph

+ + + +

[v0.6.1] (2024-02-05)

+ + + +

Website

+ + + +

Blueprints

+ + + +
    +
  • Remove the applyWordPressPatches step, enable the Site Health Plugin. (#1001)
  • +
+ + + +

Various

+ + + +
    +
  • Add crypto to Polyfills improving Blueprint compatibility for Node. (#1000)
  • +
+ + + +

Contributors

+ + + +

The following contributors merged PRs in this release:

+ + + +

@adamziel @sejas

+ + + +

[v0.6.0] (2024-02-05)

+ + + +

Enhancements

+ + + +
    +
  • Add wp-cli and code editor examples to the demos page. (#965)
  • + + + +
  • WordPress: Preserve PHP attributes and wp-config.php whitespace. (#964)
  • +
+ + + +

Blueprints

+ + + +
    +
  • Add enableMultisite step. (#888)
  • + + + +
  • Set_current_user to admin before activating plugins and themes. (#984)
  • +
+ + + +

Tools

+ + + +
    +
  • Use .zip files instead of .data files for loading WordPress. (#978)
  • +
+ + + +

Blueprints

+ + + +
    +
  • Throw on failure. (#982)
  • +
+ + + +

PHP WebAssembly

+ + + +
    +
  • Support wp-cli in the browser. (#957)
  • +
+ + + +

PHP WebAssembly

+ + + +
    +
  • Correcting OOB & Prevent Crash on Saving Large Post. (#870)
  • + + + +
  • Memory leak: Add rotatedPHP to kill and recreate PHP instances after a certain number of requests. (#990)
  • + + + +
  • PHP : Add args and descriptors dynamic arrays in proc open function. (#969)
  • + + + +
  • PHP.wasm: Fix stack overflow in wasm_set_request_body. (#993)
  • +
+ + + +

Website

+ + + +
    +
  • Add .htaccess file to prevent caching of index.html and enable importing the client.js library. (#989)
  • + + + +
  • Add og meta tags and meta description. (#980)
  • + + + +
  • CORS headers for client/index.js. (#893)
  • + + + +
  • wp-cli: Respect quotes when parsing shell commands. (#966)
  • +
+ + + +

Internal

+ + + +
    +
  • Remove the interactive block playground. (#988)
  • +
+ + + +

Bug Fixes

+ + + +
    +
  • Fix "WP-CLI" typos. (#971)
  • + + + +
  • Fix footer styling issue in the "Code is Poetry" in wordpress.github.io/wordpress-playground. (#959)
  • + + + +
  • WordPress build: Add newlines after PHP annotations. (#986)
  • +
+ + + +

Various

+ + + +
    +
  • Add a blueprint example. (#946)
  • + + + +
  • Add terminal to playground site. (#161)
  • + + + +
  • Match the .nvmrc node version to the changes made in commit ec2605b. (#972)
  • + + + +
  • PHP : Dispatch available descriptor specs in js_open_process function. (#963)
  • + + + +
  • PHP : Give access to command arguments if array type is given in php ^7.4 proc_open function. (#944)
  • + + + +
  • Rebuild WordPress. (#987)
  • + + + +
  • Update the networking disabled error messages in wp-admin for plugins and themes. (#936)
  • +
+ + + +

Contributors

+ + + +

The following contributors merged PRs in this release:

+ + + +

@adamziel @bph @ironnysh @marcarmengou @mho22 @rowasc @seanmorris @swissspidy @tyrann0us

+ + + +

[v0.5.9] - 2021-09-29

+ + + +

Changed

+ + + +

Breaking: Removed the PHPBrowser class (#1302)

+ + + +

Added

+ + + +

– Added CHANGELOG.md to keep track of notable changes (#1302)

+]]>
+ + 2 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + open + open + + publish + 0 + 0 + page + + 0 +
+ + <![CDATA[08-query-api]]> + http://127.0.0.1:9400/?page_id=3 + + admin + + + +

Query API

+ + + +

WordPress Playground exposes a simple API that you can use to configure the Playground in the browser.

+ + + +

It works by passing configuration options as query parameters to the Playground URL. For example, to install the pendant theme, you would use the following URL:

+ + + +
https://playground.wordpress.net/?theme=pendant
+
+ + + +

You can go ahead and try it out. The Playground will automatically install the theme and log you in as an admin. You may even embed this URL in your website using an <iframe> tag:

+ + + +

+
+ + + +

:::info CORS policy

+ + + +

To import files from a URL, such as a site zip package, they must be served with Access-Control-Allow-Origin header set. For reference, see: Cross-Origin Resource Sharing (CORS).

+ + + +

:::

+ + + +

GitHub Export Options

+ + + +

The following additional query parameters may be used to pre-configure the GitHub export form:

+ + + +
    +
  • gh-ensure-auth: If set to yes, Playground will display a modal to ensure the
    user is authenticated with GitHub before proceeding.
  • + + + +
  • ghexport-repo-url: The URL of the GitHub repository to export to.
  • + + + +
  • ghexport-pr-action: The action to take when exporting (create or update).
  • + + + +
  • ghexport-playground-root: The root directory in the Playground to export from.
  • + + + +
  • ghexport-repo-root: The root directory in the repository to export to.
  • + + + +
  • ghexport-content-type: The content type of the export (plugin, theme, wp-content, custom-paths).
  • + + + +
  • ghexport-plugin: Plugin path. When the content type is plugin, pre-select the plugin to export.
  • + + + +
  • ghexport-theme: Theme directory name. When the content type is theme, pre-select the theme to export.
  • + + + +
  • ghexport-path: A path relative to ghexport-playground-root. Can be provided multiple times. When the
    content type is custom-paths, it pre-populates the list of paths to export.
  • + + + +
  • ghexport-commit-message: The commit message to use when exporting.
  • + + + +
  • ghexport-allow-include-zip: Whether to offer an option to include a zip file in the GitHub
    export (yes, no). Optional. Defaults to yes.
  • +
+]]>
+ + 3 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + open + open + + publish + 0 + 0 + page + + 0 +
+ + <![CDATA[01-start-here]]> + http://127.0.0.1:9400/?page_id=4 + + admin + + + +

WordPress Playground

+ + + +

:::info Looking for the official Playground website?

+ + + +

WordPress Playground website moved to wordpress.org/playground/. The site you're at now hosts the documentation.

+ + + +

:::

+ + + +

👋 Hi! Welcome to WordPress Playground documentation. Playground is an online tool to experiment and learn about WordPress – learn more in the overview section.

+ + + +

The documentation consists of two major sections:

+ + + +
    +
  • Documentation (you're here) – Introduction, concepts, and guides
  • + + + +
  • API reference – All the APIs exposed by WordPress Playground
  • +
+ + + +

This site (Documentation) is where you will find all the information you need to start using Playground. To learn more about this fantastic tool, read Introduction to Playground: running WordPress in the browser.

+ + + +

Quick start

+ + + + + + + +

Take a deep dive

+ + + +

import APIList from '@site/docs/_fragments/_api_list.mdx';

+ + + + + + + +

Get Involved

+ + + +

WordPress Playground is an open-source project and welcomes all contributors from code to design, and from documentation to triage. Don't worry, you don't need to know WebAssembly to contribute!

+ + + + + + + +

As with all WordPress projects, we want to ensure a welcoming environment for everyone. With that in mind, all contributors are expected to follow our Code of Conduct.

+ + + +

License

+ + + +

WordPress Playground is free software, and is released under the terms of the GNU General Public License version 2 or (at your option) any later version. See LICENSE.md for the complete license.

+ + + +

<br/><br/><p align="center"><img src="https://s.w.org/style/images/codeispoetry.png?1" alt="Code is Poetry." /></p>

+]]>
+ + 4 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + open + open + + publish + 0 + 0 + page + + 0 +
+ + <![CDATA[02-start-using]]> + http://127.0.0.1:9400/?page_id=5 + + admin + + + +

import ThisIsQueryApi from '@site/docs/_fragments/_this_is_query_api.md';

+ + + +

Start using WordPress Playground in 5 minutes

+ + + +

WordPress Playground can help you with any of the following:

+ + + +

import TOCInline from '@theme/TOCInline';

+ + + + + + + +

This page will guide you through each of these. Oh, and if you're a visual learner – here's a video:

+ + + + + + + +

Start a new WordPress site

+ + + +

Every time you visit the official demo on playground.wordpress.net, you get a fresh WordPress site.

+ + + +

You can then create pages, upload plugins, themes, import your own site, and do most things you would do on a regular WordPress.

+ + + +

It's that easy to start!

+ + + +

The entire site lives in your browser and is scrapped when you close the tab. Want to start over? Just refresh the page!

+ + + +

:::info WordPress Playground is private

+ + + +

Everything you build stays in your browser and is not sent anywhere. Once you're finished, you can export your site as a zip file. Or just refresh the page and start over!

+ + + +

:::

+ + + +

Try a block, a theme, or a plugin

+ + + +

You can upload any plugin or theme you want in /wp-admin/.

+ + + +

To save a few clicks, you can preinstall plugins or themes from the WordPress plugin directory by adding a plugin or theme parameter to the URL. For example, to install the coblocks plugin, you can use this URL:

+ + + +

https://playground.wordpress.net/?plugin=coblocks

+ + + +

Or this URL to preinstall the pendant theme:

+ + + +

https://playground.wordpress.net/?theme=pendant

+ + + +

You can also mix and match these parameters and even add multiple plugins:

+ + + +

https://playground.wordpress.net/?plugin=coblocks&plugin=friends&theme=pendant

+ + + + + + + +

:::info Plugin directory doesn't work in WordPress Playground

+ + + +

Plugins must be installed manually because your WordPress site doesn't send any data to the internet. You won't be able to navigate the WordPress plugin directory inside /wp-admin/. The Query API method may seem to contradict that, but behind the scenes it uses the same plugin upload form as you would.

+ + + +

:::

+ + + +

Save your site

+ + + +

To keep your WordPress Playground site for longer than a single browser session, you can export it as a zip file.

+ + + +

Use the "Export" button in the top bar:

+ + + +
Export button
+ + + +

The exported file contains the complete site you've built. You could host it on any server that supports PHP and SQLite. All WordPress core files, plugins, themes, and everything else you've added to your site are in there.

+ + + +

The SQLite database file is also included in the export; you'll find it at wp-content/database/.ht.sqlite. Keep in mind that files starting with a dot are hidden by default on most operating systems, so you might need to enable the "Show hidden files" option in your file manager.

+ + + +

Restore a saved site

+ + + +

You can restore the site you saved by using the import button in WordPress Playground:

+ + + +
Import button
+ + + +

Use a specific WordPress or PHP version

+ + + +

The easiest way is to use the version switcher on the official demo site:

+ + + +
WordPress Version switcher
+ + + +

:::info Test your plugin or theme

+ + + +

Compatibility testing with so many WordPress and PHP versions was always a pain. WordPress Playground makes this process effortless – use it to your advantage!

+ + + +

:::

+ + + +

You can also use the wp and php query parameters to open Playground with the right versions already loaded:

+ + + +
    +
  • https://playground.wordpress.net/?wp=6.5
  • + + + +
  • https://playground.wordpress.net/?php=7.4
  • + + + +
  • https://playground.wordpress.net/?php=8.2&wp=6.2
  • +
+ + + + + + + +

:::info Major versions only

+ + + +

You can specify major versions like wp=6.2 or php=8.1 and expect the most recent release in that line. You cannot, however, request older minor versions so neither wp=6.1.2 nor php=7.4.9 will work.

+ + + +

:::

+ + + +

Import a WXR file

+ + + +

You can import a WordPress export file by uploading a WXR file in /wp-admin/.

+ + + +

You can also use JSON Blueprints. See getting started with Blueprints to learn more.

+ + + +

This is different from the import feature described above. That feature restores an entire exported site, including the database, while this one imports a WXR file into an existing site.

+ + + +

Build apps with WordPress Playground

+ + + +

WordPress Playground is programmable which means you can build WordPress apps, setup plugin demos, and even use it as a zero-setup local development environment.

+ + + +

To learn more about developing with WordPress Playground, check out the development quick start section.

+]]>
+ + 5 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + open + open + + publish + 0 + 0 + page + + 0 +
+ + <![CDATA[12-limitations]]> + http://127.0.0.1:9400/?page_id=6 + + admin + + + +

Limitations

+ + + +

WordPress Playground is under active development and has some limitations you should keep in mind when running it and developing with it.

+ + + +

You can track the status of these issues on the Playground Project board.

+ + + +

In the browser

+ + + +

Access the Plugins, Themes, Blocks, or Patterns directories

+ + + +

Playground disables network connections by default, blocking access to wp.org assets (themes, plugins, blocks, or patterns) in wp-admin. You can still upload zipped plugin and theme files from your device or enable the option via the Query API or Blueprints API.

+ + + +

Temporary by design

+ + + +

As Playground streams rather than serves WordPress, all database changes and uploads will be gone when you refresh the page. To avoid losing your work, either export your work before or enable storage in the browser/device via the Query API or the UI.

+ + + +

When developing with Playground

+ + + +

Iframe quirks

+ + + +

Playground renders WordPress in an iframe so clicking links with target="_top" will reload the page you’re working on.
Also, JavaScript popups originating in the iframe may not always display.

+ + + +

Run WordPress PHP functions

+ + + +

Playground supports running PHP code in Blueprints using the runPHP step. To run WordPress-specific PHP functions, you’d need to first require wp-load.php:

+ + + +
{
+	"step": "runPHP",
+	"code": ""
+}
+
+ + + +

Using WP-CLI

+ + + +

You can execute wp-cli commands via the Blueprints wp-cli step. However, since Playground runs in the browser, it doesn't support the full array of available commands. While there is no definite list of supported commands, experimenting in the online demo will help you assess what's possible.

+]]>
+ + 6 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + open + open + + publish + 0 + 0 + page + + 0 +
+ + <![CDATA[02-code.blockhtml]]> + http://127.0.0.1:9400/?page_id=7 + + admin + + + +

Code contributions

+ + + +

Like all WordPress projects, Playground uses GitHub to manage code and track issues. The main repository is at https://github.com/WordPress/wordpress-playground and the Playground Tools repository is at https://github.com/WordPress/playground-tools/.

+ + + +

:::info Contribute to Playground Tools

+ + + +

This guide includes links to the main repository, but all the steps and options apply for both. If you're interested in the plugins or local development tools—start there.

+ + + +

:::

+ + + +

Browse the list of open issues to find what to work on. The Good First Issue label is a recommended starting point for first-time contributors.

+ + + +

Be sure to review the following resources before you begin:

+ + + + + + + +

Contribute Pull Requests

+ + + +

Fork the Playground repository and clone it to your local machine. To do that, copy and paste these commands into your terminal:

+ + + +
git clone -b trunk --single-branch --depth 1
+
+# replace `YOUR-GITHUB-USERNAME` with your GitHub username:
+git@github.com:YOUR-GITHUB-USERNAME/wordpress-playground.git
+cd wordpress-playground
+npm install
+
+ + + +

Create a branch, make changes, and test it locally by running the following command:

+ + + +
npm run dev
+
+ + + +

Playground will open in a new browser tab and refresh automatically with each change.

+ + + +

When you're ready, commit the changes and submit a Pull Request.

+ + + +

:::info Formatting

+ + + +

We handle code formatting and linting automatically. Relax, type away, and let the machines do the work.

+ + + +

:::

+ + + +

Running a local Multisite

+ + + +

WordPress Multisite has a few restrictions when run locally. If you plan to test a Multisite network using Playground's enableMultisite step, make sure you either change wp-now's default port or set a local test domain running via HTTPS.

+ + + +

To change wp-now's default port to the one supported by WordPress Multisite, run it using the --port=80 flag:

+ + + +
npx @wp-now/wp-now start --port=80
+
+ + + +

There are a few ways to set up a local test domain, including editing your hosts file. If you're unsure how to do that, we suggest installing Laravel Valet and then running the following command:

+ + + +
valet proxy playground.test http://localhost:5400 --secure
+
+ + + +

Your dev server is now available on https://playground.test.

+ + + +

Debugging

+ + + +

Use VS Code and Chrome

+ + + +

If you're using VS Code and have Chrome installed, you can debug Playground in the code editor:

+ + + +
    +
  • Open the project folder in VS Code.
  • + + + +
  • Select Run > Start Debugging from the main menu or press F5/fn+F5.
  • +
+ + + +

Debugging PHP

+ + + +

Playground logs PHP errors in the browser console after every PHP request.

+]]>
+ + 7 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + open + open + + publish + 0 + 0 + page + + 0 +
+ + <![CDATA[03-build-an-app]]> + http://127.0.0.1:9400/?page_id=8 + + admin + + + +

Build an app with WordPress Playground in 5 minutes

+ + + +

WordPress Playground was created as a programmable tool. Below you'll find a few examples of what you can do with it. Each discussed API is described in detail in the APIs section:

+ + + +

import TOCInline from '@theme/TOCInline';

+ + + + + + + +

Embed WordPress on your website

+ + + +

Playground can be embedded on your website using the HTML <iframe> tag as follows:

+ + + +

+
+
+
+

VS Code Playground extension

+ + + +

The Visual Studio Code Playground extension is a friendly zero-setup development environment.

+ + + +
    +
  1. Open VS Code and navigate to the Extensions tab (View > Extensions).
  2. + + + +
  3. In the search bar, type WordPress Playground and click Install.
  4. + + + +
  5. To interact with Playground, click the new icon in the Activity Bar and hit the Start WordPress Server button.
  6. + + + +
  7. A new tab will open in your browser within seconds.
  8. +
+ + + +

wp-now NPM package

+ + + +

@wp-now/wp-now is a CLI tool that allows you to spin up a WordPress site with a single command. No Docker, MySQL, or Apache are required.

+ + + +

Prerequisites

+ + + +

wp-now requires Node.js and NPM. If you haven’t yet, download and install both before you begin.

+ + + +

Depending on the Make WordPress team you contribute to, you may need a different Node.js version than the one you have installed. You can use Node Version Manager (NVM) to switch between versions. Find the installation guide here.

+ + + +

Run wp-now

+ + + +

You don’t have to install wp-now on your device to use it. Navigate to your plugin or theme directory and start wp-now with the following commands:

+ + + +
cd my-plugin-or-theme-directory
+npx @wp-now/wp-now start
+
+ + + +

Ideas for contributors

+ + + +

Create a Gutenberg Pull Request (PR)

+ + + +
    +
  1. Fork the Gutenberg repository in your GitHub account.
  2. + + + +
  3. Then, clone the forked repository to download the files.
  4. + + + +
  5. Install the necessary dependencies and build the code in development mode.
  6. +
+ + + +
git clone git@github.com:WordPress/gutenberg.git
+cd gutenberg
+npm install
+npm run dev
+
+ + + +

:::info

+ + + +

If you’re unsure about the steps listed above, visit the official Gutenberg Project Contributor Guide. Note that in this case, wp-now replaces wp-env.

+ + + +

:::

+ + + +

Open a new terminal tab, navigate to the Gutenberg directory, and start WordPress using wp-now:

+ + + +
cd gutenberg
+npx @wp-now/wp-now start
+
+ + + +

When you’re ready, commit and push your changes to your forked repository on GitHub and open a Pull Request on the Gutenberg repository.

+ + + +

Test a Gutenberg PR

+ + + +
    +
  1. To test other Gutenberg PRs, checkout the branch associated with it.
  2. + + + +
  3. Pull the latest changes to ensure your local copy is up to date.
  4. + + + +
  5. Next, install the necessary dependencies, ensuring your testing environment matches the latest changes.
  6. + + + +
  7. Finally, build the code in development mode.
  8. +
+ + + +
# copy the branch-name from GitHub #
+git checkout branch-name
+git pull
+npm install
+npm run dev
+
+# In a different terminal inside the Gutenberg directory *
+npx @wp-now/wp-now start
+
+ + + +

Test a Gutenberg PR with Playground in the browser

+ + + +

You don’t need a local development environment to test Gutenberg PRs—use Playground to do it directly in the browser.

+ + + +
    +
  1. Copy the ID of the PR you’d like to test (pick one from the list of open Pull Requests).
  2. + + + +
  3. Open Playground’s Gutenberg PR Previewer and paste the ID you copied.
  4. + + + +
  5. Once you click Go, Playground will verify the PR is valid and open a new tab with the relevant PR, allowing you to review the proposed changes.
  6. +
+ + + +

Translate WordPress Plugins with Playground in the browser

+ + + +

You can translate supported WordPress Plugins by loading the plugin you want to translate and using Inline Translation. If the plugin developers have added the option, you'll find the Translate Live link on the top right toolbar of the translation view. You can read more about this exciting new option in this Polyglots blog post.

+ + + +

Get help and contribute to WordPress Playground

+ + + +

Have a question or an idea for a new feature? Found a bug? Something’s not working as expected? We’re here to help:

+ + + + +]]> + + 28 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + open + open + + publish + 0 + 0 + page + + 0 + + + <![CDATA[08-browser-concepts.blockhtml]]> + http://127.0.0.1:9400/?page_id=29 + + admin + + + +

Running PHP apps in the browser with ServiceWorkers and Worker Threads

+ + + +

On a high level, WordPress Playground works in web browsers as follows:

+ + + +
    +
  • The index.html file on playground.wordpress.net loads the remote.html file via an <iframe src="/remote.html">.
  • + + + +
  • remote.html starts a Worker Thread and a ServiceWorker and sends back the download progress information.
  • + + + +
  • The Worker Thread starts PHP and populates the filesystem with a WordPress patched to run on SQLite.
  • + + + +
  • The ServiceWorker starts intercepting all HTTP requests and forwarding them to the Worker Thread.
  • + + + +
  • remote.html creates an <iframe src="/index.php">, and the Service Worker forwards the index.php request to the Worker Thread where the WordPress homepage is rendered.
  • +
+ + + +

Visually, it looks like this:

+ + + +
Architecture overview
+ + + +

High-level ideas

+ + + +

The @php-wasm/web is built on top of the following ideas:

+ + + +
    +
  • Browser tab orchestrates everything – The browser tab is the main program. Closing or reloading it means destroying the entire execution environment.
  • + + + +
  • Iframe-based rendering – Every response produced by the PHP server must be rendered in an iframe to avoid reloading the browser tab when the user clicks on a link.
  • + + + +
  • PHP Worker Thread – The PHP server is slow and must run in a web worker, otherwise handling requests freezes the website UI.
  • + + + +
  • Service Worker routing – All HTTP requests originating in that iframe must be intercepted by a Service worker and passed on to the PHP worker thread for rendering.
  • +
+]]>
+ + 29 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + open + open + + publish + 0 + 0 + page + + 0 +
+ + <![CDATA[03-coding-standards.blockhtml]]> + http://127.0.0.1:9400/?page_id=30 + + admin + + + +

Coding principles

+ + + +

Error messages

+ + + +

A good error message tells the user what to do next. Any ambiguity in errors thrown by Playground public APIs will prompt the developers to open issues.

+ + + +

Consider a network error, for example—can we infer the type of error and display a relevant message summarizing the next steps?

+ + + +
    +
  • Network error: "Your internet connection twitched. Try to reload the page."
  • + + + +
  • 404: "Could not find the file".
  • + + + +
  • 403: "The server blocked access to the file".
  • + + + +
  • CORS: clarify it's a browser security feature and add a link to a detailed explanation (on MDN or another reliable source). Suggest the user move their file somewhere else, like raw.githubusercontent.com, and link to a resource explaining how to set up CORS headers on their servers.
  • +
+ + + +

We handle code formatting and linting automatically. Relax, type away, and let the machines do the work.

+ + + +

Public API

+ + + +

Playground aims to keep the narrowest possible API scope.

+ + + +

Public APIs are easy to add and hard to remove. It only takes one PR to introduce a new API, but it may take a thousand to remove it, especially if other projects have already consumed it.

+ + + +
    +
  • Don't expose unnecessary functions, classes, constants, or other components.
  • +
+ + + +

Blueprints

+ + + +

Blueprints are the primary way to interact with Playground. These JSON files describe a set of steps that Playground executes in order.

+ + + +

Guidelines

+ + + +

Blueprint steps should be concise and focused. They should do one thing and do it well.

+ + + +
    +
  • If you need to create a new step, try refactoring an existing one first.
  • + + + +
  • If that's not enough, ensure the new step delivers a new capability. Don't replicate the functionality of existing steps.
  • + + + +
  • Assume the step would be called more than once.
  • + + + +
  • Assume it would run in a specific order.
  • + + + +
  • Add unit tests to verify that.
  • +
+ + + +

Blueprints should be intuitive and straightforward.

+ + + +
    +
  • Don't require arguments that can be optional.
  • + + + +
  • Use plain arguments. For example, slug instead of path.
  • + + + +
  • Define constants in virtual JSON files—don't modify PHP files.
  • + + + +
  • Define a TypeScript type for the Blueprint. That's how Playground generates its JSON schema.
  • + + + +
  • Write a function to handle a Blueprint step. Accept the argument of the type you defined.
  • + + + +
  • Provide a usage example in the doc string. It's automatically reflected in the docs.
  • +
+]]>
+ + 30 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + open + open + + publish + 0 + 0 + page + + 0 +
+ + <![CDATA[02-wasm-php-overview.blockhtml]]> + http://127.0.0.1:9400/?page_id=31 + + admin + + + +

WebAssembly PHP

+ + + +

WordPress Playground builds the PHP interpreter to WebAssembly using Emscripten and a dedicated pipeline.

+ + + +
Building C programs to WebAssembly
+ + + +

Building PHP to WebAssembly is very similar to building vanilla PHP. The wasm build required adjusting a function signature here, forcing a config variable there, and applying a few small patches, but there are relatively few adjustments involved.

+ + + +
Building PHP to WebAssembly
+ + + +

However, vanilla PHP builds aren't very useful in the browser. As a server software, PHP doesn't have a JavaScript API to pass the request body, upload files, or populate the php://stdin stream. WordPress Playground had to build one from scratch. The WebAssembly binary comes with a dedicated PHP API module written in C and a JavaScript PHP class that exposes methods like writeFile() or run().

+ + + +

Because every PHP version is just a static .wasm file, the PHP version switcher is actually pretty boring. It simply tells the browser to download, for example, php_7_3.wasm instead of, say, php_8_2.wasm.

+ + + +
Building different versions of PHP to WebAssembly
+ + + +

Networking support varies between platforms

+ + + +

When it comes to networking, WebAssembly programs are limited to calling JavaScript APIs. It is a safety feature, but also presents a challenge. How do you support low-level, synchronous networking code used by PHP with the high-level asynchronous APIs available in JavaScript?

+ + + +

In Node.js, the answer involves a WebSocket to TCP socket proxy, Asyncify, and patching deep PHP internals like php_select. It's complex, but there's a reward. The Node.js-targeted PHP build can request web APIs, install composer packages, and even connect to a MySQL server.

+ + + +

In the browser, networking is supported to a limited extent. Network calls initiated using wp_safe_remote_get, like the ones in the plugin directory or the font library, are translated into fetch() calls and succeed if the remote server sends the correct CORS headers. However, full support for arbitrary HTTPS connections involves opening a raw TCP socket, which is not possible in the browser. There is an open GitHub issue that explores possible ways of addressing this problem.

+]]>
+ + 31 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + open + open + + publish + 0 + 0 + page + + 0 +
+ + <![CDATA[17-browser-wordpress.blockhtml]]> + http://127.0.0.1:9400/?page_id=32 + + admin + + + +

Bundling WordPress for the browser

+ + + +

The web bundler Dockerfile turns a vanilla WordPress into a browser-optimized one:

+ + + +
    +
  • Makes WordPress run on SQLite using the official drop-in plugin as MySQL is unsupported in the browser.
  • + + + +
  • Reduces the WordPress website size from about 70MB to about 10MB, or 5MB compressed.
  • + + + +
  • Runs the WordPress installation wizard.
  • + + + +
  • Bundles WordPress as a data dependency
  • +
+ + + +

Build a new bundle with nx bundle-wordpress playground-wordpress-builds --wp-version=<version>, e.g.:

+ + + +
nx bundle-wordpress playground-wordpress-builds --wp-version=6.1
+
+ + + +

The bundler outputs:

+ + + +
    +
  • packages/playground/wordpress-builds/public/wp-6.1.zip – zipped WordPress files
  • + + + +
  • packages/playground/wordpress-builds/public/wp-6.1/ – a directory with static assets for the specified WordPress versions
  • +
+ + + +

Consult the web bundler Dockerfile for more details (like the list of supported WordPress versions) and modify it to customize the default WordPress installation.

+]]>
+ + 32 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + open + open + + publish + 0 + 0 + page + + 0 +
+ + <![CDATA[03-php-wasm-node.blockhtml]]> + http://127.0.0.1:9400/?page_id=33 + + admin + + + +

Using WordPress Playground in Node.js

+ + + +

Because WordPress Playground is a WebAssembly project, you can also use it in Node.js.

+ + + +

If you need low-level control over the underlying WebAssembly PHP build, take a look at the @php-wasm/node package which ships the PHP WebAssembly runtime. This package is at the core of all WordPress Playground tools for Node.js.

+ + + +

:::info API reference

+ + + +

Consult the complete list of Classes, Functions, Interfaces, and Type Aliases.

+ + + +

:::

+ + + +

import PHPWASMNode from '@php-wasm/node/\README.md';

+ + + + +]]>
+ + 33 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + 0000-00-00 00:00:00 + open + open + + publish + 0 + 0 + page + + 0 +
+ + <![CDATA[02-using-blueprints.blockhtml]]> + http://127.0.0.1:9400/?page_id=34 + + admin + + + +

Using Blueprints

+ + + +

You can use Blueprints in one of two ways:

+ + + +
    +
  • By passing them as a URL fragment to the Playground.
  • + + + +
  • By using the JavaScript API.
  • +
+ + + +

URL Fragment

+ + + +

The easiest way to start using Blueprints is to paste one into the URL "fragment" on the WordPress Playground website, e.g. https://playground.wordpress.net/#{"preferredVersions....

+ + + +

For example, to create a Playground with specific versions of WordPress and PHP you would use the following Blueprint:

+ + + +
{
+	"$schema": "https://playground.wordpress.net/blueprint-schema.json",
+	"preferredVersions": {
+		"php": "7.4",
+		"wp": "6.5"
+	}
+}
+
+ + + +

And then you would go to
https://playground.wordpress.net/#{"preferredVersions": {"php":"7.4", "wp":"6.5"}}.

+ + + +

You won't have to paste links to follow along. We'll use code examples with a "Try it out" button that will automatically run the examples for you:

+ + + +

import BlueprintExample from '@site/src/components/Blueprints/BlueprintExample.mdx';

+ + + +

"preferredVersions": {
"php": "7.4",
"wp": "6.5"
}
}} />

+ + + +

Base64 encoded Blueprints

+ + + +

Some tools, including GitHub, might not format the Blueprint correctly when pasted into the URL. In such cases, encode your Blueprint in Base64 and append it to the URL. For example, that's the above Blueprint in Base64 format: eyIkc2NoZW1hIjogImh0dHBzOi8vcGxheWdyb3VuZC53b3JkcHJlc3MubmV0L2JsdWVwcmludC1zY2hlbWEuanNvbiIsInByZWZlcnJlZFZlcnNpb25zIjogeyJwaHAiOiAiNy40Iiwid3AiOiAiNi41In19.

+ + + +

To run it, go to https://playground.wordpress.net/#eyIkc2NoZW1hIjogImh0dHBzOi8vcGxheWdyb3VuZC53b3JkcHJlc3MubmV0L2JsdWVwcmludC1zY2hlbWEuanNvbiIsInByZWZlcnJlZFZlcnNpb25zIjogeyJwaHAiOiAiNy40Iiwid3AiOiAiNi41In19

+ + + +

Load Blueprint from a URL

+ + + +

When your Blueprint gets too unwieldy, you can load it via the ?blueprint-url query parameter in the URL, like this:

+ + + +

https://playground.wordpress.net/?blueprint-url=https://raw.githubusercontent.com/adamziel/blueprints/trunk/blueprints/latest-gutenberg/blueprint.json

+ + + +

Note that the Blueprint must be publicly accessible and served with the correct Access-Control-Allow-Origin header:

+ + + +
Access-Control-Allow-Origin: *
+
+ + + +

JavaScript API

+ + + +

You can also use Blueprints with the JavaScript API using the startPlaygroundWeb() function from the @wp-playground/client package. Here's a small, self-contained example you can run on JSFiddle or CodePen:

+ + + +