diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 218511f..30ca994 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -12,6 +12,10 @@ jobs: matrix: php: ['8.1', '8.2', '8.3', '8.4'] stability: [prefer-lowest, prefer-stable] + include: + - php: '8.5' + flags: "--ignore-platform-req=php" + stability: prefer-stable steps: - name: Checkout code uses: actions/checkout@v4 @@ -45,11 +49,16 @@ jobs: - name: Run Unit tests with coverage run: composer phpunit -- ${{ matrix.phpunit-flags }} + if: ${{ matrix.php != '8.5' }} + + - name: Run Unit tests without coverage + run: vendor/bin/phpunit --no-coverage + if: ${{ matrix.php == '8.5' }} - name: Run static analysis run: composer phpstan - if: ${{ matrix.php == '8.3' && matrix.stability == 'prefer-stable'}} + if: ${{ matrix.php == '8.4' && matrix.stability == 'prefer-stable'}} - name: Run Coding style rules run: composer phpcs:fix - if: ${{ matrix.php == '8.3' && matrix.stability == 'prefer-stable'}} + if: ${{ matrix.php == '8.4' && matrix.stability == 'prefer-stable'}} diff --git a/.php-cs-fixer.php b/.php-cs-fixer.php index 79865d7..28ec97a 100644 --- a/.php-cs-fixer.php +++ b/.php-cs-fixer.php @@ -7,6 +7,7 @@ $config = new PhpCsFixer\Config(); return $config + ->setUnsupportedPhpVersionAllowed(true) ->setRules([ '@PSR12' => true, 'array_syntax' => ['syntax' => 'short'], diff --git a/CHANGELOG.md b/CHANGELOG.md index c72e73d..9bc34e5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,12 +2,23 @@ All Notable changes to `bakame/html-table` will be documented in this file. +# [Next](https://github.com/bakame-php/html-table/compare/0.5.0...main) - TBD + +## What's Changed + +* **BC BREAK:** the `Table` class now implements the `TabularDataProvider` instead of the `TabularDataReader` interface. +* **BC BREAK:** the `ParserError` class now extends the `Exception` instead of the `InvalidArgumentException` exception class. 
+* **BC BREAK:** the `Parser::withoutFormatter` is deprecated; use `Parser::withFormatter(null)` instead. +* The `Parser::new` static constructor is deprecated; use the default constructor instead. +* Boolean parameters are now replaced by `Enum` for maintenance and readability. +* Added `SplFileInfo` support to `parseHtml` and `parseFile` + # [0.5.0](https://github.com/bakame-php/html-table/compare/0.4.0...0.5.0) - 2025-07-06 ## What's Changed * Upgrade dependencies on `aide-error` to version `0.2.0` -* fix use statement by @tacman in https://github.com/bakame-php/html-table/pull/6 +* fix use statements by @tacman in https://github.com/bakame-php/html-table/pull/6 ## New Contributors diff --git a/README.md b/README.md index 22f3583..aeaf4d3 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ [![Sponsor development of this project](https://img.shields.io/badge/sponsor%20this%20package-%E2%9D%A4-ff69b4.svg?style=flat-square)](https://github.com/sponsors/nyamsprod) `bakame/html-table` is a small PHP package that allows you to parse, import and manipulate -tabular data represented as HTML Table. Once installed you will be able to do the following: +tabular data represented as HTML Table. Once installed, you will be able to do the following: ```php use Bakame\TabularData\HtmlTable\Parser; @@ -18,6 +18,7 @@ $table = Parser::new() ->parseFile('https://www.bbc.com/sport/football/tables'); $table + ->getTabularData() ->filter(fn (array $row) => (int) $row['points'] >= 10) ->sorted(fn (array $rowA, array $rowB) => (int) $rowB['for'] <=> (int) $rowA['for']) ->fetchPairs('team', 'for'); @@ -35,7 +36,7 @@ $table ## System Requirements -**league\csv 9.23.0** library is required. (since version 0.4.0). +**league\csv 9.25.0** library is required. (since version 0.6.0). 
## Installation @@ -62,7 +63,6 @@ use Bakame\HtmlTable\Parser; $parser = Parser::new() ->ignoreTableHeader() ->ignoreXmlErrors() - ->withoutFormatter() ->tableCaption('This is a beautiful table'); ``` @@ -74,7 +74,7 @@ If parsing is not possible a `ParseError` exception will be thrown. ```php use Bakame\HtmlTable\Parser; -$parser = Parser::new(); +$parser = new Parser(); $table = $parser->parseHtml('...
'); $table = $parser->parseFile('path/to/html/file.html'); @@ -129,24 +129,25 @@ $html = << HTML; -$table = Parser::new()->parseHtml($html); +$table = (new Parser())->parseHtml($html); $table->getCaption(); //returns 'Songs' $table->getHeader(); //returns ['Title','Singer', 'Country'] -$table->nth(2); //returns ["Title" => "Nzinzi", "Singer" => "Emeneya", "Country" => "DR Congo"] -json_encode($table->slice(0, 1)); -//{"caption":"Songs","header":["Title","Singer","Country"],"rows":[{"Title":"Nakei Nairobi","Singer":"Mbilia Bel","Country":"DR Congo"}]} +$tableData = $table->getTabularData(); +$tableData->nth(2); //returns ["Title" => "Nzinzi", "Singer" => "Emeneya", "Country" => "DR Congo"] +json_encode($tableData->slice(0, 1)); +//[{"Title":"Nakei Nairobi","Singer":"Mbilia Bel","Country":"DR Congo"}] ``` #### Default configuration -By default, when calling the `Parser::new()` named constructor the parser will: +By default, when calling `new Parser()`, the parser will: - try to parse the first table found in the page - expect the table header row to be the first `tr` found in the `thead` section of your table - exclude the table `thead` section when extracting the table content. - ignore XML errors. - have no formatter attached. -- have no default caption to used if none is present in the table. +- have no default caption to be used if none is present in the table. Each of the following settings can be changed to improve the conversion against your business rules: @@ -163,9 +164,9 @@ If the expression is valid, and a list of table is found, the first result will ```php use Bakame\HtmlTable\Parser; -$parser = Parser::new()->tablePosition('table-id'); // parses the -$parser = Parser::new()->tablePosition(3); // parses the 4th table of the page -$parser = Parser::new()->tableXPathPosition("//main/div/table"); +$parser = (new Parser())->tablePosition('table-id'); // parses the
+$parser = (new Parser())->tablePosition(3); // parses the 4th table of the page +$parser = (new Parser())->tableXPathPosition("//main/div/table"); //parse the first table that matches the xpath expression ``` @@ -179,8 +180,8 @@ You can optionally define a caption for your table if none is present or found d ```php use Bakame\HtmlTable\Parser; -$parser = Parser::new()->tableCaption('this is a generated caption'); -$parser = Parser::new()->tableCaption(null); // remove any default caption set +$parser = (new Parser())->tableCaption('this is a generated caption'); +$parser = (new Parser())->tableCaption(null); // remove any default caption set ``` ### tableHeader, tableHeaderPosition, ignoreTableHeader and resolveTableHeader @@ -197,7 +198,7 @@ Tells where to locate and resolve the table header use Bakame\HtmlTable\Parser; use Bakame\HtmlTable\Section; -$parser = Parser::new()->tableHeaderPosition(Section::Thead, 3); +$parser = (new Parser())->tableHeaderPosition(Section::Thead, 3); // header is the 4th row in the table section ``` @@ -227,25 +228,25 @@ If no resolution is done, no header will be included in the returned `Table` ins ```php use Bakame\HtmlTable\Parser; -$parser = Parser::new()->ignoreTableHeader(); // no table header will be resolved -$parser = Parser::new()->resolveTableHeader(); // will attempt to resolve the table header +$parser = (new Parser())->ignoreTableHeader(); // no table header will be resolved +$parser = (new Parser())->resolveTableHeader(); // will attempt to resolve the table header ``` #### tableHeader -You can specify directly the header of your table and override any other table header +You can directly specify the header of your table and override any other table header related configuration with this configuration ```php use Bakame\HtmlTable\Parser; use Bakame\HtmlTable\Section; -$parser = Parser::new()->tableHeader(['rank', 'team', 'winner']); +$parser = (new Parser())->tableHeader(['rank', 'team', 'winner']); ``` **If you 
specify a non-empty array as the table header, it will take precedence over any other table header related options.** -**Because it is a tabular data each cell MUST be unique otherwise an exception will be thrown** +**Because it is tabular data, each cell MUST be unique otherwise an exception will be thrown** You can skip or re-arrange the source columns by skipping them by their offsets and/or by re-ordering the offsets. @@ -254,8 +255,8 @@ re-ordering the offsets. use Bakame\HtmlTable\Parser; use Bakame\HtmlTable\Section; -$parser = Parser::new()->tableHeader([3 => 'rank', 7 => 'winner', 5 => 'team']); -// only 3 column will be extracted the 4th, 6th and 8th columns +$parser = (new Parser())->tableHeader([3 => 'rank', 7 => 'winner', 5 => 'team']); +// only 3 columns will be extracted the 4th, 6th and 8th columns // and re-arrange as 'rank' first and 'team' last // if a column is missing its value will be PHP `null` type ``` @@ -268,34 +269,34 @@ Tells which section should be parsed based on the `Section` enum use Bakame\HtmlTable\Parser; use Bakame\HtmlTable\Section; -$parser = Parser::new()->includeSection(Section::Tbody); // thead and tfoot are included during parsing -$parser = Parser::new()->excludeSection(Section::Tr, Section::Tfoot); // table direct tr children and tfoot are not included during parsing +$parser = (new Parser())->includeSection(Section::Tbody); // thead and tfoot are included during parsing +$parser = (new Parser())->excludeSection(Section::Tr, Section::Tfoot); // table direct tr children and tfoot are not included during parsing ``` **By default, the `thead` section is not parse. 
If a `thead` row is selected to be the header, it will be parsed independently of this setting.** -**⚠️Tips:** to be sure of which sections will be modified, first remove all previous setting +**⚠️Tips:** to be sure of which sections will be modified, first remove all previous settings before applying your configuration as shown below: ```diff -- Parser::new()->includeSection(Section::tbody); -+ Parser::new()->excludeSection(...Section::cases())->includeSection(Section::tbody); +- (new Parser())->includeSection(Section::Tbody); ++ (new Parser())->excludeSection(...Section::cases())->includeSection(Section::Tbody); ``` The first call will still include the `tfoot` and the `tr` sections, whereas the second call -remove any previous setting guaranting that only the `tbody` if present will be parsed. +removes any previous setting, guaranteeing that only the `tbody`, if present, will be parsed. ### withFormatter and withoutFormatter -Adds or remove a record formatter applied to the data extracted from the table before you +Add or remove a record formatter applied to the data extracted from the table before you can access it. The header is not affected by the formatter if it is defined. ```php use Bakame\HtmlTable\Parser; -$parser = Parser::new()->withFormatter($formatter); // attach a formatter to the parser -$parser = Parser::new()->withoutFormatter(); // removed the attached formatter if it exists +$parser = (new Parser())->withFormatter($formatter); // attach a formatter to the parser +$parser = (new Parser())->withFormatter(null); // remove the attached formatter if it exists ``` The formatter closure signature should be: @@ -304,7 +305,7 @@ The formatter closure signature should be: function (array $record): array; ``` -If a header was defined or specified, the submitted record will have the header definition set, +If a header was defined or specified, the submitted record will have the header definition set; otherwise an array list is provided. 
The following formatter will work on any table content as long as it is defined as a string. @@ -333,8 +334,8 @@ Tells whether the parser should ignore or throw in case of malformed HTML conten ```php use Bakame\HtmlTable\Parser; -$parser = Parser::new()->ignoreXmlErrors(); // ignore the XML errors -$parser = Parser::new()->failOnXmlErrors(3); // throw on XML errors +$parser = (new Parser())->ignoreXmlErrors(); // ignore the XML errors +$parser = (new Parser())->failOnXmlErrors(3); // throw on XML errors ``` ## Testing diff --git a/composer.json b/composer.json index 4e57a13..525b469 100644 --- a/composer.json +++ b/composer.json @@ -30,9 +30,7 @@ "ext-libxml": "*", "ext-mbstring": "*", "ext-simplexml": "*", - "bakame/aide-enums": "^0.1.0", - "bakame/aide-error": "^0.2.0", - "league/csv": "^9.23.0" + "league/csv": "^9.25.0" }, "require-dev": { "ext-curl": "*", @@ -54,7 +52,7 @@ } }, "scripts": { - "phpcs": "PHP_CS_FIXER_IGNORE_ENV=1 php-cs-fixer fix -vvv --diff --dry-run --allow-risky=yes --ansi", + "phpcs": "php-cs-fixer fix -vvv --diff --dry-run --allow-risky=yes --ansi", "phpcs:fix": "php-cs-fixer fix -vvv --allow-risky=yes --ansi", "phpstan": "phpstan analyse -c phpstan.neon --ansi --memory-limit=192M", "phpunit": "XDEBUG_MODE=coverage phpunit --coverage-text", diff --git a/src/Feature.php b/src/Feature.php new file mode 100644 index 0000000..e8c59b5 --- /dev/null +++ b/src/Feature.php @@ -0,0 +1,11 @@ + $tableHeader * @param array
$includedSections */ - private function __construct( - private readonly string $tableExpression, - private readonly ?string $caption, - private readonly array $tableHeader, - private readonly bool $ignoreTableHeader, - private readonly string $tableHeaderExpression, - private readonly array $includedSections, - private readonly ?Closure $formatter, - private readonly bool $throwOnXmlErrors, + public function __construct( + private readonly string $tableExpression = '(//table)[1]', + private readonly ?string $caption = null, + private readonly array $tableHeader = [], + private readonly Feature $ignoreTableHeader = Feature::Disabled, + private readonly string $tableHeaderExpression = '(//table/thead/tr)[1]', + private readonly array $includedSections = [Section::Tbody, Section::Tfoot, Section::Tr], + private readonly ?Closure $formatter = null, + private readonly Feature $throwOnXmlErrors = Feature::Disabled, ) { } - public static function new(): self - { - return new self( - '(//table)[1]', - null, - [], - false, - '(//table/thead/tr)[1]', - [Section::Tbody, Section::Tfoot, Section::Tr], - null, - false, - ); - } - public function tableXPathPosition(string $expression): self { - $query = (new DOMXPath(new DOMDocument()))->query(...); - $domXPath = Cloak::warning($query); + if ($expression === $this->tableExpression) { + return $this; + } - return match (true) { - $expression === $this->tableExpression => $this, - false === $domXPath($expression) => throw new ParserError( + try { + Warning::trap((new DOMXPath(new DOMDocument()))->query(...), $expression); + } catch (ErrorException $exception) { + throw new ParserError( message: 'The xpath expression `'.$expression.'` is invalid.', - previous: $domXPath->errors()->last() - ), - default => new self( - $expression, - $this->caption, - $this->tableHeader, - $this->ignoreTableHeader, - $this->tableHeaderExpression, - $this->includedSections, - $this->formatter, - $this->throwOnXmlErrors, - ), - }; + previous: $exception 
+ ); + } + + return new self( + $expression, + $this->caption, + $this->tableHeader, + $this->ignoreTableHeader, + $this->tableHeaderExpression, + $this->includedSections, + $this->formatter, + $this->throwOnXmlErrors, + ); } /** @@ -135,13 +132,13 @@ public function tableHeader(array $headerRow): self public function ignoreTableHeader(): self { - return match ($this->ignoreTableHeader) { + return match (Feature::Enabled === $this->ignoreTableHeader) { true => $this, false => new self( $this->tableExpression, $this->caption, $this->tableHeader, - true, + Feature::Enabled, $this->tableHeaderExpression, $this->includedSections, $this->formatter, @@ -152,13 +149,13 @@ public function ignoreTableHeader(): self public function resolveTableHeader(): self { - return match ($this->ignoreTableHeader) { + return match (Feature::Enabled === $this->ignoreTableHeader) { false => $this, true => new self( $this->tableExpression, $this->caption, $this->tableHeader, - false, + Feature::Disabled, $this->tableHeaderExpression, $this->includedSections, $this->formatter, @@ -258,7 +255,7 @@ public function excludeSection(Section ...$sections): self public function failOnXmlErrors(): self { - return match ($this->throwOnXmlErrors) { + return match (Feature::Enabled === $this->throwOnXmlErrors) { true => $this, false => new self( $this->tableExpression, @@ -268,14 +265,14 @@ public function failOnXmlErrors(): self $this->tableHeaderExpression, $this->includedSections, $this->formatter, - true, + Feature::Enabled, ), }; } public function ignoreXmlErrors(): self { - return match ($this->throwOnXmlErrors) { + return match (Feature::Enabled === $this->throwOnXmlErrors) { false => $this, true => new self( $this->tableExpression, @@ -285,29 +282,15 @@ public function ignoreXmlErrors(): self $this->tableHeaderExpression, $this->includedSections, $this->formatter, - false, + Feature::Disabled, ), }; } - public function withFormatter(Closure $formatter): self - { - return new self( - 
$this->tableExpression, - $this->caption, - $this->tableHeader, - $this->ignoreTableHeader, - $this->tableHeaderExpression, - $this->includedSections, - $formatter, - $this->throwOnXmlErrors, - ); - } - - public function withoutFormatter(): self + public function withFormatter(?Closure $formatter): self { - return match (null) { - $this->formatter => $this, + return match (true) { + $formatter === $this->formatter => $this, default => new self( $this->tableExpression, $this->caption, @@ -315,7 +298,7 @@ public function withoutFormatter(): self $this->ignoreTableHeader, $this->tableHeaderExpression, $this->includedSections, - null, + $formatter, $this->throwOnXmlErrors, ), }; @@ -339,7 +322,7 @@ public function tableCaption(?string $caption): self } /** - * @param resource|string $filenameOrStream + * @param SplFileInfo|resource|string $filenameOrStream * @param resource|null $filenameContext * * @throws ParserError @@ -349,20 +332,21 @@ public function tableCaption(?string $caption): self */ public function parseFile(mixed $filenameOrStream, $filenameContext = null): Table { + if ($filenameOrStream instanceof SplFileInfo) { + return $this->parseHtml($filenameOrStream); + } + if (is_resource($filenameOrStream)) { return $this->parseHtml($this->streamToString($filenameOrStream)); } - $fopen = Cloak::warning(fopen(...)); - $resource = $fopen(...match ($filenameContext) { - null => [$filenameOrStream, 'r'], - default => [$filenameOrStream, 'r', false, $filenameContext], - }); - - if (!is_resource($resource)) { + try { + /** @var resource $resource */ + $resource = Warning::trap(fopen(...), ...['filename' => $filenameOrStream, 'mode' => 'r', 'context' => $filenameContext]); + } catch (ErrorException $exception) { throw new ParserError( message: '`'.$filenameOrStream.'`: failed to open stream: No such file or directory.', - previous: $fopen->errors()->last() + previous: $exception ); } @@ -375,27 +359,23 @@ public function parseFile(mixed $filenameOrStream, 
$filenameContext = null): Tab /** * @throws ParserError * @throws SyntaxError + * @throws CannotInsertRecord * * @return Table> */ - public function parseHtml(DOMDocument|DOMElement|SimpleXMLElement|Stringable|string $source): Table + public function parseHtml(SplFileInfo|DOMDocument|DOMElement|SimpleXMLElement|Stringable|string $source): Table { /** @var DOMNodeList $query */ $query = (new DOMXPath($this->sourceToDomDocument($source)))->query($this->tableExpression); $table = $query->item(0); - if (!$table instanceof DOMElement) { - throw new ParserError('The HTML table could not be found in the submitted html.'); - } - + $table instanceof DOMElement || throw new ParserError('The HTML table could not be found in the submitted html.'); $tagName = strtolower($table->nodeName); - if ('table' !== $tagName) { - throw new ParserError('Expected a table element to be selected; received `'.$tagName.'` instead.'); - } + 'table' === $tagName || throw new ParserError('Expected a table element to be selected; received `'.$tagName.'` instead.'); $xpath = new DOMXPath($this->sourceToDomDocument($table)); $header = match (true) { [] !== $this->tableHeader => $this->tableHeader, - $this->ignoreTableHeader => [], + Feature::Enabled === $this->ignoreTableHeader => [], default => $this->extractTableHeader($xpath), }; @@ -421,20 +401,20 @@ public function parseHtml(DOMDocument|DOMElement|SimpleXMLElement|Stringable|str */ private function streamToString($stream): string { - $stream_get_contents = Cloak::warning(stream_get_contents(...)); - /** @var string|false $html */ - $html = $stream_get_contents($stream); + try { + /** @var string $result */ + $result = Warning::trap(stream_get_contents(...), $stream); - return match (false) { - $html => throw new ParserError('The resource could not be read.', 0, $stream_get_contents->errors()->last()), - default => $html, - }; + return $result; + } catch (ErrorException $exception) { + throw new ParserError(message: 'The resource could not be 
read.', previous: $exception); + } } /** * @throws ParserError */ - private function sourceToDomDocument(DOMDocument|SimpleXMLElement|DOMElement|Stringable|string $document): DOMDocument + private function sourceToDomDocument(SplFileInfo|DOMDocument|SimpleXMLElement|DOMElement|Stringable|string $document): DOMDocument { if ($document instanceof DOMDocument) { return $document; @@ -453,13 +433,22 @@ private function sourceToDomDocument(DOMDocument|SimpleXMLElement|DOMElement|Str return $dom; } + $content = (string) $document; + if ($document instanceof SplFileInfo) { + $content = ''; + $file = $document->openFile(); + while (!$file->eof()) { + $content .= $file->fgets(); + } + } + libxml_use_internal_errors(true); - $dom->loadHTML((string) $document); + $dom->loadHTML($content, LIBXML_NOWARNING | LIBXML_NOERROR); $errors = libxml_get_errors(); libxml_clear_errors(); return match (true) { - $this->throwOnXmlErrors && [] !== $errors => throw ParserError::dueToLibXmlErrors($errors), + Feature::Enabled === $this->throwOnXmlErrors && [] !== $errors => throw ParserError::dueToLibXmlErrors($errors), default => $dom, }; } @@ -547,7 +536,7 @@ private function extractHeaderRow(DOMElement $tr): array $tr->setAttribute(self::HEADER_ROW_ATTRIBUTE_NAME, 'true'); } - return $headerRow; + return array_map(fn (string|null $item): string => trim((string) $item, "\u{A0} \t\n\r\0\x0B"), $headerRow); } /** @@ -631,4 +620,30 @@ private function combineArray(array $record, array $header): array return $row; } + + /** + * DEPRECATION WARNING! This method will be removed in the next major point release. + * @deprecated since version 0.6.0 + * @see self::withFormatter() + * + * @codeCoverageIgnore + */ + #[Deprecated(message:'use Bakame\TabularData\HtmlTable\Parser::withFormatter() instead', since:'bakame/html-table:0.6.0')] + public function withoutFormatter(): self + { + return $this->withFormatter(null); + } + + /** + * DEPRECATION WARNING! 
This method will be removed in the next major point release. + * @deprecated since version 0.6.0 + * @see self::__construct() + * + * @codeCoverageIgnore + */ + #[Deprecated(message:'use Bakame\TabularData\HtmlTable\Parser::__construct() instead', since:'bakame/html-table:0.6.0')] + public static function new(): self + { + return new self(); + } } diff --git a/src/ParserError.php b/src/ParserError.php index 95b98fb..400baed 100644 --- a/src/ParserError.php +++ b/src/ParserError.php @@ -4,7 +4,7 @@ namespace Bakame\TabularData\HtmlTable; -use InvalidArgumentException; +use Exception; use LibXMLError; use function array_count_values; @@ -16,7 +16,7 @@ use const PHP_EOL; -class ParserError extends InvalidArgumentException +class ParserError extends Exception { /** @var array */ private array $duplicateColumnNames = []; diff --git a/src/ParserErrorTest.php b/src/ParserErrorTest.php index ce51ca0..3f2e9ee 100644 --- a/src/ParserErrorTest.php +++ b/src/ParserErrorTest.php @@ -4,9 +4,11 @@ namespace Bakame\TabularData\HtmlTable; +use PHPUnit\Framework\Attributes\CoversClass; use PHPUnit\Framework\Attributes\Test; use PHPUnit\Framework\TestCase; +#[CoversClass(ParserError::class)] final class ParserErrorTest extends TestCase { #[Test] diff --git a/src/ParserTest.php b/src/ParserTest.php index 174733e..35e6226 100644 --- a/src/ParserTest.php +++ b/src/ParserTest.php @@ -7,11 +7,17 @@ use DOMDocument; use DOMElement; use League\Csv\TabularDataReader; +use PHPUnit\Framework\Attributes\CoversClass; use PHPUnit\Framework\Attributes\DataProvider; use PHPUnit\Framework\Attributes\Test; use PHPUnit\Framework\TestCase; use SimpleXMLElement; +#[CoversClass(Warning::class)] +#[CoversClass(Parser::class)] +#[CoversClass(Feature::class)] +#[CoversClass(Section::class)] +#[CoversClass(Table::class)] final class ParserTest extends TestCase { private const HTML = <<
tablePosition(0) - ->tableHeaderPosition(Section::Thead, 0) + ->tableHeaderPosition(Section::Thead) ->includeSection(Section::Tbody, Section::Tfoot, Section::Tr) ->tableHeader([]) ->resolveTableHeader() ->ignoreXmlErrors() - ->withoutFormatter() + ->withFormatter(null) ->tableCaption(null) ); } @@ -63,7 +69,7 @@ public function it_will_throw_if_the_header_contains_duplicate_values(): void $this->expectException(ParserError::class); $this->expectExceptionMessage('The header record contains duplicate column names: `foo`, `toto`.'); - Parser::new()->tableHeader($headerRow); + (new Parser())->tableHeader($headerRow); } #[Test] @@ -71,7 +77,7 @@ public function it_will_throw_if_the_header_does_not_only_contains_string(): voi { $this->expectException(ParserError::class); - Parser::new()->tableHeader(['foo', 1]); /* @phpstan-ignore-line */ + (new Parser())->tableHeader(['foo', 1]); /* @phpstan-ignore-line */ } #[Test] @@ -80,7 +86,7 @@ public function it_will_throw_if_the_identifier_is_invalid(string|int $identifie { $this->expectException(ParserError::class); - Parser::new()->tablePosition($identifier); + (new Parser())->tablePosition($identifier); } /** @@ -102,7 +108,7 @@ public function it_will_throw_if_the_identifier_is_a_negative_integer(): void { $this->expectException(ParserError::class); - Parser::new()->tablePosition(-1); + (new Parser())->tablePosition(-1); } #[Test] @@ -110,7 +116,7 @@ public function it_will_throw_if_the_table_header_row_offset_is_negative(): void { $this->expectException(ParserError::class); - Parser::new()->tableHeaderPosition(Section::Thead, -1); /* @phpstan-ignore-line */ + (new Parser())->tableHeaderPosition(Section::Thead, -1); /* @phpstan-ignore-line */ } #[Test] @@ -118,7 +124,7 @@ public function it_will_throw_if_the_xpath_expression_is_invalid(): void { $this->expectException(ParserError::class); - Parser::new()->tableXPathPosition('//table@@invalid'); + (new Parser())->tableXPathPosition('//table@@invalid'); } #[Test] @@ -129,14 
+135,14 @@ public function it_will_fail_to_load_any_element_other_than_a_table(): void HTML; $this->expectException(ParserError::class); $this->expectExceptionMessage('Expected a table element to be selected; received `p` instead.'); - Parser::new()->tableXPathPosition('//p')->parseHtml($html); + (new Parser())->tableXPathPosition('//p')->parseHtml($html); } #[Test] public function it_can_load_the_first_html_table_found_by_default(): void { - $table = Parser::new()->parseHtml(self::HTML); + $table = (new Parser())->parseHtml(self::HTML); $header = ['prenoms', 'nombre', 'sexe', 'annee']; $row = [ 'prenoms' => 'Abdoulaye', @@ -147,16 +153,16 @@ public function it_can_load_the_first_html_table_found_by_default(): void self::assertSame(['prenoms', 'nombre', 'sexe', 'annee'], $table->getHeader()); self::assertCount(4, $table); - self::assertSame($row, $table->first()); + self::assertSame($row, $table->getTabularData()->first()); - $sliced = $table->slice(0, 1); - self::assertSame(['caption' => null, 'header' => $header, 'rows' => [$row]], $sliced->jsonSerialize()); + $sliced = $table->getTabularData()->slice(0, 1); + self::assertSame([$row], iterator_to_array($sliced)); } #[Test] public function it_can_load_the_first_html_table_found_by_default_without_the_header(): void { - $table = Parser::new()->ignoreTableHeader()->parseHtml(self::HTML); + $table = (new Parser())->ignoreTableHeader()->parseHtml(self::HTML); self::assertSame([], $table->getHeader()); self::assertCount(4, $table); @@ -165,13 +171,13 @@ public function it_can_load_the_first_html_table_found_by_default_without_the_he '15', 'M', '2004', - ], $table->first()); + ], $table->getTabularData()->first()); } #[Test] public function it_can_load_any_html_table_by_occurrence(): void { - $table = Parser::new() + $table = (new Parser()) ->tablePosition(1) ->parseFile(dirname(__DIR__).'/test_files/table.html'); @@ -182,7 +188,7 @@ public function it_can_load_any_html_table_by_occurrence(): void #[Test] public 
function it_can_load_any_html_table_by_attribute_id(): void { - $table = Parser::new() + $table = (new Parser()) ->tablePosition('testb') ->parseFile(dirname(__DIR__).'/test_files/table.html'); @@ -195,7 +201,7 @@ public function it_uses_the_table_first_tr_to_search_for_the_header(): void { /** @var resource $stream */ $stream = fopen(dirname(__DIR__).'/test_files/table.html', 'r'); - $table = Parser::new() + $table = (new Parser()) ->tablePosition('testb') ->tableHeaderPosition(Section::Tr) ->parseFile($stream); @@ -207,7 +213,7 @@ public function it_uses_the_table_first_tr_to_search_for_the_header(): void 'nombre' => '15', 'sexe' => 'M', 'annee' => '2004', - ], $table->first()); + ], $table->getTabularData()->first()); fclose($stream); } @@ -217,7 +223,7 @@ public function it_will_fail_to_load_a_missing_file(): void { $this->expectException(ParserError::class); - Parser::new()->parseFile('/path/tp/my/heart.html'); + (new Parser())->parseFile('/path/tp/my/heart.html'); } #[Test] @@ -236,7 +242,7 @@ public function it_uses_the_table_first_tr_in_the_first_tbody_to_search_for_the_
TABLE; - $table = Parser::new() + $table = (new Parser()) ->tableHeaderPosition(Section::Tbody) ->parseHtml($html); @@ -247,7 +253,7 @@ public function it_uses_the_table_first_tr_in_the_first_tbody_to_search_for_the_ 'nombre' => '15', 'sexe' => 'M', 'annee' => '2004', - ], $table->nth(0)); + ], $table->getTabularData()->nth(0)); } #[Test] @@ -255,7 +261,7 @@ public function it_will_throw_if_the_html_is_malformed(): void { $this->expectExceptionObject(new ParserError('The HTML table could not be found in the submitted html.')); - Parser::new()->parseHtml('vasdfadadf'); + (new Parser())->parseHtml('vasdfadadf'); } #[Test] @@ -263,13 +269,13 @@ public function it_will_throw_if_no_table_is_found(): void { $this->expectExceptionObject(new ParserError('The HTML table could not be found in the submitted html.')); - Parser::new()->parseHtml('
  1. foo
'); + (new Parser())->parseHtml('
  1. foo
'); } #[Test] public function it_will_use_the_submitted_headers(): void { - $parser = Parser::new() + $parser = (new Parser()) ->tableHeader(['firstname', 'count', 'gender', 'year']); $table = $parser->parseHtml(self::HTML); @@ -280,7 +286,7 @@ public function it_will_use_the_submitted_headers(): void 'count' => '15', 'gender' => 'M', 'year' => '2004', - ], $table->first()); + ], $table->getTabularData()->first()); } @@ -299,7 +305,7 @@ public function it_will_rearrange_the_content_with_table_header(): void TABLE; $header = [3 => 'Annee', 2 => 'Sexe', 0 => 'Firstname', 1 => 'Count']; - $table = Parser::new() + $table = (new Parser()) ->tableHeader($header) ->parseHtml($html); @@ -309,10 +315,10 @@ public function it_will_rearrange_the_content_with_table_header(): void 'Sexe' => 'M', 'Firstname' => 'Abel', 'Count' => '14', - ], $table->first()); + ], $table->getTabularData()->first()); $header = [3 => 'Annee', 0 => 'Firstname', 1 => 'Count']; - $table = Parser::new() + $table = (new Parser()) ->tableHeader($header) ->parseHtml($html); @@ -321,7 +327,7 @@ public function it_will_rearrange_the_content_with_table_header(): void 'Annee' => '2004', 'Firstname' => 'Abel', 'Count' => '14', - ], $table->first()); + ], $table->getTabularData()->first()); } #[Test] @@ -338,10 +344,11 @@ public function it_will_duplicate_colspan_data(): void TABLE; - $table = Parser::new()->parseHtml($html); + $table = (new Parser())->parseHtml($html); + $data = $table->getTabularData(); - self::assertSame($table->nth(1), ['Abdoulaye', 'Abdoulaye', 'Abdoulaye', '2004']); - self::assertSame($table->nth(0), ['prenoms', 'nombre', 'sexe', 'annee']); + self::assertSame($data->nth(1), ['Abdoulaye', 'Abdoulaye', 'Abdoulaye', '2004']); + self::assertSame($data->nth(0), ['prenoms', 'nombre', 'sexe', 'annee']); } #[Test] @@ -361,11 +368,13 @@ public function it_will_ignore_the_malformed_header_by_deault(): void $dom = new DOMDocument(); $dom->loadHTML($html); - $table = Parser::new()->parseHtml($dom); + 
$table = (new Parser())->parseHtml($dom); + + $tabularData = $table->getTabularData(); self::assertSame([], $table->getHeader()); - self::assertSame($table->first(), ['Abdoulaye', 'Abdoulaye', 'Abdoulaye', '2004']); - self::assertSame($table->nth(1), ['Abel', '14', 'M', '2004']); + self::assertSame($tabularData->first(), ['Abdoulaye', 'Abdoulaye', 'Abdoulaye', '2004']); + self::assertSame($tabularData->nth(1), ['Abel', '14', 'M', '2004']); } #[Test] @@ -377,7 +386,7 @@ public function it_will_fails_on_malformed_html(): void $this->expectException(ParserError::class); - Parser::new() + (new Parser()) ->failOnXmlErrors() ->parseHtml($html); } @@ -387,7 +396,7 @@ public function it_will_fail_to_load_other_html_tag(): void { $this->expectException(ParserError::class); - Parser::new()->parseHtml(new DOMElement('p', 'I know who you are')); + (new Parser())->parseHtml(new DOMElement('p', 'I know who you are')); } #[Test] @@ -403,7 +412,7 @@ public function it_will_found_no_header(): void /** @var SimpleXMLElement $simpleXML */ $simpleXML = simplexml_load_string($html); - $table = Parser::new() + $table = (new Parser()) ->tableHeaderPosition(Section::Tbody) ->parseHtml($simpleXML); @@ -422,7 +431,7 @@ public function it_will_found_no_header_in_any_section(): void TABLE; - $table = Parser::new() + $table = (new Parser()) ->tableHeaderPosition(Section::Tr) ->parseHtml($html); @@ -444,12 +453,12 @@ public function it_will_use_the_table_footer(): void TABLE; - $table = Parser::new() + $table = (new Parser()) ->excludeSection(Section::Tfoot) ->parseHtml($html); self::assertSame([], $table->getHeader()); - self::assertSame([], $table->first()); + self::assertSame([], $table->getTabularData()->first()); } #[Test] @@ -457,7 +466,7 @@ public function it_uses_the_parser_formatter(): void { /** @var resource $stream */ $stream = fopen(dirname(__DIR__).'/test_files/table.html', 'r'); - $table = Parser::new() + $table = (new Parser()) ->tablePosition('testb') 
->tableHeaderPosition(Section::Tr) ->withFormatter(function (array $record): array { @@ -476,7 +485,7 @@ public function it_uses_the_parser_formatter(): void 'nombre' => 15, 'sexe' => 'M', 'annee' => 2004, - ], $table->first()); + ], $table->getTabularData()->first()); fclose($stream); } @@ -532,18 +541,18 @@ public function it_can_handle_rowspan_and_colspan(): void fn (int $carry, array $record): int => $carry + (array_count_values($record)[$value] ?? 0), 0 ); - $table = Parser::new()->parseHtml($table); + $table = (new Parser())->parseHtml($table); - self::assertSame(2, $reducer($table, 'colspan')); - self::assertSame(2, $reducer($table, 'rowspan')); - self::assertSame(6, $reducer($table, 'colspan+rowspan')); + self::assertSame(2, $reducer($table->getTabularData(), 'colspan')); + self::assertSame(2, $reducer($table->getTabularData(), 'rowspan')); + self::assertSame(6, $reducer($table->getTabularData(), 'colspan+rowspan')); } #[Test] #[DataProvider('providesCaption')] public function it_can_load_the_table_caption(string $table, ?string $defaultCaption, ?string $expected): void { - self::assertSame($expected, Parser::new()->tableCaption($defaultCaption)->parseHtml($table)->getCaption()); + self::assertSame($expected, (new Parser())->tableCaption($defaultCaption)->parseHtml($table)->getCaption()); } /** diff --git a/src/Section.php b/src/Section.php index ac1468e..d771b1e 100644 --- a/src/Section.php +++ b/src/Section.php @@ -4,12 +4,8 @@ namespace Bakame\TabularData\HtmlTable; -use Bakame\Aide\Enum\Helper; - enum Section: string { - use Helper; - case Thead = 'thead'; case Tbody = 'tbody'; case Tfoot = 'tfoot'; @@ -17,12 +13,12 @@ enum Section: string /** * @param int<0, max> $offset + * + * @throws ParserError */ public function xpathRow(int $offset = 0): string { - if ($offset < 0) { /* @phpstan-ignore-line */ - throw new ParserError('The table header row offset must be a positive integer or 0.'); - } + $offset > -1 || throw new ParserError('The table header row 
offset must be a positive integer or 0.'); /* @phpstan-ignore-line */ ++$offset; return match ($this) { diff --git a/src/Table.php b/src/Table.php index 581ce86..6c6d2b6 100644 --- a/src/Table.php +++ b/src/Table.php @@ -4,119 +4,29 @@ namespace Bakame\TabularData\HtmlTable; -use Closure; +use Countable; use Iterator; +use IteratorAggregate; use JsonSerializable; +use League\Csv\TabularDataProvider; use League\Csv\TabularDataReader; /** * @template TValue of array * - * @implements TabularDataReader> + * @implements IteratorAggregate */ -final class Table implements TabularDataReader, JsonSerializable +final class Table implements IteratorAggregate, Countable, JsonSerializable, TabularDataProvider { /** - * @param TabularDataReader> $tabularDataReader + * @param TabularDataReader $tabularData */ public function __construct( - private readonly TabularDataReader $tabularDataReader, + private readonly TabularDataReader $tabularData, private readonly ?string $caption = null ) { } - public function count(): int - { - return $this->tabularDataReader->count(); - } - - public function getIterator(): Iterator - { - return $this->tabularDataReader->getIterator(); - } - - /** - * @return array{ - * caption: ?string, - * header: array, - * rows:array> - * } - */ - public function jsonSerialize(): array - { - return [ - 'caption' => $this->caption, - 'header' => $this->getHeader(), - 'rows' => array_values([...$this->tabularDataReader]), - ]; - } - - public function each(Closure $closure): bool - { - return $this->tabularDataReader->each($closure); - } - - public function exists(Closure $closure): bool - { - return $this->tabularDataReader->exists($closure); - } - - /** - * @return array - */ - public function nth(int $nth_record): array - { - return $this->tabularDataReader->nth($nth_record); - } - - /** - * @return array - */ - public function first(): array - { - return $this->tabularDataReader->first(); - } - - /** - * - * @return Table> - */ - public function 
filter(Closure $closure): TabularDataReader - { - return new self($this->tabularDataReader->filter($closure), $this->caption); - } - - public function fetchColumnByName(string $name): Iterator - { - return $this->tabularDataReader->fetchColumnByName($name); - } - - public function fetchColumnByOffset(int $offset): Iterator - { - return $this->tabularDataReader->fetchColumnByOffset($offset); - } - - public function reduce(Closure $closure, mixed $initial = null): mixed - { - return $this->tabularDataReader->reduce($closure, $initial); - } - - /** - * @return Table> - */ - public function slice(int $offset, ?int $length = null): TabularDataReader - { - return new self($this->tabularDataReader->slice($offset, $length), $this->caption); - } - - /** - * @return Table> - */ - public function sorted(Closure $orderBy): TabularDataReader - { - return new self($this->tabularDataReader->sorted($orderBy), $this->caption); - } - public function getCaption(): ?string { return $this->caption; @@ -127,74 +37,40 @@ public function getCaption(): ?string */ public function getHeader(): array { - return $this->tabularDataReader->getHeader(); - } - - public function getRecords(array $header = []): Iterator - { - return $this->tabularDataReader->getRecords($header); + return $this->tabularData->getHeader(); } /** - * @param array $header + * @return TabularDataReader $tabularData */ - public function getObjects(string $className, array $header = []): Iterator - { - return $this->tabularDataReader->getObjects($className, $header); - } - - /** - * @return array - */ - public function fetchOne(int $nth_record = 0): array - { - return $this->tabularDataReader->fetchOne($nth_record); - } - - public function fetchPairs($offset_index = 0, $value_index = 1): Iterator + public function getTabularData(): TabularDataReader { - return $this->tabularDataReader->fetchPairs($offset_index, $value_index); + return $this->tabularData; } - public function fetchColumn($index = 0): Iterator - { - return 
$this->tabularDataReader->fetchColumn($index); - } - - /** - * @return TabularDataReader> - */ - public function select(string|int ...$columnOffsetOrName): TabularDataReader - { - return $this->tabularDataReader->select(...$columnOffsetOrName); - } - - /** @return iterable>> */ - public function matching(string $expression): iterable + public function count(): int { - return $this->tabularDataReader->matching($expression); + return $this->tabularData->count(); } - /** - * - * @return ?TabularDataReader> - */ - public function matchingFirst(string $expression): ?TabularDataReader + public function getIterator(): Iterator { - return $this->tabularDataReader->matchingFirst($expression); + return $this->tabularData->getIterator(); } /** - * - * @return TabularDataReader> + * @return array{ + * caption: ?string, + * header: array, + * rows:array> + * } */ - public function matchingFirstOrFail(string $expression): TabularDataReader - { - return $this->tabularDataReader->matchingFirstOrFail($expression); - } - - public function value(int|string $column = 0): mixed + public function jsonSerialize(): array { - return $this->tabularDataReader->value($column); + return [ + 'caption' => $this->caption, + 'header' => $this->getHeader(), + 'rows' => array_values([...$this->tabularData]), + ]; } } diff --git a/src/Warning.php b/src/Warning.php new file mode 100644 index 0000000..5e1e893 --- /dev/null +++ b/src/Warning.php @@ -0,0 +1,49 @@ + + in_array($errno, [E_WARNING, E_USER_WARNING], true) + ? throw new ErrorException($errstr, 0, $errno, $errfile, $errline) + : false + ); + + try { + return $callback(...$arguments); + } finally { + restore_error_handler(); + } + } +}