/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 */

/**
 * \file       class/datanormparser.class.php
 * \ingroup    importzugferd
 * \brief      Parser for Datanorm 4.0 and 5.0 catalog files
 */

/**
 * Class DatanormParser
 *
 * Parses Datanorm catalog files (version 4.0 text files and version 5.0 XML files).
 *
 * Two operating modes exist:
 *  - default mode: every parsed article is stored in $articles, keyed by article number;
 *  - streaming mode (see enableStreaming()): articles are collected in batches and handed
 *    to a callback, so large catalogs do not have to be held in memory at once.
 */
class DatanormParser
{
    /**
     * @var string Detected Datanorm version ('4.0' or '5.0', empty before parsing)
     */
    public $version = '';

    /**
     * @var array Parsed articles keyed by article number (filled in non-streaming mode only)
     */
    public $articles = array();

    /**
     * @var array Parsed price information keyed by article number
     *            (each entry: 'price' => float, 'price_type' => string)
     */
    public $prices = array();

    /**
     * @var array Product groups/categories (code => name)
     */
    public $groups = array();

    /**
     * @var string Last error message
     */
    public $error = '';

    /**
     * @var array All error messages recorded so far
     */
    public $errors = array();

    /**
     * @var callable|null Callback for batch processing articles
     */
    protected $batchCallback = null;

    /**
     * @var int Number of articles collected before the batch callback is invoked
     */
    protected $batchSize = 1000;

    /**
     * @var array Current batch of articles, keyed by article number
     */
    protected $batchArticles = array();

    /**
     * @var bool Whether to use streaming mode (for large files)
     */
    protected $streamingMode = false;

    /**
     * Enable streaming mode for large files.
     * In streaming mode, articles are handed to the callback in batches instead of
     * being stored in $articles.
     *
     * @param callable $callback  Function called with one argument: array of articles keyed by article number
     * @param int      $batchSize Number of articles per batch (values below 1 are treated as 1)
     * @return void
     */
    public function enableStreaming($callback, $batchSize = 1000)
    {
        $this->streamingMode = true;
        $this->batchCallback = $callback;
        // A zero/negative size would make the "batch full" test true permanently.
        $this->batchSize = max(1, (int) $batchSize);
        $this->batchArticles = array();
    }

    /**
     * Disable streaming mode. Any articles still waiting in the current batch are discarded.
     *
     * @return void
     */
    public function disableStreaming()
    {
        $this->streamingMode = false;
        $this->batchCallback = null;
        $this->batchArticles = array();
    }

    /**
     * Tell whether articles are currently routed to the batch callback.
     * Streaming needs both the flag and a callback; without a callback the parser
     * falls back to storing articles in $articles.
     *
     * @return bool
     */
    protected function isStreaming()
    {
        return $this->streamingMode && $this->batchCallback;
    }

    /**
     * Remember an error message in both $error (last message) and $errors (all messages).
     *
     * @param string $message Error message
     * @return void
     */
    protected function recordError($message)
    {
        $this->error = $message;
        $this->errors[] = $message;
    }

    /**
     * Add article to the current batch (streaming mode) or to the articles array.
     *
     * In streaming mode a full batch is flushed BEFORE the new article is added. The most
     * recently added article therefore always stays in the batch, so that the B (text) and
     * P (price) records that follow its A record can still be attached to it.
     *
     * @param array $article Article data (must contain 'article_number')
     * @return void
     */
    protected function addArticle($article)
    {
        $number = $article['article_number'];

        if (!$this->isStreaming()) {
            $this->articles[$number] = $article;
            return;
        }

        // Overwriting an article already in the batch does not grow it, so no flush is needed then.
        if (!isset($this->batchArticles[$number]) && count($this->batchArticles) >= $this->batchSize) {
            $this->flushBatch();
        }
        $this->batchArticles[$number] = $article;
    }

    /**
     * Hand the current batch to the callback and start a new, empty batch.
     * Prices already known for the batched articles are merged in first and removed
     * from $prices to free memory.
     *
     * @return void
     */
    protected function flushBatch()
    {
        if (empty($this->batchArticles) || !$this->batchCallback) {
            return;
        }

        foreach ($this->batchArticles as $artNum => &$article) {
            if (isset($this->prices[$artNum])) {
                $article['price'] = $this->prices[$artNum]['price'];
                unset($this->prices[$artNum]);
            }
        }
        unset($article); // break the dangling reference left by the foreach

        call_user_func($this->batchCallback, $this->batchArticles);
        $this->batchArticles = array();
    }

    /**
     * Parse a Datanorm file or directory.
     *
     * @param string $path Path to file or directory
     * @return int Number of articles parsed, -1 on error (single file only)
     */
    public function parse($path)
    {
        return is_dir($path) ? $this->parseDirectory($path) : $this->parseFile($path);
    }

    /**
     * Return the paths matching a glob pattern, always as an array.
     * glob() returns false on error, which must not reach a foreach.
     *
     * @param string $pattern Glob pattern
     * @return array List of matching paths (possibly empty)
     */
    protected function globFiles($pattern)
    {
        $matches = glob($pattern);

        return $matches ? $matches : array();
    }

    /**
     * Parse all Datanorm files in a directory.
     *
     * Handles DATPREIS.nnn (prices, non-streaming mode only), DATANORM.nnn (articles),
     * DATANORM.WRG (product groups), DATANORM.RAB (discount groups) and *.xml files that
     * look like Datanorm 5.0. Files that fail to parse do not abort the run; their error
     * is recorded in $error / $errors.
     *
     * @param string $dir Directory path
     * @return int Total number of articles parsed from all files
     */
    public function parseDirectory($dir)
    {
        $totalArticles = 0;

        // For non-streaming mode, load prices first.
        // For streaming mode with very large files, prices must be handled separately.
        if (!$this->isStreaming()) {
            $priceFiles = $this->globFiles($dir . '/DATPREIS.*');
            if (!empty($priceFiles)) {
                $this->version = '4.0';
                foreach ($priceFiles as $file) {
                    $ext = strtoupper(pathinfo($file, PATHINFO_EXTENSION));
                    if (preg_match('/^\d{3}$/', $ext)) {
                        $this->parseDatapreis4File($file);
                    }
                }
            }
        }

        // Datanorm 4.0 files (DATANORM.xxx)
        $files = $this->globFiles($dir . '/DATANORM.*');
        if (!empty($files)) {
            $this->version = '4.0';
            foreach ($files as $file) {
                $ext = strtoupper(pathinfo($file, PATHINFO_EXTENSION));
                if (preg_match('/^\d{3}$/', $ext)) {
                    // Main article file (DATANORM.001, etc.)
                    $count = $this->parseDatanorm4File($file);
                    if ($count > 0) {
                        $totalArticles += $count;
                    }
                } elseif ($ext === 'WRG') {
                    $this->parseDatanorm4Groups($file);
                } elseif ($ext === 'RAB') {
                    $this->parseDatanorm4Discounts($file);
                }
            }
        }

        // Merge prices into articles (non-streaming mode only).
        // In streaming mode, prices are merged in flushBatch().
        if (!$this->isStreaming() && !empty($this->prices)) {
            $this->mergePricesIntoArticles();
        }

        // Datanorm 5.0 files (*.xml)
        foreach ($this->globFiles($dir . '/*.xml') as $file) {
            if ($this->isDatanorm5File($file)) {
                $this->version = '5.0';
                $count = $this->parseDatanorm5File($file);
                if ($count > 0) {
                    $totalArticles += $count;
                }
            }
        }

        return $totalArticles;
    }

    /**
     * Read the first bytes of a file.
     *
     * @param string $file   File path
     * @param int    $length Maximum number of bytes to read
     * @return string The bytes read, or an empty string when the file cannot be read
     */
    protected function readHead($file, $length)
    {
        $content = file_get_contents($file, false, null, 0, $length);

        return $content === false ? '' : $content;
    }

    /**
     * Parse a single file (auto-detect format).
     *
     * @param string $file File path
     * @return int Number of articles parsed, -1 on error
     */
    public function parseFile($file)
    {
        if (!file_exists($file)) {
            $this->recordError('File not found: ' . $file);
            return -1;
        }

        // NOTE(review): the original detection condition was damaged in the reviewed source;
        // only "strpos($content, '..." on the first 1000 bytes survived. Treating any file
        // with an XML declaration as Datanorm 5.0 is a reconstruction - confirm against history.
        $head = $this->readHead($file, 1000);
        if (strpos($head, '<?xml') !== false) {
            $this->version = '5.0';
            return $this->parseDatanorm5File($file);
        }

        // Assume Datanorm 4.0
        $this->version = '4.0';
        return $this->parseDatanorm4File($file);
    }

    /**
     * Return the text as UTF-8. Input that is not valid UTF-8 is converted from ISO-8859-1.
     *
     * @param string $text Raw text
     * @return string UTF-8 text
     */
    protected function toUtf8($text)
    {
        if (mb_check_encoding($text, 'UTF-8')) {
            return $text;
        }

        return mb_convert_encoding($text, 'UTF-8', 'ISO-8859-1');
    }

    /**
     * Extract and trim a fixed-width field from a UTF-8 line.
     *
     * Offsets are counted in characters, not bytes: lines are converted to UTF-8 before
     * slicing, so a byte-based substr() would shift every field that follows a
     * non-ASCII character such as an umlaut.
     *
     * @param string   $line   UTF-8 record line
     * @param int      $start  Zero-based character offset
     * @param int|null $length Field width in characters, null for "to end of line"
     * @return string Trimmed field value
     */
    protected function fixedField($line, $start, $length = null)
    {
        return trim((string) mb_substr($line, $start, $length, 'UTF-8'));
    }

    /**
     * Parse a Datanorm 4.0 article file line by line.
     *
     * Record types handled: A (article master), B (text for the preceding A record),
     * P (price). Other record types are ignored.
     *
     * @param string $file File path
     * @return int Number of articles parsed, -1 if the file cannot be opened
     */
    protected function parseDatanorm4File($file)
    {
        $handle = fopen($file, 'r');
        if ($handle === false) {
            $this->recordError('Cannot read file: ' . $file);
            return -1;
        }

        $count = 0;
        $currentArticle = null;

        while (($line = fgets($handle)) !== false) {
            $line = $this->toUtf8(rtrim($line, "\r\n"));
            if (strlen($line) < 2) {
                continue;
            }

            switch (substr($line, 0, 1)) {
                case 'A': // Article master record
                    $article = $this->parseDatanorm4TypeA($line);
                    if ($article) {
                        $this->addArticle($article);
                        $currentArticle = $article['article_number'];
                        $count++;
                    }
                    break;

                case 'B': // Article info/long text, belongs to the last A record
                    if ($currentArticle !== null) {
                        $this->parseDatanorm4TypeB($line, $currentArticle);
                    }
                    break;

                case 'P': // Price record
                    $this->parseDatanorm4TypeP($line);
                    break;
            }
        }

        fclose($handle);

        if ($this->isStreaming()) {
            // Deliver whatever is left in the last, partially filled batch.
            $this->flushBatch();
        } else {
            $this->mergePricesIntoArticles();
        }

        return $count;
    }

    /**
     * Parse Datanorm 4.0 Type A record (article master).
     * Lines containing a semicolon are handed to parseDatanorm4TypeASemicolon();
     * all other lines are read as fixed-width records.
     *
     * @param string $line Record line (UTF-8)
     * @return array|null Article data, or null if the line is too short or has no article number
     */
    protected function parseDatanorm4TypeA($line)
    {
        // Minimum length check
        if (strlen($line) < 50) {
            return null;
        }

        if (strpos($line, ';') !== false) {
            return $this->parseDatanorm4TypeASemicolon($line);
        }

        // Fixed-width format (classic); offsets are zero-based character positions.
        $article = array(
            'article_number'    => $this->fixedField($line, 1, 15),   // Pos 2-16: article number
            'matchcode'         => $this->fixedField($line, 16, 12),  // Pos 17-28: matchcode
            'short_text1'       => $this->fixedField($line, 28, 40),  // Pos 29-68: short text 1
            'short_text2'       => $this->fixedField($line, 68, 40),  // Pos 69-108: short text 2
            'unit_code'         => $this->fixedField($line, 108, 3),  // Pos 109-111: unit of measure
            'price_unit'        => (int) $this->fixedField($line, 111, 5), // Pos 112-116: price unit
            'discount_group'    => $this->fixedField($line, 116, 4),  // Pos 117-120: discount group
            'product_group'     => $this->fixedField($line, 120, 7),  // Pos 121-127: product group
            'manufacturer_ref'  => $this->fixedField($line, 127, 15), // Pos 128-142: manufacturer article number
            'manufacturer_name' => $this->fixedField($line, 142, 20), // Pos 143-162: manufacturer name
            'ean'               => '',
            'long_text'         => '',
            'price'             => 0,
        );

        // EAN if available (extended format)
        if (mb_strlen($line, 'UTF-8') >= 175) {
            $article['ean'] = $this->fixedField($line, 162, 13);
        }

        if (empty($article['article_number'])) {
            return null;
        }

        // Default price unit to 1 if not set
        if ($article['price_unit'] <= 0) {
            $article['price_unit'] = 1;
        }

        return $article;
    }

    /**
     * Parse Datanorm 4.0 Type A record (semicolon-separated format).
     *
     * Two layouts are distinguished by the length of the second field:
     *  - Sonepar:  A;N;ArtNr;WG;Kurztext1;Kurztext2;PE;ME;METext;RabGrp;PreisGrp;WG2;...
     *              (second field is at most 2 characters)
     *  - Standard: A;ArtNr;Matchcode;Kurztext1;Kurztext2;ME;PE;RabGrp;WG;...
     *
     * @param string $line Record line
     * @return array|null Article data, or null if fewer than 6 fields or no article number
     */
    protected function parseDatanorm4TypeASemicolon($line)
    {
        $parts = explode(';', $line);
        if (count($parts) < 6) {
            return null;
        }

        $firstField = trim($parts[0] ?? '');
        $isSonepar = $firstField === 'A' && isset($parts[1]) && strlen(trim($parts[1])) <= 2;

        if ($isSonepar) {
            $article = array(
                'article_number'    => trim($parts[2] ?? ''),
                'matchcode'         => '', // filled later from the B record
                'short_text1'       => trim($parts[4] ?? ''),
                'short_text2'       => trim($parts[5] ?? ''),
                'unit_code'         => trim($parts[8] ?? trim($parts[7] ?? '')), // METext, else ME
                'price_unit'        => (int) trim($parts[6] ?? '1'),             // PE
                'discount_group'    => trim($parts[9] ?? ''),
                'product_group'     => trim($parts[3] ?? ''),                    // WG
                'manufacturer_ref'  => '',
                'manufacturer_name' => '',
                'ean'               => '',
                'long_text'         => '',
                'price'             => 0,
            );
        } else {
            $article = array(
                'article_number'    => trim($parts[1] ?? ''),
                'matchcode'         => trim($parts[2] ?? ''),
                'short_text1'       => trim($parts[3] ?? ''),
                'short_text2'       => trim($parts[4] ?? ''),
                'unit_code'         => trim($parts[5] ?? ''),
                'price_unit'        => (int) trim($parts[6] ?? '1'),
                'discount_group'    => trim($parts[7] ?? ''),
                'product_group'     => trim($parts[8] ?? ''),
                'manufacturer_ref'  => trim($parts[14] ?? ''),
                'manufacturer_name' => trim($parts[15] ?? ''),
                'ean'               => trim($parts[16] ?? ''),
                'long_text'         => '',
                'price'             => 0,
            );
        }

        if (empty($article['article_number'])) {
            return null;
        }

        if ($article['price_unit'] <= 0) {
            $article['price_unit'] = 1;
        }

        return $article;
    }

    /**
     * Get a reference to a stored article so it can be modified in place.
     * Looks in the current batch in streaming mode, otherwise in $articles.
     *
     * @param string $articleNumber Article number
     * @return array|null Reference to the article, or null if it is not (or no longer) stored
     */
    protected function &getArticleRef($articleNumber)
    {
        $null = null;

        if ($this->isStreaming()) {
            if (isset($this->batchArticles[$articleNumber])) {
                return $this->batchArticles[$articleNumber];
            }
        } elseif (isset($this->articles[$articleNumber])) {
            return $this->articles[$articleNumber];
        }

        return $null;
    }

    /**
     * Append a line of text to an article's long text, separating lines with "\n".
     *
     * @param array  $article Article data (modified in place)
     * @param string $text    Text to append; empty text is ignored
     * @return void
     */
    protected function appendLongText(array &$article, $text)
    {
        if (empty($text)) {
            return;
        }
        if (!empty($article['long_text'])) {
            $article['long_text'] .= "\n";
        }
        $article['long_text'] .= $text;
    }

    /**
     * Parse Datanorm 4.0 Type B record (article info/long text).
     *
     * - Sonepar semicolon layout (B;N;ArtNr;Matchcode;...): sets the matchcode if the
     *   record's article number matches and no matchcode is set yet.
     * - Standard semicolon layout: field 2 is appended to the long text.
     * - Fixed-width layout: everything from character 17 on is appended to the long text.
     *
     * @param string $line          Record line (UTF-8)
     * @param string $articleNumber Article number of the preceding A record
     * @return void
     */
    protected function parseDatanorm4TypeB($line, $articleNumber)
    {
        $article = &$this->getArticleRef($articleNumber);
        if ($article === null) {
            return;
        }

        if (strpos($line, ';') === false) {
            $this->appendLongText($article, $this->fixedField($line, 16));
            return;
        }

        $parts = explode(';', $line);
        $isSonepar = isset($parts[1]) && strlen(trim($parts[1])) <= 2;

        if (!$isSonepar) {
            $this->appendLongText($article, trim($parts[2] ?? ''));
            return;
        }

        // Only trust the matchcode if the B record names the same article.
        if (trim($parts[2] ?? '') === $articleNumber) {
            $matchcode = trim($parts[3] ?? '');
            if (!empty($matchcode) && empty($article['matchcode'])) {
                $article['matchcode'] = $matchcode;
            }
        }
    }

    /**
     * Store a price for an article. Entries without article number or with a
     * non-positive price are ignored; a later price for the same article overwrites
     * the earlier one.
     *
     * @param string $articleNumber Article number
     * @param string $priceType     Price type indicator as found in the record
     * @param float  $price         Price value
     * @return void
     */
    protected function storePrice($articleNumber, $priceType, $price)
    {
        if (empty($articleNumber) || $price <= 0) {
            return;
        }

        $this->prices[$articleNumber] = array(
            'price'      => $price,
            'price_type' => $priceType,
        );
    }

    /**
     * Parse Datanorm 4.0 Type P record (price) found inside an article file.
     * Semicolon layout: P;ArtNr;PriceType;Price. Fixed-width layout: article number at
     * characters 2-16, price type at 17, price at 18-29.
     *
     * @param string $line Record line (UTF-8)
     * @return void
     */
    protected function parseDatanorm4TypeP($line)
    {
        if (strpos($line, ';') !== false) {
            $parts = explode(';', $line);
            $articleNumber = trim($parts[1] ?? '');
            $priceType = trim($parts[2] ?? '');
            $price = $this->parsePrice(trim($parts[3] ?? '0'));
        } else {
            $articleNumber = $this->fixedField($line, 1, 15);
            $priceType = $this->fixedField($line, 16, 1);
            $price = $this->parsePrice($this->fixedField($line, 17, 12));
        }

        $this->storePrice($articleNumber, $priceType, $price);
    }

    /**
     * Parse Datanorm 4.0 product groups file (DATANORM.WRG) into $groups.
     * Lines shorter than 10 bytes are skipped. Semicolon layout: Code;Name.
     * Fixed-width layout: code in characters 1-7, name in the rest of the line.
     *
     * @param string $file File path
     * @return void
     */
    protected function parseDatanorm4Groups($file)
    {
        $content = file_get_contents($file);
        if ($content === false) {
            $this->recordError('Cannot read file: ' . $file);
            return;
        }

        $content = $this->toUtf8($content);

        foreach (explode("\n", $content) as $line) {
            $line = rtrim($line, "\r\n");
            if (strlen($line) < 10) {
                continue;
            }

            if (strpos($line, ';') !== false) {
                $parts = explode(';', $line);
                $code = trim($parts[0] ?? '');
                $name = trim($parts[1] ?? '');
            } else {
                $code = $this->fixedField($line, 0, 7);
                $name = $this->fixedField($line, 7);
            }

            if (!empty($code)) {
                $this->groups[$code] = $name;
            }
        }
    }

    /**
     * Parse Datanorm 4.0 discount groups file (DATANORM.RAB).
     * Currently a placeholder: the file is not read.
     *
     * @param string $file File path
     * @return void
     */
    protected function parseDatanorm4Discounts($file)
    {
        // Discount parsing - can be extended if needed
    }

    /**
     * Convert a DATPREIS amount to a float.
     * A value without ',' or '.' is read as an amount in cents and divided by 100;
     * a value with a decimal separator is parsed by parsePrice().
     *
     * @param string $raw Raw amount field
     * @return float Amount
     */
    protected function parseCentAmount($raw)
    {
        if (strpos($raw, ',') === false && strpos($raw, '.') === false) {
            return (float) $raw / 100;
        }

        return $this->parsePrice($raw);
    }

    /**
     * Parse a DATPREIS.xxx price file line by line into $prices.
     *
     * Supported layouts (lines shorter than 10 bytes are skipped):
     *  - "P;A;..." with several articles per line, 9 fields per article
     *    (ArtNr;PriceType;Price;PE;0;1;0;1;0);
     *  - "P;ArtNr;PriceType;Price" (record type "P" or "0");
     *  - fixed-width records of type "P" or "0".
     *
     * @param string $file File path
     * @return void
     */
    protected function parseDatapreis4File($file)
    {
        $handle = fopen($file, 'r');
        if ($handle === false) {
            $this->recordError('Cannot read file: ' . $file);
            return;
        }

        while (($line = fgets($handle)) !== false) {
            $line = $this->toUtf8(rtrim($line, "\r\n"));
            if (strlen($line) < 10) {
                continue;
            }

            if (strpos($line, ';') === false) {
                // Fixed-width format
                $recordType = substr($line, 0, 1);
                if ($recordType === 'P' || $recordType === '0') {
                    $this->storePrice(
                        $this->fixedField($line, 1, 15),
                        $this->fixedField($line, 16, 1),
                        $this->parseCentAmount($this->fixedField($line, 17, 12))
                    );
                }
                continue;
            }

            $parts = explode(';', $line);
            $recordType = trim($parts[0] ?? '');

            if ($recordType === 'P' && isset($parts[1]) && $parts[1] === 'A') {
                // Several price entries per line, starting after "P;A".
                $fieldCount = count($parts);
                for ($i = 2; $i + 2 < $fieldCount; $i += 9) {
                    $this->storePrice(
                        trim($parts[$i]),
                        trim($parts[$i + 1]),
                        $this->parseCentAmount(trim($parts[$i + 2]))
                    );
                }
            } elseif ($recordType === 'P' || $recordType === '0') {
                // Simple format: P;ArtNr;PriceType;Price
                $this->storePrice(
                    trim($parts[1] ?? ''),
                    trim($parts[2] ?? ''),
                    $this->parseCentAmount(trim($parts[3] ?? '0'))
                );
            }
        }

        fclose($handle);
    }

    /**
     * Copy every known price into the matching entry of $articles.
     * Prices for articles that are not in $articles are left untouched.
     *
     * @return void
     */
    protected function mergePricesIntoArticles()
    {
        foreach ($this->prices as $articleNumber => $priceData) {
            if (isset($this->articles[$articleNumber])) {
                $this->articles[$articleNumber]['price'] = $priceData['price'];
            }
        }
    }

    /**
     * Check if a file looks like a Datanorm 5.0 XML file, based on its first 2000 bytes.
     *
     * NOTE(review): the original condition was damaged in the reviewed source (only
     * "strpos($content, '..." survived). The heuristic below - XML declaration plus either
     * the word "datanorm" or an article element - is a reconstruction; confirm against history.
     *
     * @param string $file File path
     * @return bool
     */
    protected function isDatanorm5File($file)
    {
        $head = $this->readHead($file, 2000);
        if (strpos($head, '<?xml') === false) {
            return false;
        }

        return stripos($head, 'datanorm') !== false
            || stripos($head, '<artikel') !== false
            || stripos($head, '<article') !== false;
    }

    /**
     * Parse a Datanorm 5.0 (XML) file.
     *
     * Article elements are looked up by local name ("Artikel", then "Article", then
     * "article"; the first name that yields nodes wins), so documents that declare a
     * default namespace are matched as well. Articles go through addArticle(), i.e. to
     * the batch callback in streaming mode and to $articles otherwise.
     *
     * NOTE(review): the start of this method was damaged in the reviewed source; the
     * libxml setup and simplexml_load_file() call are reconstructed from the surviving
     * error-handling code - confirm against history.
     *
     * @param string $file File path
     * @return int Number of articles parsed, -1 if the XML cannot be parsed
     */
    protected function parseDatanorm5File($file)
    {
        // Collect libxml errors instead of emitting warnings; restore the caller's setting afterwards.
        $previousSetting = libxml_use_internal_errors(true);
        $xml = simplexml_load_file($file);

        if ($xml === false) {
            $errors = libxml_get_errors();
            $this->recordError('XML parse error: ' . ($errors[0]->message ?? 'Unknown error'));
            libxml_clear_errors();
            libxml_use_internal_errors($previousSetting);
            return -1;
        }
        libxml_use_internal_errors($previousSetting);

        $articleNodes = array();
        foreach (array('Artikel', 'Article', 'article') as $nodeName) {
            $found = $xml->xpath("//*[local-name()='" . $nodeName . "']");
            if (!empty($found)) {
                $articleNodes = $found;
                break;
            }
        }

        $count = 0;
        foreach ($articleNodes as $node) {
            $article = $this->parseDatanorm5Article($node);
            if ($article) {
                $this->addArticle($article);
                $count++;
            }
        }

        if ($this->isStreaming()) {
            $this->flushBatch();
        }

        return $count;
    }

    /**
     * Parse a Datanorm 5.0 article node. Each field is read from the first matching
     * child element or attribute among a list of German/English candidate names.
     *
     * @param SimpleXMLElement $node Article XML node
     * @return array|null Article data, or null if no article number is present
     */
    protected function parseDatanorm5Article($node)
    {
        $priceUnit = (int) $this->getXmlValue($node, array('Preiseinheit', 'PriceUnit', 'PE'));

        $article = array(
            'article_number'    => $this->getXmlValue($node, array('Artikelnummer', 'ArticleNumber', 'ArtNr', 'artNr')),
            'matchcode'         => $this->getXmlValue($node, array('Matchcode', 'matchcode')),
            'short_text1'       => $this->getXmlValue($node, array('Kurztext1', 'Kurztext', 'ShortText1', 'ShortText', 'Bezeichnung', 'Name')),
            'short_text2'       => $this->getXmlValue($node, array('Kurztext2', 'ShortText2')),
            'long_text'         => $this->getXmlValue($node, array('Langtext', 'LongText', 'Beschreibung', 'Description')),
            'unit_code'         => $this->getXmlValue($node, array('Mengeneinheit', 'Unit', 'ME')),
            // Same rule as the 4.0 parsers: a missing or non-positive price unit becomes 1.
            'price_unit'        => $priceUnit > 0 ? $priceUnit : 1,
            'price'             => $this->parsePrice($this->getXmlValue($node, array('Preis', 'Price', 'Listenpreis', 'ListPrice'))),
            'discount_group'    => $this->getXmlValue($node, array('Rabattgruppe', 'DiscountGroup', 'RG')),
            'product_group'     => $this->getXmlValue($node, array('Warengruppe', 'ProductGroup', 'WG')),
            'manufacturer_ref'  => $this->getXmlValue($node, array('HerstellerArtNr', 'ManufacturerArticleNumber')),
            'manufacturer_name' => $this->getXmlValue($node, array('Hersteller', 'Manufacturer')),
            'ean'               => $this->getXmlValue($node, array('EAN', 'GTIN', 'Barcode')),
        );

        if (empty($article['article_number'])) {
            return null;
        }

        return $article;
    }

    /**
     * Get a value from an XML node, trying several possible names.
     * For each name, a child element is tried first, then an attribute; the first hit wins.
     *
     * @param SimpleXMLElement $node  XML node
     * @param array            $names Possible element/attribute names, in order of preference
     * @return string Trimmed value, or empty string if none of the names exists
     */
    protected function getXmlValue($node, $names)
    {
        foreach ($names as $name) {
            if (isset($node->$name)) {
                return trim((string) $node->$name);
            }
            if (isset($node[$name])) {
                return trim((string) $node[$name]);
            }
        }

        return '';
    }

    /**
     * Parse a price string to float.
     *
     * Everything except digits, ',', '.' and '-' is removed first. If both ',' and '.'
     * occur, the one that appears last is the decimal separator and the other is a
     * thousands separator ("1.234,56" and "1,234.56" both give 1234.56). A lone ','
     * is a decimal separator.
     *
     * @param string $priceStr Price string
     * @return float Price value (0.0 for empty input)
     */
    protected function parsePrice($priceStr)
    {
        if (empty($priceStr)) {
            return 0.0;
        }

        // Remove currency symbols and whitespace
        $cleaned = (string) preg_replace('/[^\d,.\-]/', '', (string) $priceStr);

        $lastComma = strrpos($cleaned, ',');
        $lastDot = strrpos($cleaned, '.');

        if ($lastComma !== false && $lastDot !== false) {
            if ($lastComma > $lastDot) {
                // German style: 1.234,56
                $cleaned = str_replace(',', '.', str_replace('.', '', $cleaned));
            } else {
                // English style: 1,234.56
                $cleaned = str_replace(',', '', $cleaned);
            }
        } elseif ($lastComma !== false) {
            // Simple comma as decimal separator
            $cleaned = str_replace(',', '.', $cleaned);
        }

        return (float) $cleaned;
    }

    /**
     * Convert Datanorm unit code to UN/ECE code.
     * The lookup is case-insensitive and ignores surrounding whitespace.
     *
     * @param string $datanormUnit Datanorm unit code
     * @return string UN/ECE unit code ('C62' = piece for unknown codes)
     */
    public static function convertUnitCode($datanormUnit)
    {
        $mapping = array(
            'ST'  => 'C62', // piece (Stück)
            'STK' => 'C62', // piece (Stück)
            'PCE' => 'C62', // piece
            'M'   => 'MTR', // metre
            'MTR' => 'MTR', // metre
            'CM'  => 'CMT', // centimetre
            'MM'  => 'MMT', // millimetre
            'L'   => 'LTR', // litre
            'LTR' => 'LTR', // litre
            'KG'  => 'KGM', // kilogram
            'G'   => 'GRM', // gram
            'M2'  => 'MTK', // square metre
            'M3'  => 'MTQ', // cubic metre
            'PAK' => 'PK',  // pack (Packung)
            'PAC' => 'PK',  // package
            'SET' => 'SET', // set
            'ROL' => 'RL',  // roll (Rolle)
            'RLL' => 'RL',  // roll
            'BDL' => 'BE',  // bundle (Bündel)
            'KRT' => 'CT',  // carton (Karton)
            'CTN' => 'CT',  // carton
        );

        $unit = strtoupper(trim($datanormUnit));

        return $mapping[$unit] ?? 'C62'; // Default to piece
    }

    /**
     * Get all parsed articles (empty in streaming mode, where articles go to the callback).
     *
     * @return array Articles keyed by article number
     */
    public function getArticles()
    {
        return $this->articles;
    }

    /**
     * Find article by number.
     *
     * @param string $articleNumber Article number to find
     * @return array|null Article data or null
     */
    public function findArticle($articleNumber)
    {
        return $this->articles[$articleNumber] ?? null;
    }

    /**
     * Search articles by text (case-insensitive substring match).
     * Searched fields: article number, matchcode, both short texts, EAN and
     * manufacturer article number.
     *
     * @param string $searchText Search text
     * @param int    $limit      Maximum results
     * @return array Matching articles
     */
    public function searchArticles($searchText, $limit = 50)
    {
        $results = array();
        // mb_strtolower so that non-ASCII letters (e.g. umlauts) are folded too.
        $needle = mb_strtolower($searchText, 'UTF-8');

        foreach ($this->articles as $article) {
            $haystack = mb_strtolower(
                $article['article_number'] . ' ' .
                $article['matchcode'] . ' ' .
                $article['short_text1'] . ' ' .
                $article['short_text2'] . ' ' .
                $article['ean'] . ' ' .
                $article['manufacturer_ref'],
                'UTF-8'
            );

            if (strpos($haystack, $needle) !== false) {
                $results[] = $article;
                if (count($results) >= $limit) {
                    break;
                }
            }
        }

        return $results;
    }
}