add docx metadata extractor tutorial

x4nth055 · x4nth055 · commit 369c05e2798f · 2024-04-11T10:48:24.000+01:00
diff --git a/README.md b/README.md
@@ -63,6 +63,7 @@ This is a repository of all the tutorials of [The Python Code](https://www.thepy
     - [How to Build a Username Search Tool in Python](https://thepythoncode.com/code/social-media-username-finder-in-python). ([code](ethical-hacking/username-finder))
     - [How to Find Past Wi-Fi Connections on Windows in Python](https://thepythoncode.com/article/find-past-wifi-connections-on-windows-in-python). ([code](ethical-hacking/find-past-wifi-connections-on-windows))
     - [How to Remove Metadata from PDFs in Python](https://thepythoncode.com/article/how-to-remove-metadata-from-pdfs-in-python). ([code](ethical-hacking/pdf-metadata-remover))
+    - [How to Extract Metadata from Docx Files in Python](https://thepythoncode.com/article/docx-metadata-extractor-in-python). ([code](ethical-hacking/docx-metadata-extractor))
 
 - ### [Machine Learning](https://www.thepythoncode.com/topic/machine-learning)
     - ### [Natural Language Processing](https://www.thepythoncode.com/topic/nlp)
diff --git a/ethical-hacking/docx-metadata-extractor/README.md b/ethical-hacking/docx-metadata-extractor/README.md
@@ -0,0 +1 @@
+# [How to Extract Metadata from Docx Files in Python](https://thepythoncode.com/article/docx-metadata-extractor-in-python)
diff --git a/ethical-hacking/docx-metadata-extractor/docs_metadata_extractor.py b/ethical-hacking/docx-metadata-extractor/docs_metadata_extractor.py
@@ -0,0 +1,41 @@
+import docx  # Import the docx library for working with Word documents.
+from pprint import pprint  # Import the pprint function for pretty printing.
+
+def extract_metadata(docx_file):
+    """Return the metadata stored in a Word document's core properties.
+
+    Args:
+        docx_file: Path (or file-like object) handed straight to ``docx.Document``.
+
+    Returns:
+        dict: One entry per public, non-callable attribute of the document's
+        ``core_properties`` object. The ``created``, ``modified`` and
+        ``last_printed`` values are formatted as ``'%Y-%m-%d %H:%M:%S'``
+        strings, or ``None`` when unset. A ``'custom_properties'`` sub-dict
+        (name -> value) is added only when the core-properties object exposes
+        a non-empty ``custom_properties`` attribute.
+    """
+    doc = docx.Document(docx_file)  # Create a Document object from the Word document file.
+    core_properties = doc.core_properties  # Get the core properties of the document.
+
+    datetime_props = {'created', 'modified', 'last_printed'}  # Properties holding datetime values.
+    metadata = {}  # Collected metadata, keyed by property name.
+
+    # Extract core properties.
+    for prop in dir(core_properties):
+        # Skip every private name, not only dunders: single-underscore attributes
+        # (e.g. the internal `_element` XML node) are implementation details of
+        # the library, not document metadata.
+        if prop.startswith('_'):
+            continue
+        value = getattr(core_properties, prop)  # Get the value of the property.
+        if callable(value):  # Skip methods.
+            continue
+        if prop in datetime_props:
+            # Convert datetimes to a printable string; unset (falsy) values become None.
+            value = value.strftime('%Y-%m-%d %H:%M:%S') if value else None
+        metadata[prop] = value
+
+    # Extract custom properties on a best-effort basis: if the attribute (or the
+    # expected `name` / `value` fields) is missing, the section is simply omitted.
+    try:
+        custom_properties = core_properties.custom_properties
+        if custom_properties:
+            # Build the whole dict first so a failure part-way through does not
+            # leave a partially filled 'custom_properties' entry behind.
+            metadata['custom_properties'] = {
+                custom.name: custom.value for custom in custom_properties
+            }
+    except AttributeError:
+        # Custom properties are not available on this core-properties object.
+        pass
+
+    return metadata  # Return the metadata dictionary.
+
+
+
+# Demo run: extract and print the metadata of the sample document added alongside this script.
+# NOTE: these statements sit at module level (no `__main__` guard), so they also run on import.
+docx_path = 'test.docx'  # Relative path to the Word document file.
+metadata = extract_metadata(docx_path)  # Build the metadata dictionary for that document.
+pprint(metadata)  # Pretty-print the metadata dictionary.
diff --git a/ethical-hacking/docx-metadata-extractor/requirements.txt b/ethical-hacking/docx-metadata-extractor/requirements.txt
@@ -0,0 +1 @@
+python-docx
diff --git a/ethical-hacking/docx-metadata-extractor/test.docx b/ethical-hacking/docx-metadata-extractor/test.docx

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+# [How to Extract Metadata from Docx Files in Python](https://thepythoncode.com/article/docx-metadata-extractor-in-python)`